From 972452ab43aed83e9812a3c5b15355c837ac1aa3 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 13:21:20 +0000 Subject: [PATCH 01/25] roundscale_round: ps,pd --- crates/core_arch/avx512f.md | 12 +- crates/core_arch/src/x86/avx512f.rs | 226 +++++++++++ crates/core_arch/src/x86/macros.rs | 524 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 29 ++ 4 files changed, 785 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 110c31eeea..58be7ab632 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -609,8 +609,8 @@ * [x] [`_mm512_mask_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi64&expand=5236) * [ ] [`_mm512_mask_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=5236) * [ ] [`_mm512_mask_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=5236) - * [ ] [`_mm512_mask_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=5236) - * [ ] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236) + * [x] [`_mm512_mask_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=5236) + * [x] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236) * [x] [`_mm512_mask_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=5236) * [x] [`_mm512_mask_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_ps&expand=5236) * [ ] [`_mm512_mask_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=5236) @@ -881,8 +881,8 @@ * [x] [`_mm512_maskz_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi64&expand=5236) * [ ] [`_mm512_maskz_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=5236) * [ ] [`_mm512_maskz_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=5236) - * [ ] [`_mm512_maskz_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=5236) - * [ ] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236) + * [x] [`_mm512_maskz_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=5236) + * [x] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236) * [x] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236) * [x] [`_mm512_maskz_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_ps&expand=5236) * [ ] 
[`_mm512_maskz_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=5236) @@ -1019,8 +1019,8 @@ * [x] [`_mm512_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi64&expand=5236) * [ ] [`_mm512_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=5236) * [ ] [`_mm512_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=5236) - * [ ] [`_mm512_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=5236) - * [ ] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236) + * [x] [`_mm512_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=5236) + * [x] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236) * [x] [`_mm512_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=5236) * [x] [`_mm512_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_ps&expand=5236) * [ ] [`_mm512_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 25b91f7d9e..90b707d93c 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -4845,6 +4845,196 @@ pub unsafe fn _mm512_maskz_getexp_round_pd(k: __mmask8, a: __m512d, sae: i32) -> transmute(r) } +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_round_ps&expand=4790) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(1, 2)] +pub unsafe fn _mm512_roundscale_round_ps(a: __m512, imm8: i32, sae: i32) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscaleps( + a.as_f32x16(), + $imm8, + _mm512_setzero_ps().as_f32x16(), + 0b11111111_11111111, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
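+///
+/// A minimal usage sketch (assuming the `avx512f` target feature is enabled; the values mirror the tests added in this patch):
+///     let a = _mm512_set1_ps(1.1);
+///     // imm8 = 0 keeps no fraction bits; a zero writemask leaves every lane of src untouched
+///     let r = _mm512_mask_roundscale_round_ps(a, 0, a, 0, _MM_FROUND_CUR_DIRECTION); // == _mm512_set1_ps(1.1)
+///     // an all-ones writemask rounds every lane to 1.0
+///     let r = _mm512_mask_roundscale_round_ps(a, 0b11111111_11111111, a, 0, _MM_FROUND_CUR_DIRECTION);
+///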
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_round_ps&expand=4788) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm512_mask_roundscale_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + imm8: i32, + sae: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscaleps(a.as_f32x16(), $imm8, src.as_f32x16(), k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_round_ps&expand=4789) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(2, 3)] +pub unsafe fn _mm512_maskz_roundscale_round_ps( + k: __mmask16, + a: __m512, + imm8: i32, + sae: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscaleps( + a.as_f32x16(), + $imm8, + _mm512_setzero_ps().as_f32x16(), + k, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_round_pd&expand=4787) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(1, 2)] +pub unsafe fn _mm512_roundscale_round_pd(a: __m512d, imm8: i32, sae: i32) -> __m512d { + macro_rules! 
call { + ($imm8:expr, $imm4:expr) => { + vrndscalepd( + a.as_f64x8(), + $imm8, + _mm512_setzero_pd().as_f64x8(), + 0b11111111, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_round_pd&expand=4785) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm512_mask_roundscale_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + imm8: i32, + sae: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscalepd(a.as_f64x8(), $imm8, src.as_f64x8(), k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_round_pd&expand=4786) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(2, 3)] +pub unsafe fn _mm512_maskz_roundscale_round_pd( + k: __mmask8, + a: __m512d, + imm8: i32, + sae: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscalepd( + a.as_f64x8(), + $imm8, + _mm512_setzero_pd().as_f64x8(), + k, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -17206,6 +17396,11 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.getexp.pd.512"] fn vgetexppd(a: f64x8, src: f64x8, m: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.rndscale.ps.512"] + fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"] + fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"] @@ -20630,6 +20825,37 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_roundscale_round_ps(a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_mask_roundscale_round_ps(a, 0, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_ps(1.1); + assert_eq_m512(r, e); + let r = + _mm512_mask_roundscale_round_ps(a, 0b11111111_11111111, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_maskz_roundscale_round_ps(0, a, 0, _MM_FROUND_CUR_DIRECTION); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = + _mm512_maskz_roundscale_round_ps(0b11111111_11111111, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_ps() { let a = _mm512_set1_ps(10.); diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 891286df4e..0f9d938eaa 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -733,6 +733,530 @@ macro_rules! constify_imm8_sae { }; } +// Two sae parameters. +// This macro enforces that. +#[allow(unused)] +macro_rules! 
constify_imm8_roundscale { + ($imm8:expr, $imm4:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm8 & 0b11111111, $imm4) { + (0, 4) => $expand!(0, 4), + (0, 8) => $expand!(0, 8), + (1, 4) => $expand!(1, 4), + (1, 8) => $expand!(1, 8), + (2, 4) => $expand!(2, 4), + (2, 8) => $expand!(2, 8), + (3, 4) => $expand!(3, 4), + (3, 8) => $expand!(3, 8), + (4, 4) => $expand!(4, 4), + (4, 8) => $expand!(4, 8), + (5, 4) => $expand!(5, 4), + (5, 8) => $expand!(5, 8), + (6, 4) => $expand!(6, 4), + (6, 8) => $expand!(6, 8), + (7, 4) => $expand!(7, 4), + (7, 8) => $expand!(7, 8), + (8, 4) => $expand!(8, 4), + (8, 8) => $expand!(8, 8), + (9, 4) => $expand!(9, 4), + (9, 8) => $expand!(9, 8), + (10, 4) => $expand!(10, 4), + (10, 8) => $expand!(10, 8), + (11, 4) => $expand!(11, 4), + (11, 8) => $expand!(11, 8), + (12, 4) => $expand!(12, 4), + (12, 8) => $expand!(12, 8), + (13, 4) => $expand!(13, 4), + (13, 8) => $expand!(13, 8), + (14, 4) => $expand!(14, 4), + (14, 8) => $expand!(14, 8), + (15, 4) => $expand!(15, 4), + (15, 8) => $expand!(15, 8), + (16, 4) => $expand!(16, 4), + (16, 8) => $expand!(16, 8), + (17, 4) => $expand!(17, 4), + (17, 8) => $expand!(17, 8), + (18, 4) => $expand!(18, 4), + (18, 8) => $expand!(18, 8), + (19, 4) => $expand!(19, 4), + (19, 8) => $expand!(19, 8), + (20, 4) => $expand!(20, 4), + (20, 8) => $expand!(20, 8), + (21, 4) => $expand!(21, 4), + (21, 8) => $expand!(21, 8), + (22, 4) => $expand!(22, 4), + (22, 8) => $expand!(22, 8), + (23, 4) => $expand!(23, 4), + (23, 8) => $expand!(23, 8), + (24, 4) => $expand!(24, 4), + (24, 8) => $expand!(24, 8), + (25, 4) => $expand!(25, 4), + (25, 8) => $expand!(25, 8), + (26, 4) => $expand!(26, 4), + (26, 8) => $expand!(26, 8), + (27, 4) => $expand!(27, 4), + (27, 8) => $expand!(27, 8), + (28, 4) => $expand!(28, 4), + (28, 8) => $expand!(28, 8), + (29, 4) => $expand!(29, 4), + (29, 8) => $expand!(29, 8), + (30, 4) => $expand!(30, 4), + (30, 8) => $expand!(30, 8), + (31, 4) => $expand!(31, 4), + (31, 8) => $expand!(31, 8), + (32, 4) => $expand!(32, 4), + (32, 8) => $expand!(32, 8), + (33, 4) => $expand!(33, 4), + (33, 8) => $expand!(33, 8), + (34, 4) => $expand!(34, 4), + (34, 8) => $expand!(34, 8), + (35, 4) => $expand!(35, 4), + (35, 8) => $expand!(35, 8), + (36, 4) => $expand!(36, 4), + (36, 8) => $expand!(36, 8), + (37, 4) => $expand!(37, 4), + (37, 8) => $expand!(37, 8), + (38, 4) => $expand!(38, 4), + (38, 8) => $expand!(38, 8), + (39, 4) => $expand!(39, 4), + (39, 8) => $expand!(39, 8), + (40, 4) => $expand!(40, 4), + (40, 8) => $expand!(40, 8), + (41, 4) => $expand!(41, 4), + (41, 8) => $expand!(41, 8), + (42, 4) => $expand!(42, 4), + (42, 8) => $expand!(42, 8), + (43, 4) => $expand!(43, 4), + (43, 8) => $expand!(43, 8), + (44, 4) => $expand!(44, 4), + (44, 8) => $expand!(44, 8), + (45, 4) => $expand!(45, 4), + (45, 8) => $expand!(45, 8), + (46, 4) => $expand!(46, 4), + (46, 8) => $expand!(46, 8), + (47, 4) => $expand!(47, 4), + (47, 8) => $expand!(47, 8), + (48, 4) => $expand!(48, 4), + (48, 8) => $expand!(48, 8), + (49, 4) => $expand!(49, 4), + (49, 8) => $expand!(49, 8), + (50, 4) => $expand!(50, 4), + (50, 8) => $expand!(50, 8), + (51, 4) => $expand!(51, 4), + (51, 8) => $expand!(51, 8), + (52, 4) => $expand!(52, 4), + (52, 8) => $expand!(52, 8), + (53, 4) => $expand!(53, 4), + (53, 8) => $expand!(53, 8), + (54, 4) => $expand!(54, 4), + (54, 8) => $expand!(54, 8), + (55, 4) => $expand!(55, 4), + (55, 8) => $expand!(55, 8), + (56, 4) => $expand!(56, 4), + (56, 8) => $expand!(56, 8), + (57, 4) => $expand!(57, 4), 
+ (57, 8) => $expand!(57, 8), + (58, 4) => $expand!(58, 4), + (58, 8) => $expand!(58, 8), + (59, 4) => $expand!(59, 4), + (59, 8) => $expand!(59, 8), + (60, 4) => $expand!(60, 4), + (60, 8) => $expand!(60, 8), + (61, 4) => $expand!(61, 4), + (61, 8) => $expand!(61, 8), + (62, 4) => $expand!(62, 4), + (62, 8) => $expand!(62, 8), + (63, 4) => $expand!(63, 4), + (63, 8) => $expand!(63, 8), + (64, 4) => $expand!(64, 4), + (64, 8) => $expand!(64, 8), + (65, 4) => $expand!(65, 4), + (65, 8) => $expand!(65, 8), + (66, 4) => $expand!(66, 4), + (66, 8) => $expand!(66, 8), + (67, 4) => $expand!(67, 4), + (67, 8) => $expand!(67, 8), + (68, 4) => $expand!(68, 4), + (68, 8) => $expand!(68, 8), + (69, 4) => $expand!(69, 4), + (69, 8) => $expand!(69, 8), + (70, 4) => $expand!(70, 4), + (70, 8) => $expand!(70, 8), + (71, 4) => $expand!(71, 4), + (71, 8) => $expand!(71, 8), + (72, 4) => $expand!(72, 4), + (72, 8) => $expand!(72, 8), + (73, 4) => $expand!(73, 4), + (73, 8) => $expand!(73, 8), + (74, 4) => $expand!(74, 4), + (74, 8) => $expand!(74, 8), + (75, 4) => $expand!(75, 4), + (75, 8) => $expand!(75, 8), + (76, 4) => $expand!(76, 4), + (76, 8) => $expand!(76, 8), + (77, 4) => $expand!(77, 4), + (77, 8) => $expand!(77, 8), + (78, 4) => $expand!(78, 4), + (78, 8) => $expand!(78, 8), + (79, 4) => $expand!(79, 4), + (79, 8) => $expand!(79, 8), + (80, 4) => $expand!(80, 4), + (80, 8) => $expand!(80, 8), + (81, 4) => $expand!(81, 4), + (81, 8) => $expand!(81, 8), + (82, 4) => $expand!(82, 4), + (82, 8) => $expand!(82, 8), + (83, 4) => $expand!(83, 4), + (83, 8) => $expand!(83, 8), + (84, 4) => $expand!(84, 4), + (84, 8) => $expand!(84, 8), + (85, 4) => $expand!(85, 4), + (85, 8) => $expand!(85, 8), + (86, 4) => $expand!(86, 4), + (86, 8) => $expand!(86, 8), + (87, 4) => $expand!(87, 4), + (87, 8) => $expand!(87, 8), + (88, 4) => $expand!(88, 4), + (88, 8) => $expand!(88, 8), + (89, 4) => $expand!(89, 4), + (89, 8) => $expand!(89, 8), + (90, 4) => $expand!(90, 4), + (90, 8) => $expand!(90, 8), + (91, 4) => $expand!(91, 4), + (91, 8) => $expand!(91, 8), + (92, 4) => $expand!(92, 4), + (92, 8) => $expand!(92, 8), + (93, 4) => $expand!(93, 4), + (93, 8) => $expand!(93, 8), + (94, 4) => $expand!(94, 4), + (94, 8) => $expand!(94, 8), + (95, 4) => $expand!(95, 4), + (95, 8) => $expand!(95, 8), + (96, 4) => $expand!(96, 4), + (96, 8) => $expand!(96, 8), + (97, 4) => $expand!(97, 4), + (97, 8) => $expand!(97, 8), + (98, 4) => $expand!(98, 4), + (98, 8) => $expand!(98, 8), + (99, 4) => $expand!(99, 4), + (99, 8) => $expand!(99, 8), + (100, 4) => $expand!(100, 4), + (100, 8) => $expand!(100, 8), + (101, 4) => $expand!(101, 4), + (101, 8) => $expand!(101, 8), + (102, 4) => $expand!(102, 4), + (102, 8) => $expand!(102, 8), + (103, 4) => $expand!(103, 4), + (103, 8) => $expand!(103, 8), + (104, 4) => $expand!(104, 4), + (104, 8) => $expand!(104, 8), + (105, 4) => $expand!(105, 4), + (105, 8) => $expand!(105, 8), + (106, 4) => $expand!(106, 4), + (106, 8) => $expand!(106, 8), + (107, 4) => $expand!(107, 4), + (107, 8) => $expand!(107, 8), + (108, 4) => $expand!(108, 4), + (108, 8) => $expand!(108, 8), + (109, 4) => $expand!(109, 4), + (109, 8) => $expand!(109, 8), + (110, 4) => $expand!(110, 4), + (110, 8) => $expand!(110, 8), + (111, 4) => $expand!(111, 4), + (111, 8) => $expand!(111, 8), + (112, 4) => $expand!(112, 4), + (112, 8) => $expand!(112, 8), + (113, 4) => $expand!(113, 4), + (113, 8) => $expand!(113, 8), + (114, 4) => $expand!(114, 4), + (114, 8) => $expand!(114, 8), + (115, 4) => $expand!(115, 4), + (115, 8) 
=> $expand!(115, 8), + (116, 4) => $expand!(116, 4), + (116, 8) => $expand!(116, 8), + (117, 4) => $expand!(117, 4), + (117, 8) => $expand!(117, 8), + (118, 4) => $expand!(118, 4), + (118, 8) => $expand!(118, 8), + (119, 4) => $expand!(119, 4), + (119, 8) => $expand!(119, 8), + (120, 4) => $expand!(120, 4), + (120, 8) => $expand!(120, 8), + (121, 4) => $expand!(121, 4), + (121, 8) => $expand!(121, 8), + (122, 4) => $expand!(122, 4), + (122, 8) => $expand!(122, 8), + (123, 4) => $expand!(123, 4), + (123, 8) => $expand!(123, 8), + (124, 4) => $expand!(124, 4), + (124, 8) => $expand!(124, 8), + (125, 4) => $expand!(125, 4), + (125, 8) => $expand!(125, 8), + (126, 4) => $expand!(126, 4), + (126, 8) => $expand!(126, 8), + (127, 4) => $expand!(127, 4), + (127, 8) => $expand!(127, 8), + (128, 4) => $expand!(128, 4), + (128, 8) => $expand!(128, 8), + (129, 4) => $expand!(129, 4), + (129, 8) => $expand!(129, 8), + (130, 4) => $expand!(130, 4), + (130, 8) => $expand!(130, 8), + (131, 4) => $expand!(131, 4), + (131, 8) => $expand!(131, 8), + (132, 4) => $expand!(132, 4), + (132, 8) => $expand!(132, 8), + (133, 4) => $expand!(133, 4), + (133, 8) => $expand!(133, 8), + (134, 4) => $expand!(134, 4), + (134, 8) => $expand!(134, 8), + (135, 4) => $expand!(135, 4), + (135, 8) => $expand!(135, 8), + (136, 4) => $expand!(136, 4), + (136, 8) => $expand!(136, 8), + (137, 4) => $expand!(137, 4), + (137, 8) => $expand!(137, 8), + (138, 4) => $expand!(138, 4), + (138, 8) => $expand!(138, 8), + (139, 4) => $expand!(139, 4), + (139, 8) => $expand!(139, 8), + (140, 4) => $expand!(140, 4), + (140, 8) => $expand!(140, 8), + (141, 4) => $expand!(141, 4), + (141, 8) => $expand!(141, 8), + (142, 4) => $expand!(142, 4), + (142, 8) => $expand!(142, 8), + (143, 4) => $expand!(143, 4), + (143, 8) => $expand!(143, 8), + (144, 4) => $expand!(144, 4), + (144, 8) => $expand!(144, 8), + (145, 4) => $expand!(145, 4), + (145, 8) => $expand!(145, 8), + (146, 4) => $expand!(146, 4), + (146, 8) => $expand!(146, 8), + (147, 4) => $expand!(147, 4), + (147, 8) => $expand!(147, 8), + (148, 4) => $expand!(148, 4), + (148, 8) => $expand!(148, 8), + (149, 4) => $expand!(149, 4), + (149, 8) => $expand!(149, 8), + (150, 4) => $expand!(150, 4), + (150, 8) => $expand!(150, 8), + (151, 4) => $expand!(151, 4), + (151, 8) => $expand!(151, 8), + (152, 4) => $expand!(152, 4), + (152, 8) => $expand!(152, 8), + (153, 4) => $expand!(153, 4), + (153, 8) => $expand!(153, 8), + (154, 4) => $expand!(154, 4), + (154, 8) => $expand!(154, 8), + (155, 4) => $expand!(155, 4), + (155, 8) => $expand!(155, 8), + (156, 4) => $expand!(156, 4), + (156, 8) => $expand!(156, 8), + (157, 4) => $expand!(157, 4), + (157, 8) => $expand!(157, 8), + (158, 4) => $expand!(158, 4), + (158, 8) => $expand!(158, 8), + (159, 4) => $expand!(159, 4), + (159, 8) => $expand!(159, 8), + (160, 4) => $expand!(160, 4), + (160, 8) => $expand!(160, 8), + (161, 4) => $expand!(161, 4), + (161, 8) => $expand!(161, 8), + (162, 4) => $expand!(162, 4), + (162, 8) => $expand!(162, 8), + (163, 4) => $expand!(163, 4), + (163, 8) => $expand!(163, 8), + (164, 4) => $expand!(164, 4), + (164, 8) => $expand!(164, 8), + (165, 4) => $expand!(165, 4), + (165, 8) => $expand!(165, 8), + (166, 4) => $expand!(166, 4), + (166, 8) => $expand!(166, 8), + (167, 4) => $expand!(167, 4), + (167, 8) => $expand!(167, 8), + (168, 4) => $expand!(168, 4), + (168, 8) => $expand!(168, 8), + (169, 4) => $expand!(169, 4), + (169, 8) => $expand!(169, 8), + (170, 4) => $expand!(170, 4), + (170, 8) => $expand!(170, 8), + (171, 4) => 
$expand!(171, 4), + (171, 8) => $expand!(171, 8), + (172, 4) => $expand!(172, 4), + (172, 8) => $expand!(172, 8), + (173, 4) => $expand!(173, 4), + (173, 8) => $expand!(173, 8), + (174, 4) => $expand!(174, 4), + (174, 8) => $expand!(174, 8), + (175, 4) => $expand!(175, 4), + (175, 8) => $expand!(175, 8), + (176, 4) => $expand!(176, 4), + (176, 8) => $expand!(176, 8), + (177, 4) => $expand!(177, 4), + (177, 8) => $expand!(177, 8), + (178, 4) => $expand!(178, 4), + (178, 8) => $expand!(178, 8), + (179, 4) => $expand!(179, 4), + (179, 8) => $expand!(179, 8), + (180, 4) => $expand!(180, 4), + (180, 8) => $expand!(180, 8), + (181, 4) => $expand!(181, 4), + (181, 8) => $expand!(181, 8), + (182, 4) => $expand!(182, 4), + (182, 8) => $expand!(182, 8), + (183, 4) => $expand!(183, 4), + (183, 8) => $expand!(183, 8), + (184, 4) => $expand!(184, 4), + (184, 8) => $expand!(184, 8), + (185, 4) => $expand!(185, 4), + (185, 8) => $expand!(185, 8), + (186, 4) => $expand!(186, 4), + (186, 8) => $expand!(186, 8), + (187, 4) => $expand!(187, 4), + (187, 8) => $expand!(187, 8), + (188, 4) => $expand!(188, 4), + (188, 8) => $expand!(188, 8), + (189, 4) => $expand!(189, 4), + (189, 8) => $expand!(189, 8), + (190, 4) => $expand!(190, 4), + (190, 8) => $expand!(190, 8), + (191, 4) => $expand!(191, 4), + (191, 8) => $expand!(191, 8), + (192, 4) => $expand!(192, 4), + (192, 8) => $expand!(192, 8), + (193, 4) => $expand!(193, 4), + (193, 8) => $expand!(193, 8), + (194, 4) => $expand!(194, 4), + (194, 8) => $expand!(194, 8), + (195, 4) => $expand!(195, 4), + (195, 8) => $expand!(195, 8), + (196, 4) => $expand!(196, 4), + (196, 8) => $expand!(196, 8), + (197, 4) => $expand!(197, 4), + (197, 8) => $expand!(197, 8), + (198, 4) => $expand!(198, 4), + (198, 8) => $expand!(198, 8), + (199, 4) => $expand!(199, 4), + (199, 8) => $expand!(199, 8), + (200, 4) => $expand!(200, 4), + (200, 8) => $expand!(200, 8), + (201, 4) => $expand!(201, 4), + (201, 8) => $expand!(201, 8), + (202, 4) => $expand!(202, 4), + (202, 8) => $expand!(202, 8), + (203, 4) => $expand!(203, 4), + (203, 8) => $expand!(203, 8), + (204, 4) => $expand!(204, 4), + (204, 8) => $expand!(204, 8), + (205, 4) => $expand!(205, 4), + (205, 8) => $expand!(205, 8), + (206, 4) => $expand!(206, 4), + (206, 8) => $expand!(206, 8), + (207, 4) => $expand!(207, 4), + (207, 8) => $expand!(207, 8), + (208, 4) => $expand!(208, 4), + (208, 8) => $expand!(208, 8), + (209, 4) => $expand!(209, 4), + (209, 8) => $expand!(209, 8), + (210, 4) => $expand!(210, 4), + (210, 8) => $expand!(210, 8), + (211, 4) => $expand!(211, 4), + (211, 8) => $expand!(211, 8), + (212, 4) => $expand!(212, 4), + (212, 8) => $expand!(212, 8), + (213, 4) => $expand!(213, 4), + (213, 8) => $expand!(213, 8), + (214, 4) => $expand!(214, 4), + (214, 8) => $expand!(214, 8), + (215, 4) => $expand!(215, 4), + (215, 8) => $expand!(215, 8), + (216, 4) => $expand!(216, 4), + (216, 8) => $expand!(216, 8), + (217, 4) => $expand!(217, 4), + (217, 8) => $expand!(217, 8), + (218, 4) => $expand!(218, 4), + (218, 8) => $expand!(218, 8), + (219, 4) => $expand!(219, 4), + (219, 8) => $expand!(219, 8), + (220, 4) => $expand!(220, 4), + (220, 8) => $expand!(220, 8), + (221, 4) => $expand!(221, 4), + (221, 8) => $expand!(221, 8), + (222, 4) => $expand!(222, 4), + (222, 8) => $expand!(222, 8), + (223, 4) => $expand!(223, 4), + (223, 8) => $expand!(223, 8), + (224, 4) => $expand!(224, 4), + (224, 8) => $expand!(224, 8), + (225, 4) => $expand!(225, 4), + (225, 8) => $expand!(225, 8), + (226, 4) => $expand!(226, 4), + (226, 8) => 
$expand!(226, 8), + (227, 4) => $expand!(227, 4), + (227, 8) => $expand!(227, 8), + (228, 4) => $expand!(228, 4), + (228, 8) => $expand!(228, 8), + (229, 4) => $expand!(229, 4), + (229, 8) => $expand!(229, 8), + (230, 4) => $expand!(230, 4), + (230, 8) => $expand!(230, 8), + (231, 4) => $expand!(231, 4), + (231, 8) => $expand!(231, 8), + (232, 4) => $expand!(232, 4), + (232, 8) => $expand!(232, 8), + (233, 4) => $expand!(233, 4), + (233, 8) => $expand!(233, 8), + (234, 4) => $expand!(234, 4), + (234, 8) => $expand!(234, 8), + (235, 4) => $expand!(235, 4), + (235, 8) => $expand!(235, 8), + (236, 4) => $expand!(236, 4), + (236, 8) => $expand!(236, 8), + (237, 4) => $expand!(237, 4), + (237, 8) => $expand!(237, 8), + (238, 4) => $expand!(238, 4), + (238, 8) => $expand!(238, 8), + (239, 4) => $expand!(239, 4), + (239, 8) => $expand!(239, 8), + (240, 4) => $expand!(240, 4), + (240, 8) => $expand!(240, 8), + (241, 4) => $expand!(241, 4), + (241, 8) => $expand!(241, 8), + (242, 4) => $expand!(242, 4), + (242, 8) => $expand!(242, 8), + (243, 4) => $expand!(243, 4), + (243, 8) => $expand!(243, 8), + (244, 4) => $expand!(244, 4), + (244, 8) => $expand!(244, 8), + (245, 4) => $expand!(245, 4), + (245, 8) => $expand!(245, 8), + (246, 4) => $expand!(246, 4), + (246, 8) => $expand!(246, 8), + (247, 4) => $expand!(247, 4), + (247, 8) => $expand!(247, 8), + (248, 4) => $expand!(248, 4), + (248, 8) => $expand!(248, 8), + (249, 4) => $expand!(249, 4), + (249, 8) => $expand!(249, 8), + (250, 4) => $expand!(250, 4), + (250, 8) => $expand!(250, 8), + (251, 4) => $expand!(251, 4), + (251, 8) => $expand!(251, 8), + (252, 4) => $expand!(252, 4), + (252, 8) => $expand!(252, 8), + (253, 4) => $expand!(253, 4), + (253, 8) => $expand!(253, 8), + (254, 4) => $expand!(254, 4), + (254, 8) => $expand!(254, 8), + (255, 4) => $expand!(255, 4), + (255, 8) => $expand!(255, 8), + (_, _) => panic!("Invalid sae value"), + } + }; +} + #[cfg(test)] macro_rules! 
assert_approx_eq { ($a:expr, $b:expr, $eps:expr) => {{ diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 1eca091c6d..6efe80b62a 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -2686,6 +2686,35 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_round_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_roundscale_round_pd(a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_round_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_mask_roundscale_round_pd(a, 0, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(1.1); + assert_eq_m512d(r, e); + let r = _mm512_mask_roundscale_round_pd(a, 0b11111111, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_round_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_maskz_roundscale_round_pd(0, a, 0, _MM_FROUND_CUR_DIRECTION); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_roundscale_round_pd(0b11111111, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_pd() { let a = _mm512_set1_pd(10.); From 751acda5253fc0dcf3e20c1c6cf1927bb5c0c0bd Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 13:37:24 +0000 Subject: [PATCH 02/25] roundscale: ps,pd --- crates/core_arch/avx512f.md | 12 +- crates/core_arch/src/x86/avx512f.rs | 208 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 29 ++++ 3 files changed, 243 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 58be7ab632..50f8179955 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -607,8 +607,8 @@ * [x] [`_mm512_mask_ror_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi64&expand=5236) * [x] [`_mm512_mask_rorv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi32&expand=5236) * [x] [`_mm512_mask_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi64&expand=5236) - * [ ] [`_mm512_mask_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=5236) - * [ ] [`_mm512_mask_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=5236) + * [x] [`_mm512_mask_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=5236) + * [x] [`_mm512_mask_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=5236) * [x] [`_mm512_mask_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=5236) * [x] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236) * [x] [`_mm512_mask_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=5236) @@ -879,8 +879,8 @@ * [x] 
[`_mm512_maskz_ror_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi64&expand=5236) * [x] [`_mm512_maskz_rorv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi32&expand=5236) * [x] [`_mm512_maskz_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi64&expand=5236) - * [ ] [`_mm512_maskz_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=5236) - * [ ] [`_mm512_maskz_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=5236) + * [x] [`_mm512_maskz_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=5236) + * [x] [`_mm512_maskz_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=5236) * [x] [`_mm512_maskz_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=5236) * [x] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236) * [x] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236) @@ -1017,8 +1017,8 @@ * [x] [`_mm512_ror_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi64&expand=5236) * [x] [`_mm512_rorv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi32&expand=5236) * [x] [`_mm512_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi64&expand=5236) - * [ ] [`_mm512_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=5236) - * [ ] [`_mm512_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=5236) + * [x] [`_mm512_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=5236) + * [x] [`_mm512_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=5236) * [x] [`_mm512_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=5236) * [x] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236) * [x] [`_mm512_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 90b707d93c..1077f18c44 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1969,6 +1969,185 @@ pub unsafe fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d { )) } +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst. 
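+///
+/// A minimal usage sketch (assuming the `avx512f` target feature is enabled; the values mirror the tests added in this patch):
+///     let a = _mm512_set1_ps(1.1);
+///     let r = _mm512_roundscale_ps(a, 0); // imm8 = 0 keeps no fraction bits, so r == _mm512_set1_ps(1.0)
+///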
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_ps&expand=4784) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_roundscale_ps(a: __m512, imm8: i32) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vrndscaleps( + a.as_f32x16(), + $imm8, + _mm512_setzero_ps().as_f32x16(), + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_ps&expand=4782) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_roundscale_ps(src: __m512, k: __mmask16, a: __m512, imm8: i32) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vrndscaleps( + a.as_f32x16(), + $imm8, + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_ps&expand=4783) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_roundscale_ps(k: __mmask16, a: __m512, imm8: i32) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vrndscaleps( + a.as_f32x16(), + $imm8, + _mm512_setzero_ps().as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst. 
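+///
+/// A minimal usage sketch (assuming the `avx512f` target feature is enabled; the values mirror the x86_64 tests added in this patch):
+///     let a = _mm512_set1_pd(1.1);
+///     let r = _mm512_roundscale_pd(a, 0); // every lane rounds to 1.0
+///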
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_pd&expand=4775) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_roundscale_pd(a: __m512d, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vrndscalepd( + a.as_f64x8(), + $imm8, + _mm512_setzero_pd().as_f64x8(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_pd&expand=4773) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_roundscale_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + imm8: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vrndscalepd( + a.as_f64x8(), + $imm8, + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_pd&expand=4774) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vrndscalepd( + a.as_f64x8(), + $imm8, + _mm512_setzero_pd().as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -19470,6 +19649,35 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_roundscale_ps(a, 0); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_mask_roundscale_ps(a, 0, a, 0); + let e = _mm512_set1_ps(1.1); + assert_eq_m512(r, e); + let r = _mm512_mask_roundscale_ps(a, 0b11111111_11111111, a, 0); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_maskz_roundscale_ps(0, a, 0); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_roundscale_ps(0b11111111_11111111, a, 0); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_ps() { let a = _mm512_set1_ps(10.); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 6efe80b62a..b653a55bcb 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -995,6 +995,35 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_roundscale_pd(a, 0); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_mask_roundscale_pd(a, 0, a, 0); + let e = _mm512_set1_pd(1.1); + assert_eq_m512d(r, e); + let r = _mm512_mask_roundscale_pd(a, 0b11111111, a, 0); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_maskz_roundscale_pd(0, a, 0); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_roundscale_pd(0b11111111, a, 0); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_pd() { let a = _mm512_set1_pd(10.); From b9800ad39407eeb105fa5cb5f9475ae3af5ed0c7 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 14:07:20 +0000 Subject: [PATCH 03/25] scalef_round: ps,pd; scalef: ps,pd --- crates/core_arch/avx512f.md | 24 +- crates/core_arch/src/x86/avx512f.rs | 373 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 75 +++++ 3 files changed, 460 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 50f8179955..ab9e4d9545 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -613,10 +613,10 @@ * [x] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236) * [x] [`_mm512_mask_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=5236) * [x] [`_mm512_mask_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_ps&expand=5236) - * [ ] 
[`_mm512_mask_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=5236) - * [ ] [`_mm512_mask_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=5236) - * [ ] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236) - * [ ] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236) + * [x] [`_mm512_mask_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=5236) + * [x] [`_mm512_mask_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=5236) + * [x] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236) + * [x] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236) * [ ] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) * [ ] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236) * [x] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236) @@ -885,10 +885,10 @@ * [x] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236) * [x] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236) * [x] [`_mm512_maskz_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_ps&expand=5236) - * [ ] [`_mm512_maskz_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=5236) - * [ ] [`_mm512_maskz_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=5236) - * [ ] [`_mm512_maskz_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=5236) - * [ ] [`_mm512_maskz_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=5236) + * [x] [`_mm512_maskz_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=5236) + * [x] [`_mm512_maskz_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=5236) + * [x] [`_mm512_maskz_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=5236) + * [x] [`_mm512_maskz_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=5236) * [ ] [`_mm512_maskz_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=5236) * [ ] [`_mm512_maskz_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=5236) * [x] 
[`_mm512_maskz_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5236) @@ -1023,10 +1023,10 @@ * [x] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236) * [x] [`_mm512_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=5236) * [x] [`_mm512_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_ps&expand=5236) - * [ ] [`_mm512_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=5236) - * [ ] [`_mm512_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ps&expand=5236) - * [ ] [`_mm512_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_pd&expand=5236) - * [ ] [`_mm512_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ps&expand=5236) + * [x] [`_mm512_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=5236) + * [x] [`_mm512_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ps&expand=5236) + * [x] [`_mm512_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_pd&expand=5236) + * [x] [`_mm512_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ps&expand=5236) * [x] [`_mm512_set1_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi16&expand=5236) * [x] [`_mm512_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi32&expand=5236) * [x] [`_mm512_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi64&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 1077f18c44..0a91c92706 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2148,6 +2148,102 @@ pub unsafe fn _mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d, imm8: i32) -> transmute(r) } +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_ps&expand=4883) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub unsafe fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + _mm512_setzero_ps().as_f32x16(), + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
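+///
+/// A minimal usage sketch (assuming the `avx512f` target feature is enabled; the values are illustrative and rely on scalef computing a * 2^floor(b) per lane):
+///     let a = _mm512_set1_ps(1.);
+///     let b = _mm512_set1_ps(3.);
+///     let r = _mm512_mask_scalef_ps(a, 0b11111111_11111111, a, b); // every lane becomes 8.0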
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_ps&expand=4881) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub unsafe fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_ps&expand=4882) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub unsafe fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + _mm512_setzero_ps().as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_pd&expand=4874) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub unsafe fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + _mm512_setzero_pd().as_f64x8(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_pd&expand=4872) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub unsafe fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_pd&expand=4873) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + _mm512_setzero_pd().as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -5214,6 +5310,196 @@ pub unsafe fn _mm512_maskz_roundscale_round_pd( transmute(r) } +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_round_ps&expand=4889) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_scalef_round_ps(a: __m512, b: __m512, rounding: i32) -> __m512 { + macro_rules! call { + ($imm4:expr) => { + vscalefps( + a.as_f32x16(), + b.as_f32x16(), + _mm512_setzero_ps().as_f32x16(), + 0b11111111_11111111, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_round_ps&expand=4887) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_scalef_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, + rounding: i32, +) -> __m512 { + macro_rules! call { + ($imm4:expr) => { + vscalefps(a.as_f32x16(), b.as_f32x16(), src.as_f32x16(), k, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
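// Illustration only, not one of the patch hunks: the `rounding` argument of the *_round intrinsics
// added here is a rounding-direction constant OR'ed with _MM_FROUND_NO_EXC, or
// _MM_FROUND_CUR_DIRECTION on its own, exactly as the tests later in this patch pass it.
// A minimal call-site sketch (hypothetical function name; assumes an AVX512F-capable CPU):
#[target_feature(enable = "avx512f")]
unsafe fn scalef_round_ps_sketch() -> __m512 {
    let a = _mm512_set1_ps(1.);
    let b = _mm512_set1_ps(3.);
    // round to nearest with exceptions suppressed; every lane becomes 1.0 * 2^3 = 8.0
    _mm512_scalef_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
}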
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_round_ps&expand=4888) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_scalef_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + rounding: i32, +) -> __m512 { + macro_rules! call { + ($imm4:expr) => { + vscalefps( + a.as_f32x16(), + b.as_f32x16(), + _mm512_setzero_ps().as_f32x16(), + k, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_round_pd&expand=4886) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_scalef_round_pd(a: __m512d, b: __m512d, rounding: i32) -> __m512d { + macro_rules! call { + ($imm4:expr) => { + vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + _mm512_setzero_pd().as_f64x8(), + 0b11111111, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_round_pd&expand=4884) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_scalef_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, + rounding: i32, +) -> __m512d { + macro_rules! 
call { + ($imm4:expr) => { + vscalefpd(a.as_f64x8(), b.as_f64x8(), src.as_f64x8(), k, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_round_pd&expand=4885) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_scalef_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + rounding: i32, +) -> __m512d { + macro_rules! call { + ($imm4:expr) => { + vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + _mm512_setzero_pd().as_f64x8(), + k, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -17579,6 +17865,10 @@ extern "C" { fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16; #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"] fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.scalef.ps.512"] + fn vscalefps(a: f32x16, b: f32x16, src: f32x16, mask: u16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"] + fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8; #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; @@ -19678,6 +19968,41 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_scalef_ps(a, b); + let e = _mm512_set1_ps(8.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_mask_scalef_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_scalef_ps(a, 0b11111111_00000000, a, b); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_maskz_scalef_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_scalef_ps(0b11111111_00000000, a, b); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_ps() { let a = _mm512_set1_ps(10.); @@ -21064,6 +21389,54 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_scalef_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm512_set1_ps(8.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = + _mm512_mask_scalef_round_ps(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512(r, a); + let r = _mm512_mask_scalef_round_ps( + a, + 0b11111111_00000000, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = + _mm512_maskz_scalef_round_ps(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_scalef_round_ps( + 0b11111111_00000000, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_ps() { let a = _mm512_set1_ps(10.); diff --git 
a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b653a55bcb..f6f80a323c 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -1024,6 +1024,37 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_scalef_pd(a, b); + let e = _mm512_set1_pd(8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_mask_scalef_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_scalef_pd(a, 0b11110000, a, b); + let e = _mm512_set_pd(8., 8., 8., 8., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_maskz_scalef_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_scalef_pd(0b11110000, a, b); + let e = _mm512_set_pd(8., 8., 8., 8., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_pd() { let a = _mm512_set1_pd(10.); @@ -2744,6 +2775,50 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_scalef_round_pd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm512_set1_pd(8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = + _mm512_mask_scalef_round_pd(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512d(r, a); + let r = _mm512_mask_scalef_round_pd( + a, + 0b11110000, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_set_pd(8., 8., 8., 8., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = + _mm512_maskz_scalef_round_pd(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_scalef_round_pd( + 0b11110000, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_set_pd(8., 8., 8., 8., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_pd() { let a = _mm512_set1_pd(10.); From a6274653d84895583c23f9e84ef853144ef59b67 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 15:49:39 +0000 Subject: [PATCH 04/25] reduce_mul: epi64, reduce_max: epi64,epu64, reduce_min: epi64,epu64, reduce_and: epi64, reduce_or: epi64 --- crates/core_arch/avx512f.md | 32 ++-- crates/core_arch/src/x86/avx512f.rs | 178 ++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 105 +++++++++++++ crates/stdarch-verify/tests/x86-intel.rs | 17 ++- 4 files changed, 310 insertions(+), 22 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index ab9e4d9545..4184997d23 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -576,29 +576,29 @@ * [x] 
[`_mm512_mask_rcp14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_pd&expand=5236) * [x] [`_mm512_mask_rcp14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_ps&expand=5236) * [x] [`_mm512_mask_reduce_add_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi64&expand=5236) * [x] [`_mm512_mask_reduce_add_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_pd&expand=5236) * [x] [`_mm512_mask_reduce_add_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_ps&expand=5236) * [x] [`_mm512_mask_reduce_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi64&expand=5236) * [x] [`_mm512_mask_reduce_max_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi64&expand=5236) * [x] [`_mm512_mask_reduce_max_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu32&expand=5236) - * [ ] [`_mm512_mask_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu64&expand=5236) + * [x] [`_mm512_mask_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu64&expand=5236) * [x] [`_mm512_mask_reduce_max_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_pd&expand=5236) * [x] [`_mm512_mask_reduce_max_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_ps&expand=5236) * [x] [`_mm512_mask_reduce_min_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi64&expand=5236) * [x] [`_mm512_mask_reduce_min_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu32&expand=5236) - * [ ] [`_mm512_mask_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu64&expand=5236) + * [x] [`_mm512_mask_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu64&expand=5236) * [x] 
[`_mm512_mask_reduce_min_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_pd&expand=5236) * [x] [`_mm512_mask_reduce_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_ps&expand=5236) * [x] [`_mm512_mask_reduce_mul_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi64&expand=5236) * [x] [`_mm512_mask_reduce_mul_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_pd&expand=5236) * [x] [`_mm512_mask_reduce_mul_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_ps&expand=5236) * [x] [`_mm512_mask_reduce_or_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi64&expand=5236) * [x] [`_mm512_mask_rol_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi32&expand=5236) * [x] [`_mm512_mask_rol_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi64&expand=5236) * [x] [`_mm512_mask_rolv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rolv_epi32&expand=5236) @@ -986,29 +986,29 @@ * [x] [`_mm512_rcp14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_pd&expand=5236) * [x] [`_mm512_rcp14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_ps&expand=5236) * [x] [`_mm512_reduce_add_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi32&expand=5236) - * [ ] [`_mm512_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi64&expand=5236) + * [x] [`_mm512_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi64&expand=5236) * [x] [`_mm512_reduce_add_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_pd&expand=5236) * [x] [`_mm512_reduce_add_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ps&expand=5236) * [x] [`_mm512_reduce_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi32&expand=5236) - * [ ] [`_mm512_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi64&expand=5236) + * [x] [`_mm512_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi64&expand=5236) * [x] [`_mm512_reduce_max_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi32&expand=5236) - * [ ] [`_mm512_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi64&expand=5236) + * [x] 
[`_mm512_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi64&expand=5236) * [x] [`_mm512_reduce_max_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu32&expand=5236) - * [ ] [`_mm512_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu64&expand=5236) + * [x] [`_mm512_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu64&expand=5236) * [x] [`_mm512_reduce_max_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_pd&expand=5236) * [x] [`_mm512_reduce_max_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ps&expand=5236) * [x] [`_mm512_reduce_min_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi32&expand=5236) - * [ ] [`_mm512_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi64&expand=5236) + * [x] [`_mm512_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi64&expand=5236) * [x] [`_mm512_reduce_min_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu32&expand=5236) - * [ ] [`_mm512_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu64&expand=5236) + * [x] [`_mm512_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu64&expand=5236) * [x] [`_mm512_reduce_min_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_pd&expand=5236) * [x] [`_mm512_reduce_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ps&expand=5236) * [x] [`_mm512_reduce_mul_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi32&expand=5236) - * [ ] [`_mm512_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi64&expand=5236) + * [x] [`_mm512_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi64&expand=5236) * [x] [`_mm512_reduce_mul_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_pd&expand=5236) * [x] [`_mm512_reduce_mul_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ps&expand=5236) * [x] [`_mm512_reduce_or_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi32&expand=5236) - * [ ] [`_mm512_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi64&expand=5236) + * [x] [`_mm512_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi64&expand=5236) * [x] [`_mm512_rol_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi32&expand=5236) * [x] [`_mm512_rol_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi64&expand=5236) * [x] [`_mm512_rolv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rolv_epi32&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 0a91c92706..95c3794b09 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ 
b/crates/core_arch/src/x86/avx512f.rs @@ -16843,6 +16843,19 @@ pub unsafe fn _mm512_reduce_add_epi32(a: __m512i) -> i32 { simd_reduce_add_unordered(a.as_i32x16()) } +/// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi32&expand=4555) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 { + simd_reduce_add_unordered(simd_select_bitmask( + k, + a.as_i32x16(), + _mm512_setzero_si512().as_i32x16(), + )) +} + /// Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_add_epi64&expand=4558) @@ -16852,16 +16865,16 @@ pub unsafe fn _mm512_reduce_add_epi64(a: __m512i) -> i64 { simd_reduce_add_unordered(a.as_i64x8()) } -/// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a. +/// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi32&expand=4555) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi64&expand=4557) #[inline] #[target_feature(enable = "avx512f")] -pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 { +pub unsafe fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 { simd_reduce_add_unordered(simd_select_bitmask( k, - a.as_i32x16(), - _mm512_setzero_si512().as_i32x16(), + a.as_i64x8(), + _mm512_setzero_si512().as_i64x8(), )) } @@ -16931,6 +16944,28 @@ pub unsafe fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 { )) } +/// Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_mul_epi64&expand=4602) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 { + simd_reduce_mul_unordered(a.as_i64x8()) +} + +/// Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_mul_epi64&expand=4601) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 { + simd_reduce_mul_unordered(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(1).as_i64x8(), + )) +} + /// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_mul_ps&expand=4606) @@ -16997,6 +17032,28 @@ pub unsafe fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 { )) } +/// Reduce the packed signed 64-bit integers in a by maximum. Returns the maximum of all elements in a. 
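// Illustration only, not one of the patch hunks: the masked reductions above substitute the
// operation's identity element into the lanes that k leaves inactive before folding (0 for add,
// 1 for multiply), so only the active lanes influence the result. A scalar model of
// _mm512_mask_reduce_add_epi64 under the assumption that bit i of k selects lane i
// (hypothetical helper, not part of the crate):
fn mask_reduce_add_epi64_model(k: u8, lanes: [i64; 8]) -> i64 {
    lanes
        .iter()
        .enumerate()
        // inactive lanes contribute the additive identity, 0
        .map(|(i, &x)| if k & (1 << i) != 0 { x } else { 0 })
        .sum()
}
// e.g. mask_reduce_add_epi64_model(0b11110000, [1; 8]) == 4, matching test_mm512_mask_reduce_add_epi64.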
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epi64&expand=4578) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_max_epi64(a: __m512i) -> i64 { + simd_reduce_max(a.as_i64x8()) +} + +/// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_epi64&expand=4577) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 { + simd_reduce_max(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(i64::MIN).as_i64x8(), + )) +} + /// Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epu32&expand=4580) @@ -17019,6 +17076,28 @@ pub unsafe fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 { )) } +/// Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epu64&expand=4582) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_max_epu64(a: __m512i) -> u64 { + simd_reduce_max(a.as_u64x8()) +} + +/// Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_epu64&expand=4581) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 { + simd_reduce_max(simd_select_bitmask( + k, + a.as_u64x8(), + _mm512_set1_epi64(0).as_u64x8(), + )) +} + /// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_ps&expand=4586) @@ -17085,6 +17164,28 @@ pub unsafe fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 { )) } +/// Reduce the packed signed 64-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epi64&expand=4590) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_min_epi64(a: __m512i) -> i64 { + simd_reduce_min(a.as_i64x8()) +} + +/// Reduce the packed signed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_epi64&expand=4589) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 { + simd_reduce_min(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(i64::MAX).as_i64x8(), + )) +} + /// Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a.
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epu32&expand=4592) @@ -17107,6 +17208,28 @@ pub unsafe fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 { )) } +/// Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epu64&expand=4594) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_min_epu64(a: __m512i) -> u64 { + simd_reduce_min(a.as_u64x8()) +} + +/// Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_epu64&expand=4593) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 { + simd_reduce_min(simd_select_bitmask( + k, + a.as_u64x8(), + _mm512_set1_epi64(-1).as_u64x8(), + )) +} + /// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_ps&expand=4598) @@ -17191,6 +17314,29 @@ pub unsafe fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 { )) } +/// Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_and_epi64&expand=4566) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_and_epi64(a: __m512i) -> i64 { + simd_reduce_and(a.as_i64x8()) +} + +/// Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_and_epi64&expand=4565) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(-1) + .as_i64x8(), + )) +} + /// Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_or_epi32&expand=4608) @@ -17213,6 +17359,28 @@ pub unsafe fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 { )) } +/// Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_or_epi64&expand=4610) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_or_epi64(a: __m512i) -> i64 { + simd_reduce_or(a.as_i64x8()) +} + +/// Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_or_epi64&expand=4609) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 { + simd_reduce_or(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_setzero_si512().as_i64x8(), + )) +} + /// Returns vector of type `__m512d` with undefined elements. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index f6f80a323c..92c300ca17 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -5814,6 +5814,13 @@ mod tests { assert_eq!(8, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_add_epi64() { + let a = _mm512_set1_epi64(1); + let e: i64 = _mm512_mask_reduce_add_epi64(0b11110000, a); + assert_eq!(4, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_add_pd() { let a = _mm512_set1_pd(1.); @@ -5828,6 +5835,20 @@ mod tests { assert_eq!(4., e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_mul_epi64() { + let a = _mm512_set1_epi64(2); + let e: i64 = _mm512_reduce_mul_epi64(a); + assert_eq!(256, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_mul_epi64() { + let a = _mm512_set1_epi64(2); + let e: i64 = _mm512_mask_reduce_mul_epi64(0b11110000, a); + assert_eq!(16, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_mul_pd() { let a = _mm512_set1_pd(2.); @@ -5842,6 +5863,34 @@ mod tests { assert_eq!(16., e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_reduce_max_epi64(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_mask_reduce_max_epi64(0b11110000, a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = _mm512_reduce_max_epu64(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = _mm512_mask_reduce_max_epu64(0b11110000, a); + assert_eq!(3, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_max_pd() { let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); @@ -5856,6 +5905,34 @@ mod tests { assert_eq!(3., e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_reduce_min_epi64(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_mask_reduce_min_epi64(0b11110000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = _mm512_reduce_min_epu64(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = 
_mm512_mask_reduce_min_epu64(0b11110000, a); + assert_eq!(0, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_min_pd() { let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); @@ -5870,6 +5947,34 @@ mod tests { assert_eq!(0., e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_and_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_reduce_and_epi64(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_and_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_mask_reduce_and_epi64(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_or_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_reduce_or_epi64(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_or_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_mask_reduce_or_epi64(0b11110000, a); + assert_eq!(1, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_extractf64x4_pd() { let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index b9aba9461a..d5096a6903 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -577,7 +577,22 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { | "_mm512_setr4_epi64" | "_mm512_set_epi64" | "_mm512_setr_epi64" - | "_mm512_reduce_add_epi64" => true, + | "_mm512_reduce_add_epi64" + | "_mm512_mask_reduce_add_epi64" + | "_mm512_reduce_mul_epi64" + | "_mm512_mask_reduce_mul_epi64" + | "_mm512_reduce_max_epi64" + | "_mm512_mask_reduce_max_epi64" + | "_mm512_reduce_max_epu64" + | "_mm512_mask_reduce_max_epu64" + | "_mm512_reduce_min_epi64" + | "_mm512_mask_reduce_min_epi64" + | "_mm512_reduce_min_epu64" + | "_mm512_mask_reduce_min_epu64" + | "_mm512_reduce_and_epi64" + | "_mm512_mask_reduce_and_epi64" + | "_mm512_reduce_or_epi64" + | "_mm512_mask_reduce_or_epi64" => true, // These return a 64-bit argument but they're assembled from other // 32-bit registers, so these work on 32-bit just fine. 
See #308 for From 1f662e9c324a330b44b5af2080ef4037e890cc0b Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 17:41:25 +0000 Subject: [PATCH 05/25] fixupimm_round_ps --- crates/core_arch/avx512f.md | 2 +- crates/core_arch/src/x86/avx512f.rs | 176 ++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 4184997d23..742fd7bda1 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -162,7 +162,7 @@ * [ ] [`_mm512_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=5236) * [ ] [`_mm512_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=5236) * [ ] [`_mm512_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=5236) - * [ ] [`_mm512_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=5236) + * [x] [`_mm512_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=5236) * [x] [`_mm512_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pd&expand=5236) * [x] [`_mm512_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ps&expand=5236) * [x] [`_mm512_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 95c3794b09..38b3177e56 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -5500,6 +5500,95 @@ pub unsafe fn _mm512_maskz_scalef_round_pd( transmute(r) } +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_round_ps&expand=2505) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm512_fixupimm_round_ps( + a: __m512, + b: __m512, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vfixupimmps( + a.as_f32x16(), + b.as_f32x16(), + c.as_i32x16(), + $imm8, + 0b11111111_11111111, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
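// Illustration only, not one of the patch hunks: a call-site sketch that mirrors
// test_mm512_fixupimm_round_ps further down. With imm8 = 5 and every lane of c set to i32::MAX,
// the NaN lanes of a come back as +0.0; the precise meaning of imm8 and of the per-lane tokens in c
// follows Intel's VFIXUPIMM table, which this sketch does not try to reproduce.
// (hypothetical function name; assumes an AVX512F-capable CPU)
#[target_feature(enable = "avx512f")]
unsafe fn fixupimm_round_ps_sketch() -> __m512 {
    let a = _mm512_set1_ps(f32::NAN);
    let b = _mm512_set1_ps(f32::MAX);
    let c = _mm512_set1_epi32(i32::MAX);
    // the test added in this patch asserts that this returns _mm512_set1_ps(0.0)
    _mm512_fixupimm_round_ps(a, b, c, 5, _MM_FROUND_CUR_DIRECTION)
}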
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_round_ps&expand=2506) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm512_mask_fixupimm_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vfixupimmps(a.as_f32x16(), b.as_f32x16(), c.as_i32x16(), $imm8, k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_round_ps&expand=2507) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm512_maskz_fixupimm_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vfixupimmps( + a.as_f32x16(), + b.as_f32x16(), + c.as_i32x16(), + $imm8, + 0b11111111_11111111, + $imm4, + ) + }; + } + let r: f32x16 = constify_imm8_roundscale!(imm8, sae, call); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, r, zero)) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -18038,6 +18127,11 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"] fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.512"] + fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"] + fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"] @@ -21605,6 +21699,88 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_round_ps() { + let a = _mm512_set1_ps(f32::NAN); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_fixupimm_round_ps(a, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_ps(0.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_round_ps() { + let a = _mm512_set_ps( + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + 1., + 1., + 1., + 1., + 1., + 1., + 1., + 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_mask_fixupimm_round_ps( + a, + 0b11111111_00000000, + b, + c, + 5, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_round_ps() { + let a = _mm512_set_ps( + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + 1., + 1., + 1., + 1., + 1., + 1., + 1., + 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_maskz_fixupimm_round_ps( + 0b11111111_00000000, + a, + b, + c, + 5, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_ps() { let a = _mm512_set1_ps(10.); From 942481b419976f19dbb61ba9c86c2b2b742a1d88 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 20:19:48 +0000 Subject: [PATCH 06/25] fixupimm_round_pd --- crates/core_arch/avx512f.md | 2 +- crates/core_arch/src/x86/avx512f.rs | 93 +++++++++++++++++++++++--- crates/core_arch/src/x86_64/avx512f.rs | 20 ++++++ 3 files changed, 106 insertions(+), 9 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 742fd7bda1..ab7519d643 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -778,7 +778,7 @@ * [ ] [`_mm512_maskz_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=5236) * [ ] [`_mm512_maskz_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=5236) * [ ] [`_mm512_maskz_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=5236) - * [ ] 
[`_mm512_maskz_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=5236) + * [x] [`_mm512_maskz_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=5236) * [x] [`_mm512_maskz_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pd&expand=5236) * [x] [`_mm512_maskz_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ps&expand=5236) * [x] [`_mm512_maskz_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 38b3177e56..a08845476d 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -5574,19 +5574,92 @@ pub unsafe fn _mm512_maskz_fixupimm_round_ps( ) -> __m512 { macro_rules! call { ($imm8:expr, $imm4:expr) => { - vfixupimmps( - a.as_f32x16(), - b.as_f32x16(), - c.as_i32x16(), + vfixupimmpsz(a.as_f32x16(), b.as_f32x16(), c.as_i32x16(), $imm8, k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_round_pd&expand=2502) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm512_fixupimm_round_pd( + a: __m512d, + b: __m512d, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vfixupimmpd( + a.as_f64x8(), + b.as_f64x8(), + c.as_i64x8(), $imm8, - 0b11111111_11111111, + 0b11111111, $imm4, ) }; } - let r: f32x16 = constify_imm8_roundscale!(imm8, sae, call); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, r, zero)) + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_round_pd&expand=2503) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm512_mask_fixupimm_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512d { + macro_rules! 
call { + ($imm8:expr, $imm4:expr) => { + vfixupimmpd(a.as_f64x8(), b.as_f64x8(), c.as_i64x8(), $imm8, k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_round_pd&expand=2504) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm512_maskz_fixupimm_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vfixupimmpdz(a.as_f64x8(), b.as_f64x8(), c.as_i64x8(), $imm8, k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. @@ -18131,6 +18204,10 @@ extern "C" { fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"] fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.512"] + fn vfixupimmpsz(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"] + fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 92c300ca17..bb359f1e38 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -1055,6 +1055,26 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_round_pd() { + let a = _mm512_set1_pd(f64::NAN); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_fixupimm_round_pd(a, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(0.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_round_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_mask_fixupimm_round_pd(a, 0b11110000, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_pd() { let a = _mm512_set1_pd(10.); From 66e2d9b59d18fd6fe8cb533bb9fc603f2a92114c Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 22:35:17 +0000 Subject: [PATCH 07/25] fixupimm: ps,pd --- crates/core_arch/avx512f.md | 20 +-- 
crates/core_arch/src/x86/avx512f.rs | 236 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 48 ++++- 3 files changed, 290 insertions(+), 14 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index ab7519d643..eafcd9291b 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -159,9 +159,9 @@ * [x] [`_mm512_extractf64x4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extractf64x4_pd&expand=5236) * [x] [`_mm512_extracti32x4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti32x4_epi32&expand=5236) * [x] [`_mm512_extracti64x4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti64x4_epi64&expand=5236) - * [ ] [`_mm512_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=5236) - * [ ] [`_mm512_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=5236) - * [ ] [`_mm512_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=5236) + * [x] [`_mm512_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=5236) + * [x] [`_mm512_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=5236) + * [x] [`_mm512_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=5236) * [x] [`_mm512_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=5236) * [x] [`_mm512_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pd&expand=5236) * [x] [`_mm512_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ps&expand=5236) @@ -450,10 +450,10 @@ * [x] [`_mm512_mask_extractf64x4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extractf64x4_pd&expand=5236) * [x] [`_mm512_mask_extracti32x4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti32x4_epi32&expand=5236) * [x] [`_mm512_mask_extracti64x4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti64x4_epi64&expand=5236) - * [ ] [`_mm512_mask_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_pd&expand=5236) - * [ ] [`_mm512_mask_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_ps&expand=5236) - * [ ] [`_mm512_mask_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_pd&expand=5236) - * [ ] [`_mm512_mask_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_ps&expand=5236) + * [x] [`_mm512_mask_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_pd&expand=5236) + * [x] [`_mm512_mask_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_ps&expand=5236) + * [x] [`_mm512_mask_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_pd&expand=5236) + * [x] 
[`_mm512_mask_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_ps&expand=5236) * [x] [`_mm512_mask_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pd&expand=5236) * [x] [`_mm512_mask_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ps&expand=5236) * [x] [`_mm512_mask_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pd&expand=5236) @@ -775,9 +775,9 @@ * [x] [`_mm512_maskz_extractf64x4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extractf64x4_pd&expand=5236) * [x] [`_mm512_maskz_extracti32x4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti32x4_epi32&expand=5236) * [x] [`_mm512_maskz_extracti64x4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti64x4_epi64&expand=5236) - * [ ] [`_mm512_maskz_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=5236) - * [ ] [`_mm512_maskz_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=5236) - * [ ] [`_mm512_maskz_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=5236) + * [x] [`_mm512_maskz_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=5236) + * [x] [`_mm512_maskz_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=5236) + * [x] [`_mm512_maskz_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=5236) * [x] [`_mm512_maskz_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=5236) * [x] [`_mm512_maskz_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pd&expand=5236) * [x] [`_mm512_maskz_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ps&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a08845476d..164630ff51 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2244,6 +2244,174 @@ pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m )) } +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_ps&expand=2499) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_fixupimm_ps(a: __m512, b: __m512, c: __m512i, imm8: i32) -> __m512 { + macro_rules! 
call { + ($imm8:expr) => { + vfixupimmps( + a.as_f32x16(), + b.as_f32x16(), + c.as_i32x16(), + $imm8, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_ps&expand=2500) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_fixupimm_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512i, + imm8: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vfixupimmps( + a.as_f32x16(), + b.as_f32x16(), + c.as_i32x16(), + $imm8, + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_ps&expand=2501) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_maskz_fixupimm_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512i, + imm8: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vfixupimmpsz( + a.as_f32x16(), + b.as_f32x16(), + c.as_i32x16(), + $imm8, + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_pd&expand=2490) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_fixupimm_pd(a: __m512d, b: __m512d, c: __m512i, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vfixupimmpd( + a.as_f64x8(), + b.as_f64x8(), + c.as_i64x8(), + $imm8, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_pd&expand=2491) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_fixupimm_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512i, + imm8: i32, +) -> __m512d { + macro_rules! 
call { + ($imm8:expr) => { + vfixupimmpd( + a.as_f64x8(), + b.as_f64x8(), + c.as_i64x8(), + $imm8, + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_pd&expand=2492) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_maskz_fixupimm_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512i, + imm8: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vfixupimmpdz( + a.as_f64x8(), + b.as_f64x8(), + c.as_i64x8(), + $imm8, + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. /// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -20342,6 +20510,74 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_ps() { + let a = _mm512_set1_ps(f32::NAN); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_fixupimm_ps(a, b, c, 5); + let e = _mm512_set1_ps(0.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_ps() { + let a = _mm512_set_ps( + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + 1., + 1., + 1., + 1., + 1., + 1., + 1., + 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_mask_fixupimm_ps(a, 0b11111111_00000000, b, c, 5); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_ps() { + let a = _mm512_set_ps( + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + 1., + 1., + 1., + 1., + 1., + 1., + 1., + 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_maskz_fixupimm_ps(0b11111111_00000000, a, b, c, 5); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_ps() { let a = _mm512_set1_ps(10.); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index bb359f1e38..734470a9bb 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -1056,25 +1056,35 @@ mod tests { } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fixupimm_round_pd() { + unsafe fn test_mm512_fixupimm_pd() { let a = _mm512_set1_pd(f64::NAN); let b = _mm512_set1_pd(f64::MAX); let c = _mm512_set1_epi64(i32::MAX as i64); - let r = 
_mm512_fixupimm_round_pd(a, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_fixupimm_pd(a, b, c, 5); let e = _mm512_set1_pd(0.0); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fixupimm_round_pd() { + unsafe fn test_mm512_mask_fixupimm_pd() { let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); let b = _mm512_set1_pd(f64::MAX); let c = _mm512_set1_epi64(i32::MAX as i64); - let r = _mm512_mask_fixupimm_round_pd(a, 0b11110000, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_fixupimm_pd(a, 0b11110000, b, c, 5); let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.); assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_maskz_fixupimm_pd(0b11110000, a, b, c, 5); + let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_pd() { let a = _mm512_set1_pd(10.); @@ -2839,6 +2849,36 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_round_pd() { + let a = _mm512_set1_pd(f64::NAN); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_fixupimm_round_pd(a, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(0.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_round_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_mask_fixupimm_round_pd(a, 0b11110000, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_round_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_maskz_fixupimm_round_pd(0b11110000, a, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_pd() { let a = _mm512_set1_pd(10.); From aa32a1f233dbba9b789388ce1f1cfa2ae602a427 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 23:51:44 +0000 Subject: [PATCH 08/25] ternarylogic_epi32 --- crates/core_arch/avx512f.md | 6 +- crates/core_arch/src/x86/avx512f.rs | 103 ++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index eafcd9291b..055e3df356 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -662,7 +662,7 @@ * [x] [`_mm512_mask_sub_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ps&expand=5236) * [x] [`_mm512_mask_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_pd&expand=5236) * [x] [`_mm512_mask_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5236) - * [ ] 
[`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236) + * [x] [`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236) * [ ] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236) * [ ] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236) * [ ] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236) @@ -926,7 +926,7 @@ * [x] [`_mm512_maskz_sub_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ps&expand=5236) * [x] [`_mm512_maskz_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_pd&expand=5236) * [x] [`_mm512_maskz_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ps&expand=5236) - * [ ] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236) + * [x] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236) * [ ] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236) * [x] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236) * [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236) @@ -1106,7 +1106,7 @@ * [x] [`_mm512_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_pd&expand=5236) * [x] [`_mm512_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ps&expand=5236) * [ ] [`_mm512_svml_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_svml_round_pd&expand=5236) - * [ ] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236) + * [x] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236) * [ ] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236) * [ ] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236) * [ ] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 164630ff51..080de6a1cc 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2412,6 +2412,70 @@ pub unsafe fn _mm512_maskz_fixupimm_pd( transmute(r) } +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. 
For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_ternarylogic_epi32&expand=5867) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_ternarylogic_epi32(a: __m512i, b: __m512i, c: __m512i, imm8: i32) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16(), $imm8) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_ternarylogic_epi32&expand=5865) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_ternarylogic_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16(), $imm8) + }; + } + let ternarylogic = constify_imm8_sae!(imm8, call); + transmute(simd_select_bitmask(k, ternarylogic, src.as_i32x16())) +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_ternarylogic_epi32&expand=5866) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_maskz_ternarylogic_epi32( + k: __mmask16, + a: __m512i, + b: __m512i, + c: __m512i, + imm8: i32, +) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16(), $imm8) + }; + } + let ternarylogic = constify_imm8_sae!(imm8, call); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, ternarylogic, zero)) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -18377,6 +18441,11 @@ extern "C" { #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"] fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.pternlog.d.512"] + fn vpternlogd(a: i32x16, b: i32x16, c: i32x16, sae: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.pternlog.q.512"] + fn vpternlogq(a: i64x8, b: i64x8, c: i64x8, sae: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"] @@ -20578,6 +20647,40 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_ternarylogic_epi32() { + let a = _mm512_set1_epi32(1 << 2); + let b = _mm512_set1_epi32(1 << 1); + let c = _mm512_set1_epi32(1 << 0); + let r = _mm512_ternarylogic_epi32(a, b, c, 8); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_ternarylogic_epi32() { + let src = _mm512_set1_epi32(1 << 2); + let a = _mm512_set1_epi32(1 << 1); + let b = _mm512_set1_epi32(1 << 0); + let r = _mm512_mask_ternarylogic_epi32(src, 0, a, b, 8); + assert_eq_m512i(r, src); + let r = _mm512_mask_ternarylogic_epi32(src, 0b11111111_11111111, a, b, 8); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_ternarylogic_epi32() { + let a = _mm512_set1_epi32(1 << 2); + let b = _mm512_set1_epi32(1 << 1); + let c = _mm512_set1_epi32(1 << 0); + let r = _mm512_maskz_ternarylogic_epi32(0, a, b, c, 9); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_ternarylogic_epi32(0b11111111_11111111, a, b, c, 8); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_ps() { let a = _mm512_set1_ps(10.); From a25eaf3022bf1f9283a44a7a2c5b5caaf2c31680 Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 12 Oct 2020 00:03:59 +0000 Subject: [PATCH 09/25] ternarylogic_epi64 --- crates/core_arch/avx512f.md | 6 +-- crates/core_arch/src/x86/avx512f.rs | 66 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 34 +++++++++++++ 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 055e3df356..443adee4b9 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -663,7 +663,7 @@ * [x] [`_mm512_mask_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_pd&expand=5236) * [x] [`_mm512_mask_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5236) * [x] [`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236) - * [ ] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236) + * [x] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236) * [ ] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236) * [ ] 
[`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236) * [ ] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236) @@ -927,7 +927,7 @@ * [x] [`_mm512_maskz_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_pd&expand=5236) * [x] [`_mm512_maskz_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ps&expand=5236) * [x] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236) - * [ ] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236) + * [x] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236) * [x] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236) * [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236) * [x] [`_mm512_maskz_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=5236) @@ -1107,7 +1107,7 @@ * [x] [`_mm512_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ps&expand=5236) * [ ] [`_mm512_svml_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_svml_round_pd&expand=5236) * [x] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236) - * [ ] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236) + * [x] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236) * [ ] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236) * [ ] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236) * [ ] [`_mm512_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 080de6a1cc..c7156b6d4c 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2476,6 +2476,72 @@ pub unsafe fn _mm512_maskz_ternarylogic_epi32( transmute(simd_select_bitmask(k, ternarylogic, zero)) } +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
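// Illustrative sketch (not part of the patch): a scalar model of the per-bit rule described above
// for vpternlogd/vpternlogq. The bits of a, b and c at each position form a 3-bit index into imm8,
// and that bit of imm8 becomes the result bit; imm8 = 0xE8, for example, computes the bitwise
// majority of the three inputs.
fn ternarylogic_scalar(a: u64, b: u64, c: u64, imm8: u8) -> u64 {
    let mut dst = 0u64;
    for bit in 0..64 {
        let idx = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
        dst |= (((imm8 as u64) >> idx) & 1) << bit;
    }
    dst
}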
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_ternarylogic_epi64&expand=5876) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_ternarylogic_epi64(a: __m512i, b: __m512i, c: __m512i, imm8: i32) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8(), $imm8) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_ternarylogic_epi64&expand=5874) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_ternarylogic_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogq(src.as_i64x8(), a.as_i64x8(), b.as_i64x8(), $imm8) + }; + } + let ternarylogic = constify_imm8_sae!(imm8, call); + transmute(simd_select_bitmask(k, ternarylogic, src.as_i64x8())) +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_ternarylogic_epi64&expand=5875) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_maskz_ternarylogic_epi64( + k: __mmask8, + a: __m512i, + b: __m512i, + c: __m512i, + imm8: i32, +) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8(), $imm8) + }; + } + let ternarylogic = constify_imm8_sae!(imm8, call); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, ternarylogic, zero)) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 734470a9bb..3f7d67dc04 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -1085,6 +1085,40 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_ternarylogic_epi64() { + let a = _mm512_set1_epi64(1 << 2); + let b = _mm512_set1_epi64(1 << 1); + let c = _mm512_set1_epi64(1 << 0); + let r = _mm512_ternarylogic_epi64(a, b, c, 8); + let e = _mm512_set1_epi64(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_ternarylogic_epi64() { + let src = _mm512_set1_epi64(1 << 2); + let a = _mm512_set1_epi64(1 << 1); + let b = _mm512_set1_epi64(1 << 0); + let r = _mm512_mask_ternarylogic_epi64(src, 0, a, b, 8); + assert_eq_m512i(r, src); + let r = _mm512_mask_ternarylogic_epi64(src, 0b11111111, a, b, 8); + let e = _mm512_set1_epi64(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_ternarylogic_epi64() { + let a = _mm512_set1_epi64(1 << 2); + let b = _mm512_set1_epi64(1 << 1); + let c = _mm512_set1_epi64(1 << 0); + let r = _mm512_maskz_ternarylogic_epi64(0, a, b, c, 9); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_ternarylogic_epi64(0b11111111, a, b, c, 8); + let e = _mm512_set1_epi64(0); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_pd() { let a = _mm512_set1_pd(10.); From 7624f5bfa772518bcf4ab8edd2e0fe236a4d4765 Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 12 Oct 2020 21:16:37 +0000 Subject: [PATCH 10/25] int2mask, mask2int, stream: ps,pd,si512 --- crates/core_arch/avx512f.md | 10 +-- crates/core_arch/src/x86/avx512f.rs | 86 ++++++++++++++++++++++++++ crates/core_arch/src/x86/test.rs | 15 +++++ crates/core_arch/src/x86_64/avx512f.rs | 30 +++++++++ 4 files changed, 136 insertions(+), 5 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 443adee4b9..184fb92aa8 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -226,7 +226,7 @@ * [x] [`_mm512_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=5236) * [x] [`_mm512_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=5236) * [x] [`_mm512_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=5236) - * [ ] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236) + * [x] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236) * [x] [`_mm512_kand`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kand&expand=5236) * [x] [`_mm512_kandn`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kandn&expand=5236) * [x] [`_mm512_kmov`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kmov&expand=5236) @@ -251,7 +251,7 @@ * [x] [`_mm512_mask2_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi64&expand=5236) * [x] 
[`_mm512_mask2_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_pd&expand=5236) * [x] [`_mm512_mask2_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_ps&expand=5236) - * [ ] [`_mm512_mask2int`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2int&expand=5236) + * [x] [`_mm512_mask2int`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2int&expand=5236) * [x] [`_mm512_mask3_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pd&expand=5236) * [x] [`_mm512_mask3_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ps&expand=5236) * [x] [`_mm512_mask3_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pd&expand=5236) @@ -1096,9 +1096,9 @@ * [x] [`_mm512_storeu_epi64`] * [x] [`_mm512_storeu_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_si512&expand=5236) * [ ] [`_mm512_stream_load_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_load_si512&expand=5236) - * [ ] [`_mm512_stream_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_pd&expand=5236) - * [ ] [`_mm512_stream_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_ps&expand=5236) - * [ ] [`_mm512_stream_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_si512&expand=5236) + * [x] [`_mm512_stream_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_pd&expand=5236) + * [x] [`_mm512_stream_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_ps&expand=5236) + * [x] [`_mm512_stream_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_si512&expand=5236) * [x] [`_mm512_sub_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi32&expand=5236) * [x] [`_mm512_sub_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi64&expand=5236) * [x] [`_mm512_sub_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index c7156b6d4c..c5bfc267db 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -15812,6 +15812,61 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 { transmute(r) } +/// Converts integer mask into bitmask, storing the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_int2mask&expand=3189) +#[inline] +#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw +pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { + assert!(mask >= 0); + let r: u16 = mask as u16; + transmute(r) +} + +/// Converts bit mask k1 into an integer value, storing the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2int&expand=3544) +#[inline] +#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw +#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw +pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 { + let r: i32 = k1 as i32; + transmute(r) +} + +/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_ps&expand=5671) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovntps))] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { + intrinsics::nontemporal_store(mem_addr as *mut __m512, a); +} + +/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_pd&expand=5667) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { + intrinsics::nontemporal_store(mem_addr as *mut __m512d, a); +} + +/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_si512&expand=5675) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) { + intrinsics::nontemporal_store(mem_addr as *mut __m512i, a); +} + /// Sets packed 32-bit integers in `dst` with the supplied values. 
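// Illustrative sketch (not part of the patch): callers of the stream stores above must themselves
// guarantee the documented 64-byte alignment, e.g. with a #[repr(align(64))] wrapper, and would
// typically issue a store fence before the data is read elsewhere. `stream_store_example` is a
// hypothetical helper, not an API added by this change.
#[repr(align(64))]
struct Aligned64([f32; 16]);

#[target_feature(enable = "avx512f")]
unsafe fn stream_store_example(a: __m512) {
    let mut buf = Aligned64([0.0; 16]);
    _mm512_stream_ps(buf.0.as_mut_ptr(), a);
    _mm_sfence(); // make the non-temporal store globally visible
}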
/// /// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps) @@ -27213,6 +27268,37 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_int2mask() { + let a: i32 = 0b11001100_00110011; + let r = _mm512_int2mask(a); + let e: u16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask2int() { + let k1: __mmask16 = 0b11001100_00110011; + let r = _mm512_mask2int(k1); + let e: i32 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_stream_ps() { + #[repr(align(64))] + struct Memory { + pub data: [f32; 16], + } + let a = _mm512_set1_ps(7.0); + let mut mem = Memory { data: [-1.0; 16] }; + + _mm512_stream_ps(&mut mem.data[0] as *mut f32, a); + for i in 0..16 { + assert_eq!(mem.data[i], get_m512(a, i)); + } + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_add_epi32() { let a = _mm512_set1_epi32(1); diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index 594f680b73..9014d66fd0 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -71,6 +71,21 @@ pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 { transmute::<_, [f32; 8]>(a)[idx] } +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512(a: __m512, idx: usize) -> f32 { + transmute::<_, [f32; 16]>(a)[idx] +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512d(a: __m512d, idx: usize) -> f64 { + transmute::<_, [f64; 8]>(a)[idx] +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512i(a: __m512i, idx: usize) -> i64 { + transmute::<_, [i64; 8]>(a)[idx] +} + // These intrinsics doesn't exist on x86 b/c it requires a 64-bit register, // which doesn't exist on x86!
#[cfg(target_arch = "x86")] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 3f7d67dc04..3d5b22c64b 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -6257,4 +6257,34 @@ mod tests { _mm512_store_pd(&mut r as *mut _ as *mut f64, a); assert_eq_m512d(r, a); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_stream_pd() { + #[repr(align(64))] + struct Memory { + pub data: [f64; 8], + } + let a = _mm512_set1_pd(7.0); + let mut mem = Memory { data: [-1.0; 8] }; + + _mm512_stream_pd(&mut mem.data[0] as *mut f64, a); + for i in 0..8 { + assert_eq!(mem.data[i], get_m512d(a, i)); + } + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_stream_si512() { + #[repr(align(64))] + struct Memory { + pub data: [i64; 8], + } + let a = _mm512_set1_epi64(7); + let mut mem = Memory { data: [-1; 8] }; + + _mm512_stream_si512(&mut mem.data[0] as *mut i64, a); + for i in 0..8 { + assert_eq!(mem.data[i], get_m512i(a, i)); + } + } } From c0051783d8bb30d5f301d1d6fc2f694a91ff113f Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 12 Oct 2020 22:03:55 +0000 Subject: [PATCH 11/25] mask_set1: epi32,epi64, maskz_set1: epi32,epi64 --- crates/core_arch/avx512f.md | 6 +-- crates/core_arch/src/x86/avx512f.rs | 67 ++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 21 ++++++++ crates/stdarch-verify/tests/x86-intel.rs | 4 +- 4 files changed, 94 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 184fb92aa8..2534fb7a5f 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -617,7 +617,7 @@ * [x] [`_mm512_mask_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=5236) * [x] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236) * [x] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236) - * [ ] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) + * [x] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) * [ ] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236) * [x] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236) * [x] [`_mm512_mask_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5236) @@ -889,8 +889,8 @@ * [x] [`_mm512_maskz_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=5236) * [x] [`_mm512_maskz_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=5236) * [x] [`_mm512_maskz_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=5236) - * [ ] [`_mm512_maskz_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=5236) - * [ ] [`_mm512_maskz_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=5236) + * [x] 
[`_mm512_maskz_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=5236) + * [x] [`_mm512_maskz_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=5236) * [x] [`_mm512_maskz_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5236) * [x] [`_mm512_maskz_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32x4&expand=5236) * [x] [`_mm512_maskz_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index c5bfc267db..18b0568f08 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -15990,6 +15990,29 @@ pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i { transmute(i32x16::splat(a)) } +/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_set1_epi32&expand=4951) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub unsafe fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i { + let r = _mm512_set1_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) +} + +/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_set1_epi32&expand=4952) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub unsafe fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i { + let r = _mm512_set1_epi32(a).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, r, zero)) +} + /// Broadcast 64-bit integer `a` to all elements of `dst`. #[inline] #[target_feature(enable = "avx512f")] @@ -15997,6 +16020,29 @@ pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i { transmute(i64x8::splat(a)) } +/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_set1_epi64&expand=4959) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub unsafe fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i { + let r = _mm512_set1_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) +} + +/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
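// Illustrative sketch (not part of the patch): how the writemask and zeromask broadcasts above
// differ. Mask bit i selects lane i; unselected lanes keep `src` in the mask variant and become
// zero in the maskz variant.
#[target_feature(enable = "avx512f")]
unsafe fn masked_broadcast_example() {
    let src = _mm512_set1_epi32(-1);
    let r = _mm512_mask_set1_epi32(src, 0b00000000_11111111, 7); // lanes 0..=7 are 7, the rest stay -1
    let z = _mm512_maskz_set1_epi32(0b00000000_11111111, 7); // lanes 0..=7 are 7, the rest are 0
    let _ = (r, z);
}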
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_set1_epi64&expand=4960) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub unsafe fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i { + let r = _mm512_set1_epi64(a).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, r, zero)) +} + /// Set packed 64-bit integers in dst with the repeated 4 element sequence. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set4_epi64&expand=4983) @@ -27669,4 +27715,25 @@ mod tests { _mm512_store_ps(&mut r as *mut _ as *mut f32, a); assert_eq_m512(r, a); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_set1_epi32() { + let src = _mm512_set1_epi32(2); + let a: i32 = 11; + let r = _mm512_mask_set1_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_set1_epi32(src, 0b11111111_11111111, a); + let e = _mm512_set1_epi32(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_set1_epi32() { + let a: i32 = 11; + let r = _mm512_maskz_set1_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi32(0b11111111_11111111, a); + let e = _mm512_set1_epi32(11); + assert_eq_m512i(r, e); + } } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 3d5b22c64b..3dbbc77a89 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -6287,4 +6287,25 @@ mod tests { assert_eq!(mem.data[i], get_m512i(a, i)); } } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_set1_epi64() { + let src = _mm512_set1_epi64(2); + let a: i64 = 11; + let r = _mm512_mask_set1_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_set1_epi64(src, 0b11111111, a); + let e = _mm512_set1_epi64(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_set1_epi64() { + let a: i64 = 11; + let r = _mm512_maskz_set1_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi64(0b11111111, a); + let e = _mm512_set1_epi64(11); + assert_eq_m512i(r, e); + } } diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index d5096a6903..0e44f73a7f 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -592,7 +592,9 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { | "_mm512_reduce_and_epi64" | "_mm512_mask_reduce_and_epi64" | "_mm512_reduce_or_epi64" - | "_mm512_mask_reduce_or_epi64" => true, + | "_mm512_mask_reduce_or_epi64" + | "_mm512_mask_set1_epi64" + | "_mm512_maskz_set1_epi64" => true, // These return a 64-bit argument but they're assembled from other // 32-bit registers, so these work on 32-bit just fine. 
See #308 for From 4091be50ab71f34ae4f44813f40ecddf9e9315a2 Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 12 Oct 2020 23:03:04 +0000 Subject: [PATCH 12/25] test_epi32_mask, test_epi64_mask, testn_epi32_mask, testn_epi64_mask --- crates/core_arch/avx512f.md | 16 +-- crates/core_arch/src/x86/avx512f.rs | 138 ++++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 40 +++++++ 3 files changed, 185 insertions(+), 9 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 2534fb7a5f..43b2597510 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -664,10 +664,10 @@ * [x] [`_mm512_mask_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5236) * [x] [`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236) * [x] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236) - * [ ] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236) - * [ ] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236) - * [ ] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236) - * [ ] [`_mm512_mask_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5236) + * [x] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236) + * [x] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236) + * [x] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236) + * [x] [`_mm512_mask_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5236) * [x] [`_mm512_mask_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=5236) * [x] [`_mm512_mask_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=5236) * [x] [`_mm512_mask_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=5236) @@ -1108,10 +1108,10 @@ * [ ] [`_mm512_svml_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_svml_round_pd&expand=5236) * [x] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236) * [x] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236) - * [ ] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236) - * [ ] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236) - * [ ] 
[`_mm512_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5236) - * [ ] [`_mm512_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi64_mask&expand=5236) + * [x] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236) + * [x] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236) + * [x] [`_mm512_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5236) + * [x] [`_mm512_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi64_mask&expand=5236) * [x] [`_mm512_undefined_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_epi32&expand=5236) * [x] [`_mm512_undefined_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd&expand=5236) * [x] [`_mm512_undefined_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ps&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 18b0568f08..1ef726fba5 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -15827,13 +15827,109 @@ pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2int&expand=3544) #[inline] -#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw +#[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 { let r: i32 = k1 as i32; transmute(r) } +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_test_epi32_mask&expand=5890) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub unsafe fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi32_mask(and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_test_epi32_mask&expand=5889) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub unsafe fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. 
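+///
+/// A minimal usage sketch, mirroring the unit test added later in this patch:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(1 << 0);
+/// let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+/// // a & b is non-zero in every lane, so every bit of the result mask is set
+/// assert_eq!(_mm512_test_epi64_mask(a, b), 0b11111111);
+/// ```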
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_test_epi64_mask&expand=5896) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub unsafe fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi64_mask(and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_test_epi64_mask&expand=5895) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub unsafe fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_testn_epi32_mask&expand=5921) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub unsafe fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi32_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_testn_epi32_mask&expand=5920) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub unsafe fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpeq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_testn_epi64_mask&expand=5927) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub unsafe fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi64_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
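+///
+/// A minimal usage sketch, mirroring the unit test added later in this patch:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(1 << 0);
+/// let b = _mm512_set1_epi64(1 << 1);
+/// // a & b is zero in every lane, but only mask bits allowed by the writemask k survive
+/// assert_eq!(_mm512_mask_testn_epi64_mask(0, a, b), 0);
+/// assert_eq!(_mm512_mask_testn_epi64_mask(0b11111111, a, b), 0b11111111);
+/// ```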
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_testn_epi64_mask&expand=5926)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestnmq))]
+pub unsafe fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    let and = _mm512_and_epi64(a, b);
+    let zero = _mm512_setzero_si512();
+    _mm512_mask_cmpeq_epi64_mask(k, and, zero)
+}
+
 /// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_ps&expand=5671)
@@ -27330,6 +27426,46 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_test_epi32_mask() {
+        let a = _mm512_set1_epi32(1 << 0);
+        let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+        let r = _mm512_test_epi32_mask(a, b);
+        let e: __mmask16 = 0b11111111_11111111;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_test_epi32_mask() {
+        let a = _mm512_set1_epi32(1 << 0);
+        let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+        let r = _mm512_mask_test_epi32_mask(0, a, b);
+        assert_eq!(r, 0);
+        let r = _mm512_mask_test_epi32_mask(0b11111111_11111111, a, b);
+        let e: __mmask16 = 0b11111111_11111111;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_testn_epi32_mask() {
+        let a = _mm512_set1_epi32(1 << 0);
+        let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+        let r = _mm512_testn_epi32_mask(a, b);
+        let e: __mmask16 = 0b00000000_00000000;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_testn_epi32_mask() {
+        let a = _mm512_set1_epi32(1 << 0);
+        let b = _mm512_set1_epi32(1 << 1);
+        let r = _mm512_mask_testn_epi32_mask(0, a, b);
+        assert_eq!(r, 0);
+        let r = _mm512_mask_testn_epi32_mask(0b11111111_11111111, a, b);
+        let e: __mmask16 = 0b11111111_11111111;
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_stream_ps() {
         #[repr(align(32))]
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 3dbbc77a89..758a2b563c 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -6258,6 +6258,46 @@ mod tests {
         assert_eq_m512d(r, a);
     }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_test_epi64_mask() {
+        let a = _mm512_set1_epi64(1 << 0);
+        let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+        let r = _mm512_test_epi64_mask(a, b);
+        let e: __mmask8 = 0b11111111;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_test_epi64_mask() {
+        let a = _mm512_set1_epi64(1 << 0);
+        let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+        let r = _mm512_mask_test_epi64_mask(0, a, b);
+        assert_eq!(r, 0);
+        let r = _mm512_mask_test_epi64_mask(0b11111111, a, b);
+        let e: __mmask8 = 0b11111111;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_testn_epi64_mask() {
+        let a = _mm512_set1_epi64(1 << 0);
+        let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+        let r = _mm512_testn_epi64_mask(a, b);
+        let e: __mmask8 = 0b00000000;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_testn_epi64_mask() {
+        let a = _mm512_set1_epi64(1 << 0);
+        let b = _mm512_set1_epi64(1 << 1);
+        let r =
_mm512_mask_testn_epi64_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_testn_epi64_mask(0b11111111, a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_stream_pd() { #[repr(align(64))] From 71764a1817bee74ca790cf5257d431f3dcee52e4 Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 12 Oct 2020 23:37:07 +0000 Subject: [PATCH 13/25] mask_mov: epi32,epi64,ps,pd; maskz_mov: epi32,epi64,ps,pd --- crates/core_arch/avx512f.md | 18 ++-- crates/core_arch/src/x86/avx512f.rs | 130 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 38 ++++++++ 3 files changed, 177 insertions(+), 9 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 43b2597510..57b3266a0d 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -541,10 +541,10 @@ * [x] [`_mm512_mask_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ps&expand=5236) * [x] [`_mm512_mask_min_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_pd&expand=5236) * [x] [`_mm512_mask_min_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ps&expand=5236) - * [ ] [`_mm512_mask_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi32&expand=5236) - * [ ] [`_mm512_mask_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi64&expand=5236) - * [ ] [`_mm512_mask_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_pd&expand=5236) - * [ ] [`_mm512_mask_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_ps&expand=5236) + * [x] [`_mm512_mask_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi32&expand=5236) + * [x] [`_mm512_mask_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi64&expand=5236) + * [x] [`_mm512_mask_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_pd&expand=5236) + * [x] [`_mm512_mask_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_ps&expand=5236) * [x] [`_mm512_mask_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movedup_pd&expand=5236) * [x] [`_mm512_mask_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movehdup_ps&expand=5236) * [x] [`_mm512_mask_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_moveldup_ps&expand=5236) @@ -618,7 +618,7 @@ * [x] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236) * [x] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236) * [x] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) - * [ ] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236) + * [x] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236) * [x] 
[`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236) * [x] [`_mm512_mask_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5236) * [x] [`_mm512_mask_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5236) @@ -839,10 +839,10 @@ * [x] [`_mm512_maskz_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ps&expand=5236) * [x] [`_mm512_maskz_min_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_pd&expand=5236) * [x] [`_mm512_maskz_min_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ps&expand=5236) - * [ ] [`_mm512_maskz_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi32&expand=5236) - * [ ] [`_mm512_maskz_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi64&expand=5236) - * [ ] [`_mm512_maskz_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_pd&expand=5236) - * [ ] [`_mm512_maskz_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_ps&expand=5236) + * [x] [`_mm512_maskz_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi32&expand=5236) + * [x] [`_mm512_maskz_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi64&expand=5236) + * [x] [`_mm512_maskz_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_pd&expand=5236) + * [x] [`_mm512_maskz_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_ps&expand=5236) * [x] [`_mm512_maskz_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movedup_pd&expand=5236) * [x] [`_mm512_maskz_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movehdup_ps&expand=5236) * [x] [`_mm512_maskz_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_moveldup_ps&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 1ef726fba5..2ebfcca5b9 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -135,6 +135,98 @@ pub unsafe fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m5 transmute(simd_select_bitmask(k, abs, src.as_f64x8())) } +/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_epi32&expand=3801) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub unsafe fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + let mov = a.as_i32x16(); + transmute(simd_select_bitmask(k, mov, src.as_i32x16())) +} + +/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
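+///
+/// A minimal usage sketch, mirroring the unit test added later in this patch
+/// (`assert_eq_m512i` is the helper used by this crate's test module):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(2);
+/// // an all-zero mask zeroes every element; an all-ones mask passes a through unchanged
+/// assert_eq_m512i(_mm512_maskz_mov_epi32(0, a), _mm512_setzero_si512());
+/// assert_eq_m512i(_mm512_maskz_mov_epi32(0b11111111_11111111, a), a);
+/// ```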
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_epi32&expand=3802) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub unsafe fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i { + let mov = a.as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, mov, zero)) +} + +/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_epi64&expand=3807) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub unsafe fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + let mov = a.as_i64x8(); + transmute(simd_select_bitmask(k, mov, src.as_i64x8())) +} + +/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_epi64&expand=3808) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub unsafe fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i { + let mov = a.as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, mov, zero)) +} + +/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_ps&expand=3825) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub unsafe fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + let mov = a.as_f32x16(); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) +} + +/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_ps&expand=3826) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub unsafe fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 { + let mov = a.as_f32x16(); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, mov, zero)) +} + +/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_pd&expand=3819) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub unsafe fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + let mov = a.as_f64x8(); + transmute(simd_select_bitmask(k, mov, src.as_f64x8())) +} + +/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
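+///
+/// A minimal usage sketch, mirroring the unit test added later in this patch
+/// (`assert_eq_m512d` is the helper used by this crate's test module):
+///
+/// ```ignore
+/// let a = _mm512_set1_pd(2.);
+/// assert_eq_m512d(_mm512_maskz_mov_pd(0, a), _mm512_setzero_pd());
+/// assert_eq_m512d(_mm512_maskz_mov_pd(0b11111111, a), a);
+/// ```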
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_pd&expand=3820) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub unsafe fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d { + let mov = a.as_f64x8(); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, mov, zero)) +} + /// Add packed 32-bit integers in a and b, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_add_epi32&expand=100) @@ -19110,6 +19202,44 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(2); + let r = _mm512_mask_mov_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi32(src, 0b11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_epi32() { + let a = _mm512_set1_epi32(2); + let r = _mm512_maskz_mov_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi32(0b11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_ps() { + let src = _mm512_set1_ps(1.); + let a = _mm512_set1_ps(2.); + let r = _mm512_mask_mov_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_mov_ps(src, 0b11111111_11111111, a); + assert_eq_m512(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_ps() { + let a = _mm512_set1_ps(2.); + let r = _mm512_maskz_mov_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_mov_ps(0b11111111_11111111, a); + assert_eq_m512(r, a); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_add_epi32() { let a = _mm512_setr_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 758a2b563c..1b7c4ee088 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -68,6 +68,44 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_epi64() { + let src = _mm512_set1_epi64(1); + let a = _mm512_set1_epi64(2); + let r = _mm512_mask_mov_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi64(src, 0b11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_epi64() { + let a = _mm512_set1_epi64(2); + let r = _mm512_maskz_mov_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi64(0b11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_pd() { + let src = _mm512_set1_pd(1.); + let a = _mm512_set1_pd(2.); + let r = _mm512_mask_mov_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_mov_pd(src, 0b11111111, a); + assert_eq_m512d(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_pd() { + let a = _mm512_set1_pd(2.); + let r = _mm512_maskz_mov_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_mov_pd(0b11111111, a); + assert_eq_m512d(r, a); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_add_epi64() { let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); From be334bb202593eaf848600ee7fb49ab715ce6cd2 Mon Sep 17 00:00:00 2001 From: jironglin 
Date: Thu, 15 Oct 2020 13:31:26 +0000 Subject: [PATCH 14/25] mask_add: ss,sd --- crates/core_arch/avx512f.md | 8 +- crates/core_arch/src/x86/avx512f.rs | 120 ++ examples/connect5_14_avx512.rs | 2100 +++++++++++++++++++++++++++ 3 files changed, 2224 insertions(+), 4 deletions(-) create mode 100644 examples/connect5_14_avx512.rs diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 57b3266a0d..01328a43f8 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1240,8 +1240,8 @@ * [ ] [`_mm_mask3_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ss&expand=5236) * [ ] [`_mm_mask_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sd&expand=5236) * [ ] [`_mm_mask_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_ss&expand=5236) - * [ ] [`_mm_mask_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sd&expand=5236) - * [ ] [`_mm_mask_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ss&expand=5236) + * [x] [`_mm_mask_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sd&expand=5236) + * [x] [`_mm_mask_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ss&expand=5236) * [x] [`_mm_mask_cmp_round_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=5236) * [x] [`_mm_mask_cmp_round_ss_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_ss_mask&expand=5236) * [x] [`_mm_mask_cmp_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sd_mask&expand=5236) @@ -1322,8 +1322,8 @@ * [ ] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236) * [ ] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236) * [ ] [`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236) - * [ ] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236) - * [ ] [`_mm_maskz_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ss&expand=5236) + * [x] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236) + * [x] [`_mm_maskz_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ss&expand=5236) * [ ] [`_mm_maskz_cvt_roundsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_ss&expand=5236) * [ ] [`_mm_maskz_cvt_roundss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sd&expand=5236) * [ ] [`_mm_maskz_cvtsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_ss&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 2ebfcca5b9..7804a8ce44 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18417,6 +18417,76 @@ pub unsafe fn _mm512_set_pd( _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) } +/// Add the lower single-precision (32-bit) floating-point element in a and 
b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss))] +pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta + extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_ss&expand=160) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss))] +pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta + extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_sd&expand=155) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd))] +pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta + extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
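+///
+/// A minimal usage sketch, mirroring the unit test added later in this patch;
+/// only bit 0 of k is consulted (`assert_eq_m128d` is the crate's test helper):
+///
+/// ```ignore
+/// let a = _mm_set_pd(1., 2.);
+/// let b = _mm_set_pd(3., 4.);
+/// assert_eq_m128d(_mm_maskz_add_sd(0, a, b), _mm_set_pd(1., 0.));
+/// assert_eq_m128d(_mm_maskz_add_sd(0b11111111, a, b), _mm_set_pd(1., 6.));
+/// ```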
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_sd&expand=156) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd))] +pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta + extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + /// Equal pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; /// Less-than @@ -28002,4 +28072,54 @@ mod tests { let e = _mm512_set1_epi32(11); assert_eq_m512i(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_add_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_add_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_add_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_add_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_add_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_add_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_add_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_add_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } } diff --git a/examples/connect5_14_avx512.rs b/examples/connect5_14_avx512.rs new file mode 100644 index 0000000000..696adfc70c --- /dev/null +++ b/examples/connect5_14_avx512.rs @@ -0,0 +1,2100 @@ +#![feature(stdsimd, avx512_target_feature)] + +#[cfg(target_arch = "x86")] +use {core_arch::arch::x86::*}; +#[cfg(target_arch = "x86_64")] +use {core_arch::arch::x86_64::*}; + + +use rand::seq::SliceRandom; +use rand::thread_rng; +use rand::Rng; + +use std::cmp; + +use std::time::{Duration, Instant}; + +// types + +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum Color { + Black, + White, + Empty, +} + +type Square = i32; +type Move = i32; +type Side = Color; +type Piece = Color; + +// constants + +const FILE_SIZE: i32 = 15; +const RANK_SIZE: i32 = 15; +const SQUARE_SIZE: i32 = (FILE_SIZE + 4) * (FILE_SIZE + 4 * 2 ) + 4; + +const EVAL_INF: i32 = (FILE_SIZE * RANK_SIZE * 100); +const MOVE_NONE: Move = -1; +const SCORE_NONE: i32 = -EVAL_INF - 1; + +const ENDCHECK: [[i32; 4]; 20] = [ [-4, -3, -2, -1], + [-3, -2, -1, 1], + [-2, -1, 1, 2], + [-1, 1, 2, 3], + [ 1, 2, 3, 4], + + [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 4 * (-FILE_SIZE - 4)], + [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4)], + [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4)], + [1 * 
(-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4)], + [1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4), 4 * ( FILE_SIZE + 4)], + + [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 4 * (-FILE_SIZE - 5)], + [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5)], + [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5)], + [1 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5)], + [1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5), 4 * ( FILE_SIZE + 5)], + + [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 4 * (-FILE_SIZE - 3)], + [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3)], + [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3)], + [1 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3)], + [1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3), 4 * ( FILE_SIZE + 3)] + ]; + +const PATTERNFILE4: [i32; 7] = [0, 1, 2, 3, 4, 5, 6]; +const PATTERNRANK4: [i32; 7] = [0, 1 * (FILE_SIZE + 4), 2 * (FILE_SIZE + 4), 3 * (FILE_SIZE + 4), 4 * (FILE_SIZE + 4), 5 * (FILE_SIZE + 4), 6 * (FILE_SIZE + 4)]; +const PATTERNDIAL4: [i32; 7] = [0, 1 * (FILE_SIZE + 5), 2 * (FILE_SIZE + 5), 3 * (FILE_SIZE + 5), 4 * (FILE_SIZE + 5), 5 * (FILE_SIZE + 5), 6 * (FILE_SIZE + 5)]; +const PATTERNDIAR4: [i32; 7] = [0, 1 * (FILE_SIZE + 3), 2 * (FILE_SIZE + 3), 3 * (FILE_SIZE + 3), 4 * (FILE_SIZE + 3), 5 * (FILE_SIZE + 3), 6 * (FILE_SIZE + 3)]; + +const MAPMOVEVALUE: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 
1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, + 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, + 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, + 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, + 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, + 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, + 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, + 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, + 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, + 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, + 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, + 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, + 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, + 0, 0, 0, 0, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, + 0, 0, 0, 0, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 0, 0, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 0, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 0, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 
1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, + 0, 0, 0, 0, 0, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<20, 1<<20, 1<<20, + 0, 0, 0, 0, 0, 0, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<19, 1<<19, + 0, 0, 0, 0, 0, 0, 0, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<18, + 0, 0, 0, 0, 0, 0, 0, 0, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, + 0, 0, 0, 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<31, + 0, 0, 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 0, + 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 0, 0, + 0, 0, 0, 0, 1<<18, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 0, 0, 0, + 0, 0, 0, 0, 1<<17, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 0, 0, 0, 0] + ]; + +const MAPMOVEIDX: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, + 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, + 0, 0, 0, 0, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, + 0, 0, 0, 0, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 15, 14, 13, 6, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + 0, 0, 0, 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 0, 0, 0, 0, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, + 0, 0, 0, 0, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, + 0, 0, 0, 0, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, + 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, + 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, + 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9 + 0, 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 
0, 0, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 0, 0, 0, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, + 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, + 0, 0, 0, 0, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, + 0, 0, 0, 0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, 0] + ]; + +// variables + +static mut Endgame: bool = false; + +// structures + +pub struct Pos { // position + state: [Color; SQUARE_SIZE as usize], + p_turn: Side, + p_last: Move, + + bitboard: [[[i32; 20]; 4]; 3], + +} + +impl Pos { + + pub fn init(&mut self) { // starting position + for i in 0..SQUARE_SIZE as usize { + self.state[i] = Color::Empty; + } + + self.p_turn = Color::Black; + self.p_last = square_make(0, 0); + + //-------------------------------------------- + + for i in 0..4 { + for j in 0..20 { + self.bitboard[Color::Black as usize][i][j] = 0; + } + } + + for i in 0..4 { + for j in 0..20 { + self.bitboard[Color::White as usize][i][j] = 0; + } + } + + for i in 0..2 { + for j in 0..20 { + self.bitboard[Color::Empty as usize][i][j] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); + } + } + + self.bitboard[Color::Empty as usize][2][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); + self.bitboard[Color::Empty as usize][2][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); + self.bitboard[Color::Empty as usize][2][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); + self.bitboard[Color::Empty as usize][2][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); + self.bitboard[Color::Empty as usize][2][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); + self.bitboard[Color::Empty as usize][2][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); + self.bitboard[Color::Empty as usize][2][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); + self.bitboard[Color::Empty as usize][2][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); + self.bitboard[Color::Empty as usize][2][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19); + self.bitboard[Color::Empty as usize][2][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); + self.bitboard[Color::Empty as usize][2][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); + self.bitboard[Color::Empty as usize][2][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); + self.bitboard[Color::Empty as usize][2][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); + self.bitboard[Color::Empty as usize][2][13] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); + self.bitboard[Color::Empty as usize][2][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); + self.bitboard[Color::Empty as usize][2][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); + 
self.bitboard[Color::Empty as usize][2][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); + self.bitboard[Color::Empty as usize][2][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); + self.bitboard[Color::Empty as usize][2][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); + self.bitboard[Color::Empty as usize][2][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); + + self.bitboard[Color::Empty as usize][3][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); + self.bitboard[Color::Empty as usize][3][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); + self.bitboard[Color::Empty as usize][3][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); + self.bitboard[Color::Empty as usize][3][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); + self.bitboard[Color::Empty as usize][3][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); + self.bitboard[Color::Empty as usize][3][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); + self.bitboard[Color::Empty as usize][3][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); + self.bitboard[Color::Empty as usize][3][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); + self.bitboard[Color::Empty as usize][3][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19); + self.bitboard[Color::Empty as usize][3][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); + self.bitboard[Color::Empty as usize][3][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); + self.bitboard[Color::Empty as usize][3][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); + self.bitboard[Color::Empty as usize][3][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); + self.bitboard[Color::Empty as usize][3][13] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); + self.bitboard[Color::Empty as usize][3][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); + self.bitboard[Color::Empty as usize][3][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); + self.bitboard[Color::Empty as usize][3][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); + self.bitboard[Color::Empty as usize][3][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); + self.bitboard[Color::Empty as usize][3][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); + self.bitboard[Color::Empty as usize][3][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); + } + + pub fn do_move(&mut self, mv: Move) { + + let atk: Side = self.p_turn; + let def: Side = side_opp(atk); + + match self.p_turn { + Color::Black => { self.state[mv as usize] = Color::Black; + + for i in 0..4 { + self.bitboard[Color::Black as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= MAPMOVEVALUE[i][mv as usize]; + self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize]; + } + }, + + Color::White => { self.state[mv as usize] = Color::White; + + for i in 0..4 { + self.bitboard[Color::White as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= MAPMOVEVALUE[i][mv as 
usize]; + self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize]; + } + }, + + Color::Empty => {}, + } + + self.p_last = mv; + + self.p_turn = def; + } + + fn turn(&self) -> Side { + self.p_turn + } + + pub fn can_play(&self, from: Square) -> bool { + + if self.state[from as usize] == Color::Empty { true } else { false } + } + + pub fn count(&self, pc: Piece) -> i32 { + + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + if self.state[sq as usize] == pc { n += 1; } + } + } + n + } +} + +pub struct List { // legal move list + p_move: [Move; (FILE_SIZE * RANK_SIZE) as usize], + p_size: i32, +} + +impl List { + + pub fn clear(&mut self) { + self.p_size = 0; + } + + pub fn add(&mut self, mv: Move) { + self.p_move[self.p_size as usize] = mv; + self.p_size += 1; + } + + pub fn size(&self) -> i32 { + self.p_size + } + + pub fn shuffle(&mut self) { + + let mut rng = thread_rng(); + + let num = self.p_size; + + let mut new_move: Vec = vec![]; + + for x in 0..(num as usize) { + new_move.push(self.p_move[x]); + } + + new_move.shuffle(&mut rng); + + for x in 0..(self.p_size as usize) { + self.p_move[x] = new_move[x]; + } + } + + //pub fn move(&self, i: i32) -> Move { + // self.p_move[i as usize] + //} +} + +// functions +// +fn square_make(fl: i32, rk: i32) -> Square { + (rk + 4) * (FILE_SIZE + 4) + (fl + 4) +} + +fn square_file(sq: Square) -> i32 { + sq % (FILE_SIZE + 4) - 4 +} + +fn square_rank(sq: Square) -> i32 { + sq / (FILE_SIZE + 4) - 4 +} + +fn side_opp(sd: Side) -> Side { + + let mut out: Side; + + match sd { + Side::White => out = Side::Black, + Side::Black => out = Side::White, + Side::Empty => panic!(""), + } + + out +} + +fn pos_is_winner(pos : &Pos) -> bool { + + let current_side = side_opp(pos.p_turn); + + let mut found : bool = true; + + for x in 0..20 { + for y in 0..4 { + + found = true; + + let adj = pos.p_last + ENDCHECK[x][y]; + + if pos.state[adj as usize] != current_side { found = false; break } + } + if found == true { break; } + } + + found +} + +fn pos_is_winner_scan(pos : &Pos) -> bool { + + let current_side = side_opp(pos.p_turn); + + if check_patternfile5(&pos, current_side) || + check_patternrank5(&pos, current_side) || + check_patterndial5(&pos, current_side) || + check_patterndiar5(&pos, current_side) { return true } + + false +} + +fn pos_is_draw(pos : &Pos) -> bool { + + + let mut found : bool = true; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + + let sq: Square = square_make(fl, rk); + if pos.can_play(sq) { + found = false; + break; + } + + if found == false { break;} + } + } +/* + + let mut test: bool = false; + + if pos.bitboard[Color::Empty as usize][0][0] == 0 && + pos.bitboard[Color::Empty as usize][0][1] == 0 && + pos.bitboard[Color::Empty as usize][0][2] == 0 && + pos.bitboard[Color::Empty as usize][0][3] == 0 && + pos.bitboard[Color::Empty as usize][0][4] == 0 && + pos.bitboard[Color::Empty as usize][0][5] == 0 && + pos.bitboard[Color::Empty as usize][0][6] == 0 && + pos.bitboard[Color::Empty as usize][0][7] == 0 && + pos.bitboard[Color::Empty as usize][0][8] == 0 { test = true; } else { test = false; } +*/ + //if test != found { println!("bitboard!!!!!!!!!!!!!!!!!!!! 
pos_is_draw wrong!!!!!!!!!!!!!!"); } + + let mut out: bool = false; + + //if test && unsafe {!pos_is_winner_avx512(pos)} { out = true; } + if found == true && !pos_is_winner_scan(pos) { out = true; } + + out +} + +fn pos_is_end(pos : &Pos) -> bool { + + if pos_is_winner_scan(pos) || pos_is_draw(pos) { + true + } else { + false + } +} + +fn pos_disp(pos: &Pos) { + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + + let sq: Square = square_make(fl, rk); + + match pos.state[sq as usize] { + Color::Black => print!("# "), + Color::White => print!("O "), + Color::Empty => print!("- "), + } + } + + println!(""); + } + + match pos.turn() { + Color::Black => println!("black to play"), + Color::White => println!("white to play"), + _ => (), + } +} + +fn gen_moves(list : &mut List, pos: &Pos) { + + list.clear(); + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + if pos.can_play(sq) { list.add(sq); } + } + } + +} + +fn search(pos : &Pos, depth: i32, endgame: i32) -> Move { + + //println!("call search"); + + let mut new_depth = depth; + + let empties: i32 = pos.count(Color::Empty); + if (empties <= endgame || new_depth > empties ) { new_depth = empties; } + + if(new_depth == empties) { unsafe { Endgame = true; } } + + search_real(pos, -EVAL_INF, EVAL_INF, new_depth, 0) + +} + +fn search_real(pos: &Pos, alpha: i32, beta: i32, depth: i32, ply: i32) -> i32 { + + + assert!(-EVAL_INF <= alpha && alpha < beta && beta <= EVAL_INF); + //println!("call search_real"); + //println!("depth = {}", depth); + //println!("ply = {}", ply); + // leaf? + + //if unsafe { pos_is_winner_avx512(&pos) } != pos_is_winner(&pos) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!!"); } + if pos_is_winner_scan(&pos) { return -EVAL_INF + ply } + //if unsafe { pos_is_winner_avx512(&pos) } { return -EVAL_INF + ply } + + + if pos_is_draw(&pos) { return 0 } + + if depth == 0 { + return eval(&pos, ply) + } + + let p_move_new : [Move; (FILE_SIZE * RANK_SIZE) as usize] = [0; (FILE_SIZE * RANK_SIZE) as usize]; + + let mut list = List { + p_move: p_move_new, + p_size: 0, + }; + + let mut bm: Move = MOVE_NONE; + let mut bs: i32 = SCORE_NONE; + + gen_moves(&mut list, &pos); + + // move loop + + if ply == 0 { list.shuffle(); } + + for i in 0..list.size() { + + if bs < beta { + + let mv: Move = list.p_move[i as usize]; + + let mut new_pos = Pos { + state: pos.state, + p_turn: pos.p_turn, + p_last: pos.p_last, + + bitboard: pos.bitboard, + }; + + //println!("p_last = {}", new_pos.p_last); + + new_pos.do_move(mv); + + //println!("After do _move p_last = {}", new_pos.p_last); + + let sc: i32 = -search_real(&new_pos, -beta, -cmp::max(alpha, bs), depth - 1, ply + 1); + + + //if sc >= 410 || sc <= -410 { + //println!("sc = {} depth = {}-------------------------------", sc, depth); + + //pos_disp(&new_pos); + //} + + + if sc > bs { bm = mv; bs = sc; } + + } + } + + assert!(bm != MOVE_NONE); + assert!(bs >= -EVAL_INF && bs <= EVAL_INF); + + if ply == 0 { bm } else { bs } //best move at the root node, best score elsewhere + //bs +} + +fn result(pos: &Pos) -> i32 { + + if(pos_is_winner_scan(pos)) { + -(FILE_SIZE*RANK_SIZE*100) + } else { + 0 + } +} + + +fn eval(pos: &Pos, ply: i32) -> i32 { + + let atk: Side = pos.turn(); + let def: Side = side_opp(atk); + + //let mut sc: i32 = 0; + + let check_live4: Side = def; + let check_live4_opp: Side = atk; + + //if ply % 2 == 1 { check_live4 = def; check_live4_opp = atk; } else { check_live4 = atk; check_live4_opp = def; } + //if ply % 2 == 0 { check_live4 = 
def; check_live4_opp = atk; } else { check_live4 = atk; check_live4_opp = def; } +/* + if unsafe { check_pattern4_once_avx512(&pos, check_live4) } != (check_patternfile4_once(&pos, check_live4) || check_patternrank4_once(&pos, check_live4) || check_patterndial4_once(&pos, check_live4) || check_patterndiar4_once(&pos, check_live4) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! self "); pos_disp(&pos); } + if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } != (check_patternfile4_once(&pos, check_live4_opp) || check_patternrank4_once(&pos, check_live4_opp) || check_patterndial4_once(&pos, check_live4_opp) || check_patterndiar4_once(&pos, check_live4_opp) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! opp "); pos_disp(&pos); } + + #[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern4_dead_avx512(&pos, check_live4_opp); + + let mut temp_check: [__mmask16; 5] = [0; 5]; + + for i in 0..5 { + + let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]); + let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]); + let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]); + let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]); + let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]); + let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]); + let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]); + let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]); + let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]); + let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]); + + let check_mask10 = _kor_mask16(check_mask0, check_mask1); + let check_mask11 = _kor_mask16(check_mask2, check_mask3); + let check_mask12 = _kor_mask16(check_mask4, check_mask5); + let check_mask13 = _kor_mask16(check_mask6, check_mask7); + let check_mask14 = _kor_mask16(check_mask8, check_mask9); + + let check_mask16 = _kor_mask16(check_mask10, check_mask11); + let check_mask17 = _kor_mask16(check_mask12, check_mask13); + let check_mask18 = _kor_mask16(check_mask16, check_mask17); + temp_check[i] = _kor_mask16(check_mask18, check_mask14); + + } + + let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]); + let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]); + let check_mask2 = _kor_mask16(check_mask0, check_mask1); + let check_mask3 = _kor_mask16(check_mask2, temp_check[4]); + + let test1: bool = check_patternfile4_dead(&pos, check_live4_opp) || check_patternrank4_dead(&pos, check_live4_opp) || check_patterndial4_dead(&pos, check_live4_opp) || check_patterndiar4_dead(&pos, check_live4_opp); + + let mut test2: bool = true; + + if check_mask3 > 0 { test2 = true; } else { test2 = false; } + + if test1 != test2 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead !!!!!! opp "); pos_disp(&pos); } + } + + #[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern4_dead_avx512(&pos, check_live4); + + let mut count: i32 = 0; + + for i in 0..5 { + for j in 0..4 { + for k in 0..5 { + count += _popcnt32(result[i][j][k] as i32); + } + } + } + + let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4); + let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4); + let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4); + let c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4); + + + if (c4f+c4r+c4dl+c4dr) != count { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead_count !!!!!! 
opp org = {}, new = {}", c4f+c4r+c4dl+c4dr, count); pos_disp(&pos); } + } + + #[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern3_live_avx512(&pos, check_live4); + + let mut count: i32 = 0; + + for i in 0..3 { + for j in 0..4 { + for k in 0..5 { + count += _popcnt32(result[i][j][k] as i32); + } + } + } + + let c3f: i32 = check_patternfile3_live_n(&pos, check_live4); + let c3r: i32 = check_patternrank3_live_n(&pos, check_live4); + let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4); + let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4); + + let mut count1: i32 = 0; + + count1 = c3f+c3r+c3dl+c3dr; + + if count != count1 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! live3_dead !!!!!! self org = {}, new = {}", count1, count); pos_disp(&pos); } + } +*/ + + if check_patternfile4_once(&pos, check_live4) || + check_patternrank4_once(&pos, check_live4) || + check_patterndial4_once(&pos, check_live4) || + check_patterndiar4_once(&pos, check_live4) { return -4096 } + + //if unsafe { check_pattern4_once_avx512(&pos, check_live4) } { return -4096 } + + if check_patternfile4_once(&pos, check_live4_opp) || + check_patternrank4_once(&pos, check_live4_opp) || + check_patterndial4_once(&pos, check_live4_opp) || + check_patterndiar4_once(&pos, check_live4_opp) { return 2560 } + + //if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } { return 2560 } + + if check_patternfile4_dead(&pos, check_live4_opp) || + check_patternrank4_dead(&pos, check_live4_opp) || + check_patterndial4_dead(&pos, check_live4_opp) || + check_patterndiar4_dead(&pos, check_live4_opp) { return 2560 } + + /*#[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern4_dead_avx512(&pos, check_live4_opp); + + let mut temp_check: [__mmask16; 5] = [0; 5]; + + for i in 0..5 { + let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]); + let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]); + let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]); + let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]); + let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]); + let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]); + let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]); + let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]); + let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]); + let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]); + + let check_mask10 = _kor_mask16(check_mask0, check_mask1); + let check_mask11 = _kor_mask16(check_mask2, check_mask3); + let check_mask12 = _kor_mask16(check_mask4, check_mask5); + let check_mask13 = _kor_mask16(check_mask6, check_mask7); + let check_mask14 = _kor_mask16(check_mask8, check_mask9); + + let check_mask16 = _kor_mask16(check_mask10, check_mask11); + let check_mask17 = _kor_mask16(check_mask12, check_mask13); + let check_mask18 = _kor_mask16(check_mask16, check_mask17); + temp_check[i] = _kor_mask16(check_mask18, check_mask14); + } + + let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]); + let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]); + let check_mask2 = _kor_mask16(check_mask0, check_mask1); + let check_mask3 = _kor_mask16(check_mask2, temp_check[4]); + + if check_mask3 > 0 { return 2560 } + } +*/ + // 4,3 + let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4); + let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4); + let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4); + let 
c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4); + + let c3f: i32 = check_patternfile3_live_n(&pos, check_live4); + let c3r: i32 = check_patternrank3_live_n(&pos, check_live4); + let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4); + let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4); + + let n_c4: i32 = c4f + c4r + c4dl + c4dr; + + if n_c4 > 1 { return -2048 } + + if n_c4 == 1 && ( c3f+c3r+c3dl+c3dr > 0 ) { return -3048 } + +/* + #[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern4_dead_avx512(&pos, check_live4); + + let mut count4: i32 = 0; + + for i in 0..5 { + for j in 0..4 { + for k in 0..5 { + count4 += _popcnt32(result[i][j][k] as i32); + } + } + } + + if count4 > 1 { return -2048 } + else if count4 == 1 { + + let result = check_pattern3_live_avx512(&pos, check_live4); + + let mut count3: i32 = 0; + + for i in 0..3 { + for j in 0..4 { + for k in 0..5 { + count3 += _popcnt32(result[i][j][k] as i32); + } + } + } + + if count3 > 0 { return -3048 } + } + } + */ + //--------------------------------------------------------------------------- + + let c3f_opp = check_patternfile3_live_n(&pos, check_live4_opp); + let c3r_opp = check_patternrank3_live_n(&pos, check_live4_opp); + let c3dl_opp = check_patterndial3_live_n(&pos, check_live4_opp); + let c3dr_opp = check_patterndiar3_live_n(&pos, check_live4_opp); + if c3f_opp + c3r_opp + c3dl_opp + c3dr_opp > 1 { return 2560 } + + if c3f + c3r + c3dl + c3dr > 1 { return -2048 } + /* + #[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern3_live_avx512(&pos, check_live4_opp); + + let mut count: i32 = 0; + + for i in 0..3 { + for j in 0..4 { + for k in 0..5 { + count += _popcnt32(result[i][j][k] as i32); + } + } + } + + let c3f: i32 = check_patternfile3_live_n(&pos, check_live4_opp); + let c3r: i32 = check_patternrank3_live_n(&pos, check_live4_opp); + let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4_opp); + let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4_opp); + + let mut count1: i32 = 0; + + count1 = c3f+c3r+c3dl+c3dr; + + if count1 > 1 { return -2048 } + } +*/ + 0 +} + + +fn check_patternfile4_once(pos: &Pos, sd: Side) -> bool { + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + let idx5 = sq + PATTERNFILE4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patternrank4_once(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + let idx5 = sq + PATTERNRANK4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == 
sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patterndial4_once(pos: &Pos, sd : Side) -> bool { + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + let idx5 = sq + PATTERNDIAL4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patterndiar4_once(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 5) { + for fl in 5..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + let idx5 = sq + PATTERNDIAR4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patternfile4_dead(pos: &Pos, sd: Side) -> bool { + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patternrank4_dead(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } + if val0 == sd && val1 == Color::Empty && 
val2 == sd && val3 == sd && val4 == sd { return true } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patterndial4_dead(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patterndiar4_dead(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 4..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + + +fn check_patternfile4_dead_n(pos: &Pos, sd: Side) -> i32 { + + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + + n +} + +fn check_patternrank4_dead_n(pos: &Pos, sd: Side) -> i32 { + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + 
PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + + n +} + +fn check_patterndial4_dead_n(pos: &Pos, sd: Side) -> i32 { + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + + n +} + +fn check_patterndiar4_dead_n(pos: &Pos, sd: Side) -> i32 { + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 4..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + + n +} + + +/*fn check_patternfile3_live(pos: &Pos, sd: Side) -> bool { + + let last_move: Move = pos.p_last; + + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as 
usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + } + } + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + let idx5 = sq + PATTERNFILE4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patternrank3_live(pos: &Pos, sd: Side) -> bool { + + let last_move: Move = pos.p_last; + + // let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + let idx5 = sq + PATTERNRANK4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patterndial3_live(pos: &Pos, sd: Side) -> bool { + + let last_move: Move = pos.p_last; + + //let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + let idx5 = sq + 
PATTERNDIAL4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patterndiar3_live(pos: &Pos, sd: Side) -> bool { + + let last_move: Move = pos.p_last; + + //let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 4..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 5..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + let idx5 = sq + PATTERNDIAR4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} +*/ + +fn check_patternfile3_live_n(pos: &Pos, sd: Side) -> i32 { + + let last_move: Move = pos.p_last; + + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n +=1 ; } + } + } + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + let idx5 = sq + PATTERNFILE4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == 
Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + + n +} + +fn check_patternrank3_live_n(pos: &Pos, sd: Side) -> i32 { + + let last_move: Move = pos.p_last; + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + let idx5 = sq + PATTERNRANK4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + + n +} + +fn check_patterndial3_live_n(pos: &Pos, sd: Side) -> i32 { + + let last_move: Move = pos.p_last; + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + let idx5 = sq + PATTERNDIAL4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + + n +} + +fn check_patterndiar3_live_n(pos: &Pos, sd: Side) -> i32 { + + let last_move: Move = pos.p_last; + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 4..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + 
PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 5..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + let idx5 = sq + PATTERNDIAR4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + + n +} + +#[target_feature(enable = "avx512f")] +unsafe fn pos_is_winner_avx512(pos : &Pos) -> bool { + + let current_side = side_opp(pos.p_turn); + + let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) ); + + let answer_mask: __mmask16 = 0b01111111_11111111; + + let coloridx = current_side as usize; + + let mut temp_mask: [[__mmask16; 5]; 4] = [[0; 5]; 4]; + + for dir in 0..4 { + let board0 = _mm512_set_epi32(0, pos.bitboard[coloridx][dir][14], pos.bitboard[coloridx][dir][13], pos.bitboard[coloridx][dir][12], pos.bitboard[coloridx][dir][11], pos.bitboard[coloridx][dir][10], pos.bitboard[coloridx][dir][9], pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); + + let boardf = _mm512_and_epi32(answer, board0); + + temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + + for i in 1..5 { + + let board1 = _mm512_rol_epi32(board0, i); + + let boardf = _mm512_and_epi32(answer, board1); + + temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + } + } + + let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]); + let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]); + let check_mask2: __mmask16 = _kor_mask16(temp_mask[0][4], temp_mask[1][0]); + let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][1], temp_mask[1][2]); + let check_mask4: __mmask16 = _kor_mask16(temp_mask[1][3], temp_mask[1][4]); + let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]); + let check_mask6: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]); + let check_mask7: __mmask16 = _kor_mask16(temp_mask[2][4], temp_mask[3][0]); + let check_mask8: __mmask16 = _kor_mask16(temp_mask[3][1], temp_mask[3][2]); + let check_mask9: __mmask16 = _kor_mask16(temp_mask[3][3], temp_mask[3][4]); + + let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1); + let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3); + let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5); + let check_mask13: 
__mmask16 = _kor_mask16(check_mask6, check_mask7); + let check_mask14: __mmask16 = _kor_mask16(check_mask8, check_mask9); + + let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11); + let check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13); + let check_mask18: __mmask16 = _kor_mask16(check_mask16, check_mask17); + let check_mask19: __mmask16 = _kor_mask16(check_mask18, check_mask14); + + if check_mask19 > 0 { return true } else { return false } +} + +#[target_feature(enable = "avx512f")] +unsafe fn check_pattern4_once_avx512(pos : &Pos, sd: Side) -> bool { + + //let current_side = side_opp(sd); + + let answer_color = _mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) ); + let answer_empty = _mm512_set1_epi32( (1<<31)| (1<<26) ); + let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)); + + let answer_mask: __mmask16 = 0b00000001_11111111; + + //let coloridx = current_side as usize; + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + let mut temp_mask: [[__mmask16; 4]; 4] = [[0; 4]; 4]; + + for dir in 0..4 { + + let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); + + let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); + + let boardf1 = _mm512_and_epi32(answer_color, board0);// check sd + let boardf2 = _mm512_and_epi32(answer_empty, board1);// check empty + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + + for i in 1..4 { //only move 3 times + + let board2 = _mm512_rol_epi32(board0, i);//rot sd + let board3 = _mm512_rol_epi32(board1, i);//rot empty + + let boardf1 = _mm512_and_epi32(answer_color, board2); + let boardf2 = _mm512_and_epi32(answer_empty, board3); + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + } + } + + let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]); + let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]); + let check_mask2: __mmask16 = _kor_mask16(temp_mask[1][0], temp_mask[1][1]); + let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][2], temp_mask[1][3]); + let check_mask4: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]); + let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]); + let check_mask6: __mmask16 = _kor_mask16(temp_mask[3][0], temp_mask[3][1]); + let check_mask7: __mmask16 = _kor_mask16(temp_mask[3][2], temp_mask[3][3]); + + let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1); + let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3); + let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5); + let check_mask13: __mmask16 = _kor_mask16(check_mask6, check_mask7); + + let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11); + let 
check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13); + let check_mask19: __mmask16 = _kor_mask16(check_mask16, check_mask17); + + if check_mask19 > 0 { return true } else { return false } +} + +#[target_feature(enable = "avx512f")] +unsafe fn check_pattern4_dead_avx512(pos : &Pos, sd: Side) -> [[[__mmask16; 5]; 4]; 5] { + + //let current_side = side_opp(sd); + + let answer_color: [__m512i; 5] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) ), + _mm512_set1_epi32( (1<<31)| (1<<29)|(1<<28)|(1<<27) ), + _mm512_set1_epi32( (1<<31)|(1<<30) |(1<<28)|(1<<27) ), + _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29) |(1<<27) ), + _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28) )]; + + let answer_empty: [__m512i; 5]= [_mm512_set1_epi32( (1<<31) ), + _mm512_set1_epi32( (1<<30) ), + _mm512_set1_epi32( (1<<29) ), + _mm512_set1_epi32( (1<<28) ), + _mm512_set1_epi32( (1<<27) )]; + + let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)); + + let answer_mask: __mmask16 = 0b00000001_11111111; + + //let coloridx = current_side as usize; + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + let mut temp_mask: [[[__mmask16; 5]; 4]; 5] = [[[0; 5]; 4]; 5]; + + for pattern in 0..5 { + + for dir in 0..4 { + + let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); + + let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); + + let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd + let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + + for i in 1..5 { //only move 4 times + + let board2 = _mm512_rol_epi32(board0, i);//rot sd + let board3 = _mm512_rol_epi32(board1, i);//rot empty + + let boardf1 = _mm512_and_epi32(answer_color[pattern], board2); + let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3); + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + } + } + } + + temp_mask +} + + +#[target_feature(enable = "avx512f")] +unsafe fn check_pattern3_live_avx512(pos : &Pos, sd: Side) -> [[[__mmask16; 5]; 4]; 3] { + + //let current_side = side_opp(sd); + + let answer_color: [__m512i; 3] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28) ), + _mm512_set1_epi32( (1<<30)| (1<<28)|(1<<27) ), + _mm512_set1_epi32( (1<<30)|(1<<29) |(1<<27) )]; + + let answer_empty: [__m512i; 3]= [_mm512_set1_epi32( (1<<31)| (1<<27) ), + _mm512_set1_epi32( (1<<31)| (1<<29)| (1<<26) ), + _mm512_set1_epi32( (1<<31)| (1<<28)| (1<<26) )]; + + //let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)); + let answer: [__m512i; 3] = [_mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) ), + _mm512_set1_epi32( 
(1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) ), + _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) )]; + + let answer_mask: __mmask16 = 0b00000001_11111111; + + //let coloridx = current_side as usize; + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + let mut temp_mask: [[[__mmask16; 5]; 4]; 3] = [[[0; 5]; 4]; 3]; + + for pattern in 0..3 { + + for dir in 0..4 { + + let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); + + let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); + + let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd + let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0);//no need answer_mask, because and above + + for i in 1..5 { //only move 4 times + + let board2 = _mm512_rol_epi32(board0, i);//rot sd + let board3 = _mm512_rol_epi32(board1, i);//rot empty + + let boardf1 = _mm512_and_epi32(answer_color[pattern], board2); + let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3); + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0);//no need answer_mask, because and above + } + } + } + + temp_mask +} + +fn check_patternfile5(pos: &Pos, sd: Side) -> bool { + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patternrank5(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patterndial5(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + 
let idx4 = sq + PATTERNDIAL4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patterndiar5(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 4..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn main() { + + loop + { + + let start = Instant::now(); + + println!("Hello, this is connect 6!"); + + //unsafe { test_avx512(); } + + let test_state: [Color; SQUARE_SIZE as usize] = [Color::Empty; SQUARE_SIZE as usize]; + + let test_bitboard: [[[i32; FILE_SIZE as usize]; 4]; 3] = [[[0; FILE_SIZE as usize]; 4]; 3]; + + let mut test1 = Pos { + state: test_state, + p_turn: Color::Black, + p_last: square_make(5,5), + + bitboard: test_bitboard, + }; + + test1.init(); + + //pos_disp(&test1); + + for i in 0..(FILE_SIZE*RANK_SIZE) { + + // println!("----------------------------------------\n\n\n\n"); + // println!("MOVE {}!!!!\n\n\n\n", i); + + + let mut d = 2; + let mut e = 4; + + //if i < 6 { d = 1; e = 2; } + + let next_move: Move = search(&test1, d, e); + //println!("next move is {}", next_move); + //println!("file is {}", square_file(next_move)); + //println!("rank is {}", square_rank(next_move)); + + test1.do_move(next_move); + + //pos_disp(&test1); + + if pos_is_end(&test1) { + + println!("Game over!!!!!!"); + println!("MOVE {}!!!!\n", i); + //pos_disp(&test1); + + break; } + } + + + let duration = start.elapsed(); + + println!("Time elapsed in expensive_function() is: {:?}", duration); + } + + +} From b4023d1bdb6fd4e0a3c1c5cf282724567953c14e Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 13:32:07 +0000 Subject: [PATCH 15/25] mask_add: ss,sd --- examples/connect5_14_avx512.rs | 2100 -------------------------------- 1 file changed, 2100 deletions(-) delete mode 100644 examples/connect5_14_avx512.rs diff --git a/examples/connect5_14_avx512.rs b/examples/connect5_14_avx512.rs deleted file mode 100644 index 696adfc70c..0000000000 --- a/examples/connect5_14_avx512.rs +++ /dev/null @@ -1,2100 +0,0 @@ -#![feature(stdsimd, avx512_target_feature)] - -#[cfg(target_arch = "x86")] -use {core_arch::arch::x86::*}; -#[cfg(target_arch = "x86_64")] -use {core_arch::arch::x86_64::*}; - - -use rand::seq::SliceRandom; -use rand::thread_rng; -use rand::Rng; - -use std::cmp; - -use std::time::{Duration, Instant}; - -// types - -#[derive(Clone, Copy, PartialEq, Eq)] -pub enum Color { - Black, - White, - Empty, -} - -type Square = i32; -type Move = i32; -type Side = Color; -type Piece = Color; - -// constants - -const FILE_SIZE: i32 = 15; -const RANK_SIZE: i32 = 15; -const SQUARE_SIZE: i32 = (FILE_SIZE + 4) * (FILE_SIZE + 4 * 2 ) + 4; - -const EVAL_INF: i32 = (FILE_SIZE * RANK_SIZE * 100); -const MOVE_NONE: Move = -1; -const SCORE_NONE: i32 = -EVAL_INF - 1; - -const ENDCHECK: [[i32; 4]; 20] = [ [-4, -3, -2, -1], - [-3, -2, -1, 
1], - [-2, -1, 1, 2], - [-1, 1, 2, 3], - [ 1, 2, 3, 4], - - [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 4 * (-FILE_SIZE - 4)], - [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4)], - [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4)], - [1 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4)], - [1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4), 4 * ( FILE_SIZE + 4)], - - [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 4 * (-FILE_SIZE - 5)], - [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5)], - [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5)], - [1 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5)], - [1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5), 4 * ( FILE_SIZE + 5)], - - [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 4 * (-FILE_SIZE - 3)], - [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3)], - [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3)], - [1 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3)], - [1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3), 4 * ( FILE_SIZE + 3)] - ]; - -const PATTERNFILE4: [i32; 7] = [0, 1, 2, 3, 4, 5, 6]; -const PATTERNRANK4: [i32; 7] = [0, 1 * (FILE_SIZE + 4), 2 * (FILE_SIZE + 4), 3 * (FILE_SIZE + 4), 4 * (FILE_SIZE + 4), 5 * (FILE_SIZE + 4), 6 * (FILE_SIZE + 4)]; -const PATTERNDIAL4: [i32; 7] = [0, 1 * (FILE_SIZE + 5), 2 * (FILE_SIZE + 5), 3 * (FILE_SIZE + 5), 4 * (FILE_SIZE + 5), 5 * (FILE_SIZE + 5), 6 * (FILE_SIZE + 5)]; -const PATTERNDIAR4: [i32; 7] = [0, 1 * (FILE_SIZE + 3), 2 * (FILE_SIZE + 3), 3 * (FILE_SIZE + 3), 4 * (FILE_SIZE + 3), 5 * (FILE_SIZE + 3), 6 * (FILE_SIZE + 3)]; - -const MAPMOVEVALUE: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 
1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, - 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, - 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, - 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, - 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, - 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, - 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, - 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, - 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, - 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, - 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, - 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, - 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, - 0, 0, 0, 0, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, - 0, 0, 0, 0, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 0, 0, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 0, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 0, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<26, 1<<26, 
1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, - 0, 0, 0, 0, 0, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<20, 1<<20, 1<<20, - 0, 0, 0, 0, 0, 0, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<19, 1<<19, - 0, 0, 0, 0, 0, 0, 0, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<18, - 0, 0, 0, 0, 0, 0, 0, 0, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, - 0, 0, 0, 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<31, - 0, 0, 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 0, - 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 0, 0, - 0, 0, 0, 0, 1<<18, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 0, 0, 0, - 0, 0, 0, 0, 1<<17, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 0, 0, 0, 0] - ]; - -const MAPMOVEIDX: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 3, - 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, - 0, 0, 0, 0, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, - 0, 0, 0, 0, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - 0, 0, 0, 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, - 0, 0, 0, 0, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, - 0, 0, 0, 0, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, - 0, 0, 0, 0, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, - 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, - 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, - 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, - 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, - 0, 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 0, 0, 0, 0, 0, 0, 0, 0, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 0, 0, 0, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, - 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 0, 0, 0, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, - 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, - 0, 0, 0, 0, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, - 0, 0, 0, 0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, 0] - ]; - -// variables - -static mut Endgame: bool = false; - -// structures - -pub struct Pos { // position - state: [Color; SQUARE_SIZE as usize], - p_turn: Side, - p_last: Move, - - bitboard: [[[i32; 20]; 4]; 3], - -} - -impl Pos { - - pub fn init(&mut self) { // starting position - for i in 0..SQUARE_SIZE as usize { - self.state[i] = Color::Empty; - } - - self.p_turn = Color::Black; - self.p_last = square_make(0, 0); - - //-------------------------------------------- - - for i in 0..4 { - for j in 0..20 { - self.bitboard[Color::Black as usize][i][j] = 0; - } - } - - for i in 0..4 { - for j in 0..20 { - self.bitboard[Color::White as usize][i][j] = 0; - } - } - - for i in 0..2 { - for j in 0..20 { - self.bitboard[Color::Empty as usize][i][j] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); - } - } - - self.bitboard[Color::Empty as usize][2][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); - self.bitboard[Color::Empty as usize][2][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); - self.bitboard[Color::Empty as usize][2][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); - self.bitboard[Color::Empty as usize][2][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); - self.bitboard[Color::Empty as usize][2][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); - self.bitboard[Color::Empty as usize][2][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); - self.bitboard[Color::Empty as usize][2][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); - self.bitboard[Color::Empty as usize][2][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); - self.bitboard[Color::Empty as usize][2][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19); - self.bitboard[Color::Empty as usize][2][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); - self.bitboard[Color::Empty as usize][2][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); - self.bitboard[Color::Empty as usize][2][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); - self.bitboard[Color::Empty as usize][2][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); - self.bitboard[Color::Empty as usize][2][13] = 
(1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); - self.bitboard[Color::Empty as usize][2][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); - self.bitboard[Color::Empty as usize][2][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); - self.bitboard[Color::Empty as usize][2][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); - self.bitboard[Color::Empty as usize][2][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); - self.bitboard[Color::Empty as usize][2][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); - self.bitboard[Color::Empty as usize][2][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); - - self.bitboard[Color::Empty as usize][3][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); - self.bitboard[Color::Empty as usize][3][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); - self.bitboard[Color::Empty as usize][3][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); - self.bitboard[Color::Empty as usize][3][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); - self.bitboard[Color::Empty as usize][3][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); - self.bitboard[Color::Empty as usize][3][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); - self.bitboard[Color::Empty as usize][3][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); - self.bitboard[Color::Empty as usize][3][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); - self.bitboard[Color::Empty as usize][3][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19); - self.bitboard[Color::Empty as usize][3][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); - self.bitboard[Color::Empty as usize][3][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); - self.bitboard[Color::Empty as usize][3][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); - self.bitboard[Color::Empty as usize][3][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); - self.bitboard[Color::Empty as usize][3][13] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); - self.bitboard[Color::Empty as usize][3][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); - self.bitboard[Color::Empty as usize][3][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); - self.bitboard[Color::Empty as usize][3][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); - self.bitboard[Color::Empty as usize][3][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); - self.bitboard[Color::Empty as usize][3][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); - self.bitboard[Color::Empty as usize][3][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); - } - - pub fn do_move(&mut self, mv: Move) { - - let atk: Side = self.p_turn; - let def: Side = side_opp(atk); - - match self.p_turn { - Color::Black => { self.state[mv as usize] = Color::Black; - - for i in 0..4 { - self.bitboard[Color::Black as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= 
MAPMOVEVALUE[i][mv as usize]; - self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize]; - } - }, - - Color::White => { self.state[mv as usize] = Color::White; - - for i in 0..4 { - self.bitboard[Color::White as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= MAPMOVEVALUE[i][mv as usize]; - self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize]; - } - }, - - Color::Empty => {}, - } - - self.p_last = mv; - - self.p_turn = def; - } - - fn turn(&self) -> Side { - self.p_turn - } - - pub fn can_play(&self, from: Square) -> bool { - - if self.state[from as usize] == Color::Empty { true } else { false } - } - - pub fn count(&self, pc: Piece) -> i32 { - - let mut n: i32 = 0; - - for rk in 0..RANK_SIZE { - for fl in 0..FILE_SIZE { - let sq: Square = square_make(fl, rk); - if self.state[sq as usize] == pc { n += 1; } - } - } - n - } -} - -pub struct List { // legal move list - p_move: [Move; (FILE_SIZE * RANK_SIZE) as usize], - p_size: i32, -} - -impl List { - - pub fn clear(&mut self) { - self.p_size = 0; - } - - pub fn add(&mut self, mv: Move) { - self.p_move[self.p_size as usize] = mv; - self.p_size += 1; - } - - pub fn size(&self) -> i32 { - self.p_size - } - - pub fn shuffle(&mut self) { - - let mut rng = thread_rng(); - - let num = self.p_size; - - let mut new_move: Vec = vec![]; - - for x in 0..(num as usize) { - new_move.push(self.p_move[x]); - } - - new_move.shuffle(&mut rng); - - for x in 0..(self.p_size as usize) { - self.p_move[x] = new_move[x]; - } - } - - //pub fn move(&self, i: i32) -> Move { - // self.p_move[i as usize] - //} -} - -// functions -// -fn square_make(fl: i32, rk: i32) -> Square { - (rk + 4) * (FILE_SIZE + 4) + (fl + 4) -} - -fn square_file(sq: Square) -> i32 { - sq % (FILE_SIZE + 4) - 4 -} - -fn square_rank(sq: Square) -> i32 { - sq / (FILE_SIZE + 4) - 4 -} - -fn side_opp(sd: Side) -> Side { - - let mut out: Side; - - match sd { - Side::White => out = Side::Black, - Side::Black => out = Side::White, - Side::Empty => panic!(""), - } - - out -} - -fn pos_is_winner(pos : &Pos) -> bool { - - let current_side = side_opp(pos.p_turn); - - let mut found : bool = true; - - for x in 0..20 { - for y in 0..4 { - - found = true; - - let adj = pos.p_last + ENDCHECK[x][y]; - - if pos.state[adj as usize] != current_side { found = false; break } - } - if found == true { break; } - } - - found -} - -fn pos_is_winner_scan(pos : &Pos) -> bool { - - let current_side = side_opp(pos.p_turn); - - if check_patternfile5(&pos, current_side) || - check_patternrank5(&pos, current_side) || - check_patterndial5(&pos, current_side) || - check_patterndiar5(&pos, current_side) { return true } - - false -} - -fn pos_is_draw(pos : &Pos) -> bool { - - - let mut found : bool = true; - - for rk in 0..RANK_SIZE { - for fl in 0..FILE_SIZE { - - let sq: Square = square_make(fl, rk); - if pos.can_play(sq) { - found = false; - break; - } - - if found == false { break;} - } - } -/* - - let mut test: bool = false; - - if pos.bitboard[Color::Empty as usize][0][0] == 0 && - pos.bitboard[Color::Empty as usize][0][1] == 0 && - pos.bitboard[Color::Empty as usize][0][2] == 0 && - pos.bitboard[Color::Empty as usize][0][3] == 0 && - pos.bitboard[Color::Empty as usize][0][4] == 0 && - pos.bitboard[Color::Empty as usize][0][5] == 0 && - pos.bitboard[Color::Empty as usize][0][6] == 0 && - pos.bitboard[Color::Empty as usize][0][7] == 0 && - pos.bitboard[Color::Empty as usize][0][8] == 0 { test = true; } else { 
test = false; } -*/ - //if test != found { println!("bitboard!!!!!!!!!!!!!!!!!!!! pos_is_draw wrong!!!!!!!!!!!!!!"); } - - let mut out: bool = false; - - //if test && unsafe {!pos_is_winner_avx512(pos)} { out = true; } - if found == true && !pos_is_winner_scan(pos) { out = true; } - - out -} - -fn pos_is_end(pos : &Pos) -> bool { - - if pos_is_winner_scan(pos) || pos_is_draw(pos) { - true - } else { - false - } -} - -fn pos_disp(pos: &Pos) { - - for rk in 0..RANK_SIZE { - for fl in 0..FILE_SIZE { - - let sq: Square = square_make(fl, rk); - - match pos.state[sq as usize] { - Color::Black => print!("# "), - Color::White => print!("O "), - Color::Empty => print!("- "), - } - } - - println!(""); - } - - match pos.turn() { - Color::Black => println!("black to play"), - Color::White => println!("white to play"), - _ => (), - } -} - -fn gen_moves(list : &mut List, pos: &Pos) { - - list.clear(); - - for rk in 0..RANK_SIZE { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - if pos.can_play(sq) { list.add(sq); } - } - } - -} - -fn search(pos : &Pos, depth: i32, endgame: i32) -> Move { - - //println!("call search"); - - let mut new_depth = depth; - - let empties: i32 = pos.count(Color::Empty); - if (empties <= endgame || new_depth > empties ) { new_depth = empties; } - - if(new_depth == empties) { unsafe { Endgame = true; } } - - search_real(pos, -EVAL_INF, EVAL_INF, new_depth, 0) - -} - -fn search_real(pos: &Pos, alpha: i32, beta: i32, depth: i32, ply: i32) -> i32 { - - - assert!(-EVAL_INF <= alpha && alpha < beta && beta <= EVAL_INF); - //println!("call search_real"); - //println!("depth = {}", depth); - //println!("ply = {}", ply); - // leaf? - - //if unsafe { pos_is_winner_avx512(&pos) } != pos_is_winner(&pos) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!!"); } - if pos_is_winner_scan(&pos) { return -EVAL_INF + ply } - //if unsafe { pos_is_winner_avx512(&pos) } { return -EVAL_INF + ply } - - - if pos_is_draw(&pos) { return 0 } - - if depth == 0 { - return eval(&pos, ply) - } - - let p_move_new : [Move; (FILE_SIZE * RANK_SIZE) as usize] = [0; (FILE_SIZE * RANK_SIZE) as usize]; - - let mut list = List { - p_move: p_move_new, - p_size: 0, - }; - - let mut bm: Move = MOVE_NONE; - let mut bs: i32 = SCORE_NONE; - - gen_moves(&mut list, &pos); - - // move loop - - if ply == 0 { list.shuffle(); } - - for i in 0..list.size() { - - if bs < beta { - - let mv: Move = list.p_move[i as usize]; - - let mut new_pos = Pos { - state: pos.state, - p_turn: pos.p_turn, - p_last: pos.p_last, - - bitboard: pos.bitboard, - }; - - //println!("p_last = {}", new_pos.p_last); - - new_pos.do_move(mv); - - //println!("After do _move p_last = {}", new_pos.p_last); - - let sc: i32 = -search_real(&new_pos, -beta, -cmp::max(alpha, bs), depth - 1, ply + 1); - - - //if sc >= 410 || sc <= -410 { - //println!("sc = {} depth = {}-------------------------------", sc, depth); - - //pos_disp(&new_pos); - //} - - - if sc > bs { bm = mv; bs = sc; } - - } - } - - assert!(bm != MOVE_NONE); - assert!(bs >= -EVAL_INF && bs <= EVAL_INF); - - if ply == 0 { bm } else { bs } //best move at the root node, best score elsewhere - //bs -} - -fn result(pos: &Pos) -> i32 { - - if(pos_is_winner_scan(pos)) { - -(FILE_SIZE*RANK_SIZE*100) - } else { - 0 - } -} - - -fn eval(pos: &Pos, ply: i32) -> i32 { - - let atk: Side = pos.turn(); - let def: Side = side_opp(atk); - - //let mut sc: i32 = 0; - - let check_live4: Side = def; - let check_live4_opp: Side = atk; - - //if ply % 2 == 1 { check_live4 = def; check_live4_opp = atk; } else { 
check_live4 = atk; check_live4_opp = def; } - //if ply % 2 == 0 { check_live4 = def; check_live4_opp = atk; } else { check_live4 = atk; check_live4_opp = def; } -/* - if unsafe { check_pattern4_once_avx512(&pos, check_live4) } != (check_patternfile4_once(&pos, check_live4) || check_patternrank4_once(&pos, check_live4) || check_patterndial4_once(&pos, check_live4) || check_patterndiar4_once(&pos, check_live4) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! self "); pos_disp(&pos); } - if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } != (check_patternfile4_once(&pos, check_live4_opp) || check_patternrank4_once(&pos, check_live4_opp) || check_patterndial4_once(&pos, check_live4_opp) || check_patterndiar4_once(&pos, check_live4_opp) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! opp "); pos_disp(&pos); } - - #[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern4_dead_avx512(&pos, check_live4_opp); - - let mut temp_check: [__mmask16; 5] = [0; 5]; - - for i in 0..5 { - - let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]); - let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]); - let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]); - let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]); - let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]); - let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]); - let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]); - let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]); - let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]); - let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]); - - let check_mask10 = _kor_mask16(check_mask0, check_mask1); - let check_mask11 = _kor_mask16(check_mask2, check_mask3); - let check_mask12 = _kor_mask16(check_mask4, check_mask5); - let check_mask13 = _kor_mask16(check_mask6, check_mask7); - let check_mask14 = _kor_mask16(check_mask8, check_mask9); - - let check_mask16 = _kor_mask16(check_mask10, check_mask11); - let check_mask17 = _kor_mask16(check_mask12, check_mask13); - let check_mask18 = _kor_mask16(check_mask16, check_mask17); - temp_check[i] = _kor_mask16(check_mask18, check_mask14); - - } - - let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]); - let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]); - let check_mask2 = _kor_mask16(check_mask0, check_mask1); - let check_mask3 = _kor_mask16(check_mask2, temp_check[4]); - - let test1: bool = check_patternfile4_dead(&pos, check_live4_opp) || check_patternrank4_dead(&pos, check_live4_opp) || check_patterndial4_dead(&pos, check_live4_opp) || check_patterndiar4_dead(&pos, check_live4_opp); - - let mut test2: bool = true; - - if check_mask3 > 0 { test2 = true; } else { test2 = false; } - - if test1 != test2 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead !!!!!! 
opp "); pos_disp(&pos); } - } - - #[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern4_dead_avx512(&pos, check_live4); - - let mut count: i32 = 0; - - for i in 0..5 { - for j in 0..4 { - for k in 0..5 { - count += _popcnt32(result[i][j][k] as i32); - } - } - } - - let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4); - let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4); - let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4); - let c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4); - - - if (c4f+c4r+c4dl+c4dr) != count { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead_count !!!!!! opp org = {}, new = {}", c4f+c4r+c4dl+c4dr, count); pos_disp(&pos); } - } - - #[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern3_live_avx512(&pos, check_live4); - - let mut count: i32 = 0; - - for i in 0..3 { - for j in 0..4 { - for k in 0..5 { - count += _popcnt32(result[i][j][k] as i32); - } - } - } - - let c3f: i32 = check_patternfile3_live_n(&pos, check_live4); - let c3r: i32 = check_patternrank3_live_n(&pos, check_live4); - let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4); - let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4); - - let mut count1: i32 = 0; - - count1 = c3f+c3r+c3dl+c3dr; - - if count != count1 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! live3_dead !!!!!! self org = {}, new = {}", count1, count); pos_disp(&pos); } - } -*/ - - if check_patternfile4_once(&pos, check_live4) || - check_patternrank4_once(&pos, check_live4) || - check_patterndial4_once(&pos, check_live4) || - check_patterndiar4_once(&pos, check_live4) { return -4096 } - - //if unsafe { check_pattern4_once_avx512(&pos, check_live4) } { return -4096 } - - if check_patternfile4_once(&pos, check_live4_opp) || - check_patternrank4_once(&pos, check_live4_opp) || - check_patterndial4_once(&pos, check_live4_opp) || - check_patterndiar4_once(&pos, check_live4_opp) { return 2560 } - - //if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } { return 2560 } - - if check_patternfile4_dead(&pos, check_live4_opp) || - check_patternrank4_dead(&pos, check_live4_opp) || - check_patterndial4_dead(&pos, check_live4_opp) || - check_patterndiar4_dead(&pos, check_live4_opp) { return 2560 } - - /*#[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern4_dead_avx512(&pos, check_live4_opp); - - let mut temp_check: [__mmask16; 5] = [0; 5]; - - for i in 0..5 { - let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]); - let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]); - let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]); - let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]); - let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]); - let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]); - let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]); - let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]); - let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]); - let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]); - - let check_mask10 = _kor_mask16(check_mask0, check_mask1); - let check_mask11 = _kor_mask16(check_mask2, check_mask3); - let check_mask12 = _kor_mask16(check_mask4, check_mask5); - let check_mask13 = _kor_mask16(check_mask6, check_mask7); - let check_mask14 = _kor_mask16(check_mask8, check_mask9); - - let check_mask16 = _kor_mask16(check_mask10, check_mask11); - let check_mask17 = 
_kor_mask16(check_mask12, check_mask13); - let check_mask18 = _kor_mask16(check_mask16, check_mask17); - temp_check[i] = _kor_mask16(check_mask18, check_mask14); - } - - let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]); - let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]); - let check_mask2 = _kor_mask16(check_mask0, check_mask1); - let check_mask3 = _kor_mask16(check_mask2, temp_check[4]); - - if check_mask3 > 0 { return 2560 } - } -*/ - // 4,3 - let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4); - let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4); - let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4); - let c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4); - - let c3f: i32 = check_patternfile3_live_n(&pos, check_live4); - let c3r: i32 = check_patternrank3_live_n(&pos, check_live4); - let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4); - let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4); - - let n_c4: i32 = c4f + c4r + c4dl + c4dr; - - if n_c4 > 1 { return -2048 } - - if n_c4 == 1 && ( c3f+c3r+c3dl+c3dr > 0 ) { return -3048 } - -/* - #[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern4_dead_avx512(&pos, check_live4); - - let mut count4: i32 = 0; - - for i in 0..5 { - for j in 0..4 { - for k in 0..5 { - count4 += _popcnt32(result[i][j][k] as i32); - } - } - } - - if count4 > 1 { return -2048 } - else if count4 == 1 { - - let result = check_pattern3_live_avx512(&pos, check_live4); - - let mut count3: i32 = 0; - - for i in 0..3 { - for j in 0..4 { - for k in 0..5 { - count3 += _popcnt32(result[i][j][k] as i32); - } - } - } - - if count3 > 0 { return -3048 } - } - } - */ - //--------------------------------------------------------------------------- - - let c3f_opp = check_patternfile3_live_n(&pos, check_live4_opp); - let c3r_opp = check_patternrank3_live_n(&pos, check_live4_opp); - let c3dl_opp = check_patterndial3_live_n(&pos, check_live4_opp); - let c3dr_opp = check_patterndiar3_live_n(&pos, check_live4_opp); - if c3f_opp + c3r_opp + c3dl_opp + c3dr_opp > 1 { return 2560 } - - if c3f + c3r + c3dl + c3dr > 1 { return -2048 } - /* - #[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern3_live_avx512(&pos, check_live4_opp); - - let mut count: i32 = 0; - - for i in 0..3 { - for j in 0..4 { - for k in 0..5 { - count += _popcnt32(result[i][j][k] as i32); - } - } - } - - let c3f: i32 = check_patternfile3_live_n(&pos, check_live4_opp); - let c3r: i32 = check_patternrank3_live_n(&pos, check_live4_opp); - let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4_opp); - let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4_opp); - - let mut count1: i32 = 0; - - count1 = c3f+c3r+c3dl+c3dr; - - if count1 > 1 { return -2048 } - } -*/ - 0 -} - - -fn check_patternfile4_once(pos: &Pos, sd: Side) -> bool { - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 5) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - let idx5 = sq + PATTERNFILE4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - 
} - - false -} - -fn check_patternrank4_once(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - let idx5 = sq + PATTERNRANK4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patterndial4_once(pos: &Pos, sd : Side) -> bool { - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..(FILE_SIZE - 5) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - let idx5 = sq + PATTERNDIAL4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patterndiar4_once(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 5) { - for fl in 5..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - let idx5 = sq + PATTERNDIAR4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patternfile4_dead(pos: &Pos, sd: Side) -> bool { - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patternrank4_dead(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let 
idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patterndial4_dead(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patterndiar4_dead(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 4..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - - -fn check_patternfile4_dead_n(pos: &Pos, sd: Side) -> i32 { - - let mut n: i32 = 0; - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && 
val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } - } - } - - n -} - -fn check_patternrank4_dead_n(pos: &Pos, sd: Side) -> i32 { - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } - } - } - - n -} - -fn check_patterndial4_dead_n(pos: &Pos, sd: Side) -> i32 { - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } - } - } - - n -} - -fn check_patterndiar4_dead_n(pos: &Pos, sd: Side) -> i32 { - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 4..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 
== sd && val3 == sd && val4 == sd { n += 1; } - } - } - - n -} - - -/*fn check_patternfile3_live(pos: &Pos, sd: Side) -> bool { - - let last_move: Move = pos.p_last; - - let mut n: i32 = 0; - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - } - } - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 5) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - let idx5 = sq + PATTERNFILE4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patternrank3_live(pos: &Pos, sd: Side) -> bool { - - let last_move: Move = pos.p_last; - - // let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - let idx5 = sq + PATTERNRANK4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patterndial3_live(pos: &Pos, sd: Side) -> bool { - - let last_move: Move = pos.p_last; - - //let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + 
PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..(FILE_SIZE - 5) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - let idx5 = sq + PATTERNDIAL4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patterndiar3_live(pos: &Pos, sd: Side) -> bool { - - let last_move: Move = pos.p_last; - - //let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 4..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 5..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - let idx5 = sq + PATTERNDIAR4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} -*/ - -fn check_patternfile3_live_n(pos: &Pos, sd: Side) -> i32 { - - let last_move: Move = pos.p_last; - - let mut n: i32 = 0; - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n +=1 ; } - } - } - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 5) { - 
let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - let idx5 = sq + PATTERNFILE4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } - } - } - - n -} - -fn check_patternrank3_live_n(pos: &Pos, sd: Side) -> i32 { - - let last_move: Move = pos.p_last; - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - let idx5 = sq + PATTERNRANK4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } - } - } - - n -} - -fn check_patterndial3_live_n(pos: &Pos, sd: Side) -> i32 { - - let last_move: Move = pos.p_last; - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..(FILE_SIZE - 5) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - let idx5 = sq + PATTERNDIAL4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = 
pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } - } - } - - n -} - -fn check_patterndiar3_live_n(pos: &Pos, sd: Side) -> i32 { - - let last_move: Move = pos.p_last; - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 4..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 5..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - let idx5 = sq + PATTERNDIAR4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } - } - } - - n -} - -#[target_feature(enable = "avx512f")] -unsafe fn pos_is_winner_avx512(pos : &Pos) -> bool { - - let current_side = side_opp(pos.p_turn); - - let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) ); - - let answer_mask: __mmask16 = 0b01111111_11111111; - - let coloridx = current_side as usize; - - let mut temp_mask: [[__mmask16; 5]; 4] = [[0; 5]; 4]; - - for dir in 0..4 { - let board0 = _mm512_set_epi32(0, pos.bitboard[coloridx][dir][14], pos.bitboard[coloridx][dir][13], pos.bitboard[coloridx][dir][12], pos.bitboard[coloridx][dir][11], pos.bitboard[coloridx][dir][10], pos.bitboard[coloridx][dir][9], pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); - - let boardf = _mm512_and_epi32(answer, board0); - - temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - - for i in 1..5 { - - let board1 = _mm512_rol_epi32(board0, i); - - let boardf = _mm512_and_epi32(answer, board1); - - temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - } - } - - let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]); - let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]); - let check_mask2: __mmask16 = _kor_mask16(temp_mask[0][4], temp_mask[1][0]); - let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][1], temp_mask[1][2]); - let check_mask4: __mmask16 = 
_kor_mask16(temp_mask[1][3], temp_mask[1][4]); - let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]); - let check_mask6: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]); - let check_mask7: __mmask16 = _kor_mask16(temp_mask[2][4], temp_mask[3][0]); - let check_mask8: __mmask16 = _kor_mask16(temp_mask[3][1], temp_mask[3][2]); - let check_mask9: __mmask16 = _kor_mask16(temp_mask[3][3], temp_mask[3][4]); - - let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1); - let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3); - let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5); - let check_mask13: __mmask16 = _kor_mask16(check_mask6, check_mask7); - let check_mask14: __mmask16 = _kor_mask16(check_mask8, check_mask9); - - let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11); - let check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13); - let check_mask18: __mmask16 = _kor_mask16(check_mask16, check_mask17); - let check_mask19: __mmask16 = _kor_mask16(check_mask18, check_mask14); - - if check_mask19 > 0 { return true } else { return false } -} - -#[target_feature(enable = "avx512f")] -unsafe fn check_pattern4_once_avx512(pos : &Pos, sd: Side) -> bool { - - //let current_side = side_opp(sd); - - let answer_color = _mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) ); - let answer_empty = _mm512_set1_epi32( (1<<31)| (1<<26) ); - let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)); - - let answer_mask: __mmask16 = 0b00000001_11111111; - - //let coloridx = current_side as usize; - let coloridx = sd as usize; - let emptyidx = Color::Empty as usize; - - let mut temp_mask: [[__mmask16; 4]; 4] = [[0; 4]; 4]; - - for dir in 0..4 { - - let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); - - let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); - - let boardf1 = _mm512_and_epi32(answer_color, board0);// check sd - let boardf2 = _mm512_and_epi32(answer_empty, board1);// check empty - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - - for i in 1..4 { //only move 3 times - - let board2 = _mm512_rol_epi32(board0, i);//rot sd - let board3 = _mm512_rol_epi32(board1, i);//rot empty - - let boardf1 = _mm512_and_epi32(answer_color, board2); - let boardf2 = _mm512_and_epi32(answer_empty, board3); - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - } - } - - let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]); - let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]); - let check_mask2: __mmask16 = _kor_mask16(temp_mask[1][0], temp_mask[1][1]); - let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][2], temp_mask[1][3]); - let 
check_mask4: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]); - let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]); - let check_mask6: __mmask16 = _kor_mask16(temp_mask[3][0], temp_mask[3][1]); - let check_mask7: __mmask16 = _kor_mask16(temp_mask[3][2], temp_mask[3][3]); - - let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1); - let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3); - let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5); - let check_mask13: __mmask16 = _kor_mask16(check_mask6, check_mask7); - - let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11); - let check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13); - let check_mask19: __mmask16 = _kor_mask16(check_mask16, check_mask17); - - if check_mask19 > 0 { return true } else { return false } -} - -#[target_feature(enable = "avx512f")] -unsafe fn check_pattern4_dead_avx512(pos : &Pos, sd: Side) -> [[[__mmask16; 5]; 4]; 5] { - - //let current_side = side_opp(sd); - - let answer_color: [__m512i; 5] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) ), - _mm512_set1_epi32( (1<<31)| (1<<29)|(1<<28)|(1<<27) ), - _mm512_set1_epi32( (1<<31)|(1<<30) |(1<<28)|(1<<27) ), - _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29) |(1<<27) ), - _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28) )]; - - let answer_empty: [__m512i; 5]= [_mm512_set1_epi32( (1<<31) ), - _mm512_set1_epi32( (1<<30) ), - _mm512_set1_epi32( (1<<29) ), - _mm512_set1_epi32( (1<<28) ), - _mm512_set1_epi32( (1<<27) )]; - - let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)); - - let answer_mask: __mmask16 = 0b00000001_11111111; - - //let coloridx = current_side as usize; - let coloridx = sd as usize; - let emptyidx = Color::Empty as usize; - - let mut temp_mask: [[[__mmask16; 5]; 4]; 5] = [[[0; 5]; 4]; 5]; - - for pattern in 0..5 { - - for dir in 0..4 { - - let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); - - let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); - - let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd - let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - - for i in 1..5 { //only move 4 times - - let board2 = _mm512_rol_epi32(board0, i);//rot sd - let board3 = _mm512_rol_epi32(board1, i);//rot empty - - let boardf1 = _mm512_and_epi32(answer_color[pattern], board2); - let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3); - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - } - } - } - - temp_mask -} - - -#[target_feature(enable = "avx512f")] -unsafe fn check_pattern3_live_avx512(pos : &Pos, sd: 
Side) -> [[[__mmask16; 5]; 4]; 3] { - - //let current_side = side_opp(sd); - - let answer_color: [__m512i; 3] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28) ), - _mm512_set1_epi32( (1<<30)| (1<<28)|(1<<27) ), - _mm512_set1_epi32( (1<<30)|(1<<29) |(1<<27) )]; - - let answer_empty: [__m512i; 3]= [_mm512_set1_epi32( (1<<31)| (1<<27) ), - _mm512_set1_epi32( (1<<31)| (1<<29)| (1<<26) ), - _mm512_set1_epi32( (1<<31)| (1<<28)| (1<<26) )]; - - //let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)); - let answer: [__m512i; 3] = [_mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) ), - _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) ), - _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) )]; - - let answer_mask: __mmask16 = 0b00000001_11111111; - - //let coloridx = current_side as usize; - let coloridx = sd as usize; - let emptyidx = Color::Empty as usize; - - let mut temp_mask: [[[__mmask16; 5]; 4]; 3] = [[[0; 5]; 4]; 3]; - - for pattern in 0..3 { - - for dir in 0..4 { - - let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); - - let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); - - let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd - let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0);//no need answer_mask, because and above - - for i in 1..5 { //only move 4 times - - let board2 = _mm512_rol_epi32(board0, i);//rot sd - let board3 = _mm512_rol_epi32(board1, i);//rot empty - - let boardf1 = _mm512_and_epi32(answer_color[pattern], board2); - let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3); - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0);//no need answer_mask, because and above - } - } - } - - temp_mask -} - -fn check_patternfile5(pos: &Pos, sd: Side) -> bool { - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patternrank5(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + 
PATTERNRANK4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patterndial5(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patterndiar5(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 4..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn main() { - - loop - { - - let start = Instant::now(); - - println!("Hello, this is connect 6!"); - - //unsafe { test_avx512(); } - - let test_state: [Color; SQUARE_SIZE as usize] = [Color::Empty; SQUARE_SIZE as usize]; - - let test_bitboard: [[[i32; FILE_SIZE as usize]; 4]; 3] = [[[0; FILE_SIZE as usize]; 4]; 3]; - - let mut test1 = Pos { - state: test_state, - p_turn: Color::Black, - p_last: square_make(5,5), - - bitboard: test_bitboard, - }; - - test1.init(); - - //pos_disp(&test1); - - for i in 0..(FILE_SIZE*RANK_SIZE) { - - // println!("----------------------------------------\n\n\n\n"); - // println!("MOVE {}!!!!\n\n\n\n", i); - - - let mut d = 2; - let mut e = 4; - - //if i < 6 { d = 1; e = 2; } - - let next_move: Move = search(&test1, d, e); - //println!("next move is {}", next_move); - //println!("file is {}", square_file(next_move)); - //println!("rank is {}", square_rank(next_move)); - - test1.do_move(next_move); - - //pos_disp(&test1); - - if pos_is_end(&test1) { - - println!("Game over!!!!!!"); - println!("MOVE {}!!!!\n", i); - //pos_disp(&test1); - - break; } - } - - - let duration = start.elapsed(); - - println!("Time elapsed in expensive_function() is: {:?}", duration); - } - - -} From 38c4f1da99c63acfed6985623074331d1184da5c Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 14:28:45 +0000 Subject: [PATCH 16/25] mm_mask_sub: ss,sd; mm_mask_mul: ss,sd; mm_mask_div: ss,sd --- crates/core_arch/avx512f.md | 24 +- crates/core_arch/src/x86/avx512f.rs | 360 ++++++++++++++++++++++++++++ 2 files changed, 372 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 01328a43f8..1f0f9c1460 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1252,8 +1252,8 @@ * [ ] [`_mm_mask_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sd&expand=5236) * [ ] 
[`_mm_mask_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sd&expand=5236) * [ ] [`_mm_mask_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_ss&expand=5236) - * [ ] [`_mm_mask_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sd&expand=5236) - * [ ] [`_mm_mask_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ss&expand=5236) + * [x] [`_mm_mask_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sd&expand=5236) + * [x] [`_mm_mask_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ss&expand=5236) * [ ] [`_mm_mask_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_sd&expand=5236) * [ ] [`_mm_mask_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_ss&expand=5236) * [ ] [`_mm_mask_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_sd&expand=5236) @@ -1296,8 +1296,8 @@ * [ ] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236) * [ ] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236) * [ ] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236) - * [ ] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236) - * [ ] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236) + * [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236) + * [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236) * [ ] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236) * [ ] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236) * [ ] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236) @@ -1318,8 +1318,8 @@ * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss&expand=5236) * [ ] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236) * [ ] [`_mm_mask_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_ss&expand=5236) - * [ ] [`_mm_mask_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sd&expand=5236) - * [ ] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236) + * [x] [`_mm_mask_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sd&expand=5236) + * [x] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236) * [ ] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236) * [ ] 
[`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236) * [x] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236) @@ -1330,8 +1330,8 @@ * [ ] [`_mm_maskz_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sd&expand=5236) * [ ] [`_mm_maskz_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sd&expand=5236) * [ ] [`_mm_maskz_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_ss&expand=5236) - * [ ] [`_mm_maskz_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sd&expand=5236) - * [ ] [`_mm_maskz_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ss&expand=5236) + * [x] [`_mm_maskz_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sd&expand=5236) + * [x] [`_mm_maskz_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ss&expand=5236) * [ ] [`_mm_maskz_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_sd&expand=5236) * [ ] [`_mm_maskz_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_ss&expand=5236) * [ ] [`_mm_maskz_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_sd&expand=5236) @@ -1374,8 +1374,8 @@ * [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236) * [ ] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236) * [ ] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236) - * [ ] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236) - * [ ] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236) + * [x] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236) + * [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236) * [ ] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236) * [ ] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236) * [ ] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236) @@ -1394,8 +1394,8 @@ * [ ] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236) * [ ] [`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236) * [ ] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236) - * [ ] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236) - * [ ] 
[`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236) + * [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236) + * [x] [`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236) * [ ] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236) * [ ] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236) * [ ] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7804a8ce44..852abfe349 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18487,6 +18487,216 @@ pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { transmute(r) } +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_ss&expand=5750) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss))] +pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta - extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_ss&expand=5751) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss))] +pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta - extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_sd&expand=5746) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd))] +pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta - extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_sd&expand=5747) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd))] +pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta - extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_ss&expand=3950) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss))] +pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta * extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_ss&expand=3951) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss))] +pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta * extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_sd&expand=3947) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd))] +pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta * extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_sd&expand=3948) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd))] +pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta * extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_ss&expand=2181) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss))] +pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta / extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_ss&expand=2182) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss))] +pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta / extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_sd&expand=2178) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd))] +pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta / extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_sd&expand=2179) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd))] +pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta / extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + /// Equal pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; /// Less-than @@ -28122,4 +28332,154 @@ mod tests { let e = _mm_set_pd(1., 6.); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_sub_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sub_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_sub_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sub_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sub_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sub_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sub_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sub_sd(0b11111111, a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_mul_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_mul_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_mul_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_mul_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_mul_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_mul_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_mul_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_mul_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_div_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_div_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_div_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_div_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_div_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_div_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_div_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_div_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } } From 07a2ee9a1b35d05a6c6adab21c486c4453360a48 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 15:40:40 +0000 Subject: [PATCH 17/25] add_round: ss,sd; sub_round: ss,sd; mul_round: ss,sd; div_round: ss,sd --- crates/core_arch/avx512f.md | 44 +- crates/core_arch/src/x86/avx512f.rs | 1053 +++++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 18 + 3 files changed, 1093 insertions(+), 22 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 1f0f9c1460..cd93fa26b9 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1133,8 +1133,8 @@ * [x] [`_mm512_zextps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextps256_ps512&expand=5236) * [x] [`_mm512_zextsi128_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextsi128_si512&expand=5236) * [x] 
[`_mm512_zextsi256_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextsi256_si512&expand=5236) - * [ ] [`_mm_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=5236) - * [ ] [`_mm_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=5236) + * [x] [`_mm_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=5236) + * [x] [`_mm_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=5236) * [x] [`_mm_cmp_round_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sd_mask&expand=5236) * [x] [`_mm_cmp_round_ss_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_ss_mask&expand=5236) * [x] [`_mm_cmp_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd_mask&expand=5236) @@ -1200,8 +1200,8 @@ * [ ] [`_mm_cvtu32_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_ss&expand=5236) * [ ] [`_mm_cvtu64_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_sd&expand=5236) * [ ] [`_mm_cvtu64_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_ss&expand=5236) - * [ ] [`_mm_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sd&expand=5236) - * [ ] [`_mm_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_ss&expand=5236) + * [x] [`_mm_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sd&expand=5236) + * [x] [`_mm_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_ss&expand=5236) * [ ] [`_mm_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_round_sd&expand=5236) * [ ] [`_mm_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_round_ss&expand=5236) * [ ] [`_mm_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_sd&expand=5236) @@ -1238,8 +1238,8 @@ * [ ] [`_mm_mask3_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_ss&expand=5236) * [ ] [`_mm_mask3_fnmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sd&expand=5236) * [ ] [`_mm_mask3_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ss&expand=5236) - * [ ] [`_mm_mask_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sd&expand=5236) - * [ ] [`_mm_mask_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_ss&expand=5236) + * [x] [`_mm_mask_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sd&expand=5236) + * [x] [`_mm_mask_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_ss&expand=5236) * [x] [`_mm_mask_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sd&expand=5236) * [x] [`_mm_mask_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ss&expand=5236) * [x] 
[`_mm_mask_cmp_round_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=5236) @@ -1250,8 +1250,8 @@ * [ ] [`_mm_mask_cvt_roundss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sd&expand=5236) * [ ] [`_mm_mask_cvtsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_ss&expand=5236) * [ ] [`_mm_mask_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sd&expand=5236) - * [ ] [`_mm_mask_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sd&expand=5236) - * [ ] [`_mm_mask_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_ss&expand=5236) + * [x] [`_mm_mask_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sd&expand=5236) + * [x] [`_mm_mask_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_ss&expand=5236) * [x] [`_mm_mask_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sd&expand=5236) * [x] [`_mm_mask_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ss&expand=5236) * [ ] [`_mm_mask_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_sd&expand=5236) @@ -1316,20 +1316,20 @@ * [ ] [`_mm_mask_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ss&expand=5236) * [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd&expand=5236) * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss&expand=5236) - * [ ] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236) - * [ ] [`_mm_mask_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_ss&expand=5236) + * [x] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236) + * [x] [`_mm_mask_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_ss&expand=5236) * [x] [`_mm_mask_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sd&expand=5236) * [x] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236) - * [ ] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236) - * [ ] [`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236) + * [x] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236) + * [x] [`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236) * [x] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236) * [x] [`_mm_maskz_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ss&expand=5236) * [ ] 
[`_mm_maskz_cvt_roundsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_ss&expand=5236) * [ ] [`_mm_maskz_cvt_roundss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sd&expand=5236) * [ ] [`_mm_maskz_cvtsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_ss&expand=5236) * [ ] [`_mm_maskz_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sd&expand=5236) - * [ ] [`_mm_maskz_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sd&expand=5236) - * [ ] [`_mm_maskz_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_ss&expand=5236) + * [x] [`_mm_maskz_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sd&expand=5236) + * [x] [`_mm_maskz_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_ss&expand=5236) * [x] [`_mm_maskz_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sd&expand=5236) * [x] [`_mm_maskz_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ss&expand=5236) * [ ] [`_mm_maskz_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_sd&expand=5236) @@ -1372,8 +1372,8 @@ * [ ] [`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236) * [ ] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236) * [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236) - * [ ] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236) - * [ ] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236) + * [x] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236) + * [x] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236) * [x] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236) * [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236) * [ ] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236) @@ -1392,16 +1392,16 @@ * [ ] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236) * [ ] [`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236) * [ ] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236) - * [ ] [`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236) - * [ ] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236) + * [x] 
[`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236) + * [x] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236) * [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236) * [x] [`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236) * [ ] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236) * [ ] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236) * [ ] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236) * [ ] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236) - * [ ] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236) - * [ ] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236) + * [x] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236) + * [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236) * [ ] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236) * [ ] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236) * [ ] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236) @@ -1416,7 +1416,7 @@ * [ ] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236) * [ ] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236) * [ ] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236) - * [ ] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236) - * [ ] [`_mm_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_ss&expand=5236) + * [x] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236) + * [x] [`_mm_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_ss&expand=5236)
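A note on usage (editor's addition, not part of the patch): the avx512f.rs hunk that follows adds the explicit-rounding scalar forms (`_mm_add_round_ss`, `_mm_mask_sub_round_sd`, and friends) on top of the masked scalar forms from the previous patch. A minimal sketch of how the write-mask, zero-mask, and rounding parameters combine is shown below; it assumes a nightly toolchain where these core_arch intrinsics and AVX-512F are available, with the signatures exactly as declared in these diffs.

    // Illustrative sketch only, not part of the patch. It assumes the scalar
    // mask and rounding intrinsics added in patches 16-17 are exposed from
    // core::arch::x86_64 on a nightly toolchain with AVX-512F enabled.
    #[target_feature(enable = "avx512f")]
    unsafe fn demo() {
        use core::arch::x86_64::*;

        let src = _mm_set_ss(42.0);
        let a = _mm_set_ss(1.0);
        let b = _mm_set_ss(3.0);

        // Write-mask form: mask bit 0 is clear, so the lower lane is copied
        // from `src` and no division is performed.
        let kept = _mm_mask_div_ss(src, 0b0, a, b); // lower lane == 42.0

        // Zero-mask form: a clear mask bit 0 zeroes the lower lane instead.
        let zeroed = _mm_maskz_div_ss(0b0, a, b); // lower lane == 0.0

        // Explicit-rounding form: the constant selects the rounding mode and
        // suppresses exceptions instead of consulting MXCSR.RC.
        let summed = _mm_add_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

        let _ = (kept, zeroed, summed);
    }

The `_round` variants in the hunk below follow the same mask conventions as the non-rounding forms; the only difference is the extra rounding immediate, which is threaded through to the underlying `vaddss`/`vsubsd`-style calls via the `constify_imm4_round!` macro.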

diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 852abfe349..4294ec92b5 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18697,6 +18697,722 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { transmute(r) } +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vaddss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_add_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vaddss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vaddsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_add_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! 
call { + ($imm4:expr) => { + vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_add_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vaddsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sub_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sub_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
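+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let a = _mm_set_pd(1., 2.);
+/// let b = _mm_set_pd(3., 4.);
+/// // mask bit 0 clear: lower lane is zeroed, upper lane copied from a
+/// let r = _mm_maskz_sub_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_pd(1., 0.); with mask 0b1 the lower lane is 2. - 4. = -2.
+/// ```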
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sub_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vmulss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
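+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let src = _mm_set_ps(10., 11., 100., 110.);
+/// let a = _mm_set_ps(1., 2., 10., 20.);
+/// let b = _mm_set_ps(3., 4., 30., 40.);
+/// // mask bit 0 clear: lower lane comes from src (110.), upper lanes from a
+/// let r = _mm_mask_mul_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_ps(1., 2., 10., 110.); with mask 0b1 the lower lane is 20. * 40. = 800.
+/// ```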
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_mul_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vmulss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! 
call { + ($imm4:expr) => { + vmulsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_mul_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_mul_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vmulsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
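+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let a = _mm_set_ps(1., 2., 10., 20.);
+/// let b = _mm_set_ps(3., 4., 30., 40.);
+/// // lower lane: 20. / 40. = 0.5, upper lanes copied from a
+/// let r = _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_ps(1., 2., 10., 0.5)
+/// ```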
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vdivss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_div_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vdivss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vdivsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
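+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let src = _mm_set_pd(10., 11.);
+/// let a = _mm_set_pd(1., 2.);
+/// let b = _mm_set_pd(3., 4.);
+/// // mask bit 0 set: lower lane = 2. / 4. = 0.5, upper lane copied from a
+/// let r = _mm_mask_div_round_sd(src, 0b1, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_pd(1., 0.5); with mask 0 the lower lane would be src's 11.
+/// ```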
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_div_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_div_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! 
call { + ($imm4:expr) => { + vdivsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + /// Equal pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; /// Less-than @@ -19299,6 +20015,23 @@ extern "C" { fn vexpandps(a: f32x16, src: f32x16, mask: u16) -> f32x16; #[link_name = "llvm.x86.avx512.mask.expand.pd.512"] fn vexpandpd(a: f64x8, src: f64x8, mask: u8) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.add.ss.round"] + fn vaddss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.add.sd.round"] + fn vaddsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.sub.ss.round"] + fn vsubss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.sub.sd.round"] + fn vsubsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.mul.ss.round"] + fn vmulss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.mul.sd.round"] + fn vmulsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.div.ss.round"] + fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.div.sd.round"] + fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; } #[cfg(test)] @@ -28482,4 +29215,324 @@ mod tests { let e = _mm_set_pd(1., 0.5); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_add_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_add_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_add_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_add_round_ss( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_add_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_add_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_add_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_add_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_add_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_add_round_sd( + src, + 
0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_add_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_add_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sub_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_sub_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_sub_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sub_round_ss( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_sub_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sub_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sub_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_sub_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sub_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sub_round_sd( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sub_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sub_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mul_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mul_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = 
_mm_mask_mul_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_mul_round_ss( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_mul_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_mul_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mul_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mul_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_mul_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_mul_round_sd( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_mul_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_mul_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_div_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_div_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_div_round_ss( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_div_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_div_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_div_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_div_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_div_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_div_round_sd( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_div_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_div_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } } diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 41cbd5029a..80dad4d64e 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -451,6 +451,24 @@ impl m128Ext for __m128 { } } +#[allow(non_camel_case_types)] +#[unstable(feature = "stdsimd_internal", issue = "none")] +pub(crate) trait m128dExt: Sized { + fn as_m128d(self) -> __m128d; + + #[inline] + fn as_f64x2(self) -> crate::core_arch::simd::f64x2 { + unsafe { transmute(self.as_m128d()) } + } +} + +impl m128dExt for __m128d { + #[inline] + fn as_m128d(self) -> Self { + self + } +} + #[allow(non_camel_case_types)] #[unstable(feature = "stdsimd_internal", issue = "none")] pub(crate) trait m256Ext: Sized { From 11b2b65badfcba141800a9486ade247a6768bb23 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 16:06:53 +0000 Subject: [PATCH 18/25] mask_sqrt: ss,sd; sqrt_round: ss,sd; --- crates/core_arch/avx512f.md | 20 +- crates/core_arch/src/x86/avx512f.rs | 377 ++++++++++++++++++++++++++++ 2 files changed, 387 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index cd93fa26b9..6224ef09c2 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1310,10 +1310,10 @@ * [ ] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236) * [ ] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236) * [ ] [`_mm_mask_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ss&expand=5236) - * [ ] [`_mm_mask_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sd&expand=5236) - * [ ] [`_mm_mask_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_ss&expand=5236) - * [ ] [`_mm_mask_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sd&expand=5236) - * [ ] [`_mm_mask_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ss&expand=5236) + * [x] [`_mm_mask_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sd&expand=5236) + * [x] [`_mm_mask_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_ss&expand=5236) + * [x] [`_mm_mask_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sd&expand=5236) + * [x] 
[`_mm_mask_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ss&expand=5236) * [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd&expand=5236) * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss&expand=5236) * [x] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236) @@ -1388,10 +1388,10 @@ * [ ] [`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236) * [ ] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236) * [ ] [`_mm_maskz_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ss&expand=5236) - * [ ] [`_mm_maskz_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sd&expand=5236) - * [ ] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236) - * [ ] [`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236) - * [ ] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236) + * [x] [`_mm_maskz_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sd&expand=5236) + * [x] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236) + * [x] [`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236) + * [x] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236) * [x] [`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236) * [x] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236) * [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236) @@ -1414,8 +1414,8 @@ * [ ] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236) * [ ] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236) * [ ] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236) - * [ ] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236) - * [ ] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236) + * [x] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236) + * [x] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236) * [x] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236) * [x] [`_mm_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_ss&expand=5236) diff 
--git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 4294ec92b5..6ec24d2a00 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18697,6 +18697,70 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { transmute(r) } +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss))] +pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_ss&expand=5388) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss))] +pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_sd&expand=5384) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_sd&expand=5385) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -19413,6 +19477,185 @@ pub unsafe fn _mm_maskz_div_round_sd( transmute(constify_imm4_round!(rounding, call)) } +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sqrt_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
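+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let a = _mm_set_ps(1., 2., 10., 20.);
+/// let b = _mm_set_ps(3., 4., 30., 4.);
+/// // mask bit 0 clear: lower lane is zeroed, upper lanes copied from a
+/// let r = _mm_maskz_sqrt_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_ps(1., 2., 10., 0.); with mask 0b1 the lower lane is sqrt(4.) = 2.
+/// ```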
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
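+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let src = _mm_set_pd(10., 11.);
+/// let a = _mm_set_pd(1., 2.);
+/// let b = _mm_set_pd(3., 4.);
+/// // mask bit 0 set: lower lane = sqrt(4.) = 2., upper lane copied from a
+/// let r = _mm_mask_sqrt_round_sd(src, 0b1, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_pd(1., 2.); with mask 0 the lower lane would be src's 11.
+/// ```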
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sqrt_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sqrt_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! 
call { + ($imm4:expr) => { + vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + /// Equal pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; /// Less-than @@ -20032,6 +20275,10 @@ extern "C" { fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.div.sd.round"] fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.sqrt.ss"] + fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] + fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; } #[cfg(test)] @@ -29216,6 +29463,56 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_sqrt_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sqrt_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_sqrt_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sqrt_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sqrt_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sqrt_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sqrt_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sqrt_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); @@ -29535,4 +29832,84 @@ mod tests { let e = _mm_set_pd(1., 0.5); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sqrt_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_sqrt_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sqrt_round_ss( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = 
_mm_maskz_sqrt_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sqrt_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sqrt_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_sqrt_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sqrt_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sqrt_round_sd( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sqrt_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sqrt_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } } From 1afd1b92d9b2f588338dd9e1fdf8e66ab8841d01 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 16:37:19 +0000 Subject: [PATCH 19/25] mask_max: ss,sd; mask_min: ss,sd; max_round: ss,sd; min_round: ss,sd; --- crates/core_arch/avx512f.md | 44 +- crates/core_arch/src/x86/avx512f.rs | 640 ++++++++++++++++++++++++++++ 2 files changed, 662 insertions(+), 22 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 6224ef09c2..701b65d560 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1284,18 +1284,18 @@ * [ ] [`_mm_mask_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ss&expand=5236) * [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd&expand=5236) * [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss&expand=5236) - * [ ] [`_mm_mask_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sd&expand=5236) - * [ ] [`_mm_mask_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_ss&expand=5236) - * [ ] [`_mm_mask_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sd&expand=5236) - * [ ] [`_mm_mask_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ss&expand=5236) - * [ ] [`_mm_mask_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sd&expand=5236) - * [ ] [`_mm_mask_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_ss&expand=5236) - * [ ] [`_mm_mask_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sd&expand=5236) - * [ ] [`_mm_mask_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ss&expand=5236) + * [x] 
[`_mm_mask_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sd&expand=5236) + * [x] [`_mm_mask_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_ss&expand=5236) + * [x] [`_mm_mask_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sd&expand=5236) + * [x] [`_mm_mask_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ss&expand=5236) + * [x] [`_mm_mask_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sd&expand=5236) + * [x] [`_mm_mask_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_ss&expand=5236) + * [x] [`_mm_mask_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sd&expand=5236) + * [x] [`_mm_mask_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ss&expand=5236) * [ ] [`_mm_mask_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sd&expand=5236) * [ ] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236) - * [ ] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236) - * [ ] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236) + * [x] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236) + * [x] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236) * [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236) * [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236) * [ ] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236) @@ -1362,14 +1362,14 @@ * [ ] [`_mm_maskz_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ss&expand=5236) * [ ] [`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd&expand=5236) * [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss&expand=5236) - * [ ] [`_mm_maskz_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sd&expand=5236) - * [ ] [`_mm_maskz_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_ss&expand=5236) - * [ ] [`_mm_maskz_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sd&expand=5236) - * [ ] [`_mm_maskz_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ss&expand=5236) - * [ ] [`_mm_maskz_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sd&expand=5236) - * [ ] [`_mm_maskz_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_ss&expand=5236) - * [ ] [`_mm_maskz_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sd&expand=5236) - * [ ] 
[`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236) + * [x] [`_mm_maskz_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sd&expand=5236) + * [x] [`_mm_maskz_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_ss&expand=5236) + * [x] [`_mm_maskz_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sd&expand=5236) + * [x] [`_mm_maskz_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ss&expand=5236) + * [x] [`_mm_maskz_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sd&expand=5236) + * [x] [`_mm_maskz_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_ss&expand=5236) + * [x] [`_mm_maskz_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sd&expand=5236) + * [x] [`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236) * [ ] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236) * [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236) * [x] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236) @@ -1396,10 +1396,10 @@ * [x] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236) * [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236) * [x] [`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236) - * [ ] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236) - * [ ] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236) - * [ ] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236) - * [ ] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236) + * [x] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236) + * [x] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236) + * [x] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236) + * [x] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236) * [x] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236) * [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236) * [ ] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 6ec24d2a00..185fba00e9 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ 
-18697,6 +18697,134 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { transmute(r) } +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmaxss))] +pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_ss&expand=3673) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmaxss))] +pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_sd&expand=3669) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmaxsd))] +pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmaxsd))] +pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_ss&expand=3786) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminss))] +pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_ss&expand=3787) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminss))] +pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_sd&expand=3783) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminsd))] +pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_sd&expand=3784) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminsd))] +pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387) @@ -19477,6 +19605,282 @@ pub unsafe fn _mm_maskz_div_round_sd( transmute(constify_imm4_round!(rounding, call)) } +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxss(
+                a.as_f32x4(),
+                b.as_f32x4(),
+                _mm_setzero_ps().as_f32x4(),
+                0b1,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_ss&expand=3672)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_max_round_ss(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+    sae: i32,
+) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxss(
+                a.as_f32x4(),
+                b.as_f32x4(),
+                _mm_setzero_ps().as_f32x4(),
+                k,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxsd(
+                a.as_f64x2(),
+                b.as_f64x2(),
+                _mm_setzero_pd().as_f64x2(),
+                0b1,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_max_round_sd(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+    sae: i32,
+) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_sd&expand=3670)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxsd(
+                a.as_f64x2(),
+                b.as_f64x2(),
+                _mm_setzero_pd().as_f64x2(),
+                k,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vminss(
+                a.as_f32x4(),
+                b.as_f32x4(),
+                _mm_setzero_ps().as_f32x4(),
+                0b1,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_ss&expand=3780)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_min_round_ss(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+    sae: i32,
+) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vminss(
+                a.as_f32x4(),
+                b.as_f32x4(),
+                _mm_setzero_ps().as_f32x4(),
+                k,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vminsd(
+                a.as_f64x2(),
+                b.as_f64x2(),
+                _mm_setzero_pd().as_f64x2(),
+                0b1,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_min_round_sd(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+    sae: i32,
+) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_sd&expand=3778)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    macro_rules!
call { + ($imm4:expr) => { + vminsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_sae!(sae, call)) +} + /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -20275,6 +20679,14 @@ extern "C" { fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.div.sd.round"] fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.max.ss.round"] + fn vmaxss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.max.sd.round"] + fn vmaxsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.min.ss.round"] + fn vminss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.min.sd.round"] + fn vminsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; #[link_name = "llvm.x86.avx512.mask.sqrt.ss"] fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] @@ -29463,6 +29875,102 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_max_ss(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_max_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_max_ss(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_max_ss(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_max_sd(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_max_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_max_sd(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_max_sd(0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_min_ss(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_min_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_min_ss(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_min_ss(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + 
assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_min_sd(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_min_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_min_sd(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_min_sd(0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_mask_sqrt_ss() { let src = _mm_set_ps(10., 11., 100., 110.); @@ -29833,6 +30341,138 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_max_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_max_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_max_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_max_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_max_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_max_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_max_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_max_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_max_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_max_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_min_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_min_round_ss(a, 0, a, b, 
_MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_min_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_min_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_min_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_min_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_min_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_min_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_min_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_min_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_sqrt_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); From eb58b50a418d547b0a1c518641568e0fa6aaf1fa Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 17:20:07 +0000 Subject: [PATCH 20/25] rsqrt14: ss,sd; rcp14: ss,sd --- crates/core_arch/avx512f.md | 24 +-- crates/core_arch/src/x86/avx512f.rs | 305 ++++++++++++++++++++++++++++ 2 files changed, 317 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 701b65d560..6483ea7460 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1298,14 +1298,14 @@ * [x] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236) * [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236) * [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236) - * [ ] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236) - * [ ] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236) + * [x] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236) + * [x] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236) * [ ] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236) * [ ] 
[`_mm_mask_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_ss&expand=5236) * [ ] [`_mm_mask_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sd&expand=5236) * [ ] [`_mm_mask_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ss&expand=5236) - * [ ] [`_mm_mask_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_sd&expand=5236) - * [ ] [`_mm_mask_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ss&expand=5236) + * [x] [`_mm_mask_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_sd&expand=5236) + * [x] [`_mm_mask_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ss&expand=5236) * [ ] [`_mm_mask_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sd&expand=5236) * [ ] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236) * [ ] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236) @@ -1376,14 +1376,14 @@ * [x] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236) * [x] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236) * [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236) - * [ ] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236) - * [ ] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236) + * [x] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236) + * [x] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236) * [ ] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236) * [ ] [`_mm_maskz_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_ss&expand=5236) * [ ] [`_mm_maskz_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sd&expand=5236) * [ ] [`_mm_maskz_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ss&expand=5236) - * [ ] [`_mm_maskz_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_sd&expand=5236) - * [ ] [`_mm_maskz_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ss&expand=5236) + * [x] [`_mm_maskz_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_sd&expand=5236) + * [x] [`_mm_maskz_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ss&expand=5236) * [ ] [`_mm_maskz_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sd&expand=5236) * [ ] 
[`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236) * [ ] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236) @@ -1402,14 +1402,14 @@ * [x] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236) * [x] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236) * [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236) - * [ ] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236) - * [ ] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236) + * [x] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236) + * [x] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236) * [ ] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236) * [ ] [`_mm_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_ss&expand=5236) * [ ] [`_mm_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sd&expand=5236) * [ ] [`_mm_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ss&expand=5236) - * [ ] [`_mm_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_sd&expand=5236) - * [ ] [`_mm_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ss&expand=5236) + * [x] [`_mm_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_sd&expand=5236) + * [x] [`_mm_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ss&expand=5236) * [ ] [`_mm_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sd&expand=5236) * [ ] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236) * [ ] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 185fba00e9..8c93b318dc 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18889,6 +18889,166 @@ pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d )) } +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_ss&expand=4825) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + )) +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_ss&expand=4823) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_ss&expand=4824) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + )) +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_sd&expand=4822) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + )) +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_sd&expand=4820) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_sd&expand=4821) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + )) +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_ss&expand=4508) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + )) +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_ss&expand=4506) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_ss&expand=4507) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + )) +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_sd&expand=4505) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + )) +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_sd&expand=4503) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_sd&expand=4504) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + )) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -20691,6 +20851,15 @@ extern "C" { fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.rsqrt14.ss"] + fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.rsqrt14.sd"] + fn vrsqrt14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.rcp14.ss"] + fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.rcp14.sd"] + fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; } #[cfg(test)] @@ -30021,6 +30190,142 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rsqrt14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_rsqrt14_ss(a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rsqrt14_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_rsqrt14_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_rsqrt14_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rsqrt14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_rsqrt14_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_rsqrt14_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rsqrt14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_rsqrt14_sd(a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rsqrt14_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_rsqrt14_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_rsqrt14_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rsqrt14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_rsqrt14_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_rsqrt14_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rcp14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_rcp14_ss(a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rcp14_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_rcp14_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_rcp14_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + 
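+    // The exact-equality assertions in the rcp14/rsqrt14 tests rely on the inputs being
+    // powers of two, for which the hardware approximations happen to come out exact. A
+    // rough sketch of how a test for an arbitrary input might instead check the documented
+    // 2^-14 relative-error bound (the input value 3.0 here is illustrative only):
+    //
+    //     let b = _mm_set_ps(0., 0., 0., 3.);
+    //     let r = _mm_cvtss_f32(_mm_rcp14_ss(b, b));
+    //     let exact = 1.0_f32 / 3.0;
+    //     assert!(((r - exact) / exact).abs() < 2.0_f32.powi(-14));
+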
#[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rcp14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_rcp14_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_rcp14_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rcp14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_rcp14_sd(a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rcp14_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_rcp14_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_rcp14_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rcp14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_rcp14_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_rcp14_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); From b8a68500fcd74c239820a35023ccfab809ed9f99 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 18:08:10 +0000 Subject: [PATCH 21/25] getexp: ss,sd; getexp_round: ss,sd; getmant: ss,sd; getmant_round: ss,sd; --- crates/core_arch/avx512f.md | 48 +- crates/core_arch/src/x86/avx512f.rs | 1896 +++++++++++++++++++++------ 2 files changed, 1507 insertions(+), 437 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 6483ea7460..5b2aaa1c21 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1214,14 +1214,14 @@ * [ ] [`_mm_fnmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_ss&expand=5236) * [ ] [`_mm_fnmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sd&expand=5236) * [ ] [`_mm_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_ss&expand=5236) - * [ ] [`_mm_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sd&expand=5236) - * [ ] [`_mm_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_ss&expand=5236) - * [ ] [`_mm_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sd&expand=5236) - * [ ] [`_mm_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ss&expand=5236) - * [ ] [`_mm_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sd&expand=5236) - * [ ] [`_mm_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_ss&expand=5236) - * [ ] [`_mm_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sd&expand=5236) - * [ ] [`_mm_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ss&expand=5236) + * [x] [`_mm_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sd&expand=5236) + * [x] 
[`_mm_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_ss&expand=5236) + * [x] [`_mm_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sd&expand=5236) + * [x] [`_mm_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ss&expand=5236) + * [x] [`_mm_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sd&expand=5236) + * [x] [`_mm_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_ss&expand=5236) + * [x] [`_mm_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sd&expand=5236) + * [x] [`_mm_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ss&expand=5236) * [ ] [`_mm_mask3_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sd&expand=5236) * [ ] [`_mm_mask3_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_ss&expand=5236) * [ ] [`_mm_mask3_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sd&expand=5236) @@ -1274,14 +1274,14 @@ * [ ] [`_mm_mask_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_ss&expand=5236) * [ ] [`_mm_mask_fnmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sd&expand=5236) * [ ] [`_mm_mask_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ss&expand=5236) - * [ ] [`_mm_mask_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sd&expand=5236) - * [ ] [`_mm_mask_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_ss&expand=5236) - * [ ] [`_mm_mask_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sd&expand=5236) - * [ ] [`_mm_mask_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ss&expand=5236) - * [ ] [`_mm_mask_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sd&expand=5236) - * [ ] [`_mm_mask_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_ss&expand=5236) - * [ ] [`_mm_mask_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sd&expand=5236) - * [ ] [`_mm_mask_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ss&expand=5236) + * [x] [`_mm_mask_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sd&expand=5236) + * [x] [`_mm_mask_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_ss&expand=5236) + * [x] [`_mm_mask_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sd&expand=5236) + * [x] [`_mm_mask_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ss&expand=5236) + * [x] [`_mm_mask_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sd&expand=5236) + * [x] 
[`_mm_mask_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_ss&expand=5236) + * [x] [`_mm_mask_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sd&expand=5236) + * [x] [`_mm_mask_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ss&expand=5236) * [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd&expand=5236) * [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss&expand=5236) * [x] [`_mm_mask_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sd&expand=5236) @@ -1352,14 +1352,14 @@ * [ ] [`_mm_maskz_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_ss&expand=5236) * [ ] [`_mm_maskz_fnmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sd&expand=5236) * [ ] [`_mm_maskz_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ss&expand=5236) - * [ ] [`_mm_maskz_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sd&expand=5236) - * [ ] [`_mm_maskz_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_ss&expand=5236) - * [ ] [`_mm_maskz_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sd&expand=5236) - * [ ] [`_mm_maskz_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ss&expand=5236) - * [ ] [`_mm_maskz_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sd&expand=5236) - * [ ] [`_mm_maskz_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_ss&expand=5236) - * [ ] [`_mm_maskz_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sd&expand=5236) - * [ ] [`_mm_maskz_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ss&expand=5236) + * [x] [`_mm_maskz_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sd&expand=5236) + * [x] [`_mm_maskz_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_ss&expand=5236) + * [x] [`_mm_maskz_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sd&expand=5236) + * [x] [`_mm_maskz_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ss&expand=5236) + * [x] [`_mm_maskz_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sd&expand=5236) + * [x] [`_mm_maskz_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_ss&expand=5236) + * [x] [`_mm_maskz_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sd&expand=5236) + * [x] [`_mm_maskz_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ss&expand=5236) * [ ] 
[`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd&expand=5236) * [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss&expand=5236) * [x] [`_mm_maskz_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 8c93b318dc..24b6245e17 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -19049,6 +19049,342 @@ pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d )) } +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_ss&expand=2862) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + _MM_FROUND_NO_EXC, + )) +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_ss&expand=2863) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_ss&expand=2864) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_sd&expand=2859) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + _MM_FROUND_NO_EXC, + )) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_sd&expand=2860) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_sd&expand=2861) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_ss&expand=2898) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))] +#[rustc_args_required_const(2, 3)] +pub unsafe fn _mm_getmant_ss( + a: __m128, + b: __m128, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128 { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantss( + a.as_f32x4(), + b.as_f32x4(), + $imm2 << 2 | $imm4_1, + _mm_setzero_ps().as_f32x4(), + 0b1, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_ss&expand=2899) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm_mask_getmant_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128 { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantss( + a.as_f32x4(), + b.as_f32x4(), + $imm2 << 2 | $imm4_1, + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_ss&expand=2900) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm_maskz_getmant_ss( + k: __mmask8, + a: __m128, + b: __m128, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128 { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantss( + a.as_f32x4(), + b.as_f32x4(), + $imm2 << 2 | $imm4_1, + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_sd&expand=2895) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))] +#[rustc_args_required_const(2, 3)] +pub unsafe fn _mm_getmant_sd( + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + _mm_setzero_pd().as_f64x2(), + 0b1, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_sd&expand=2896) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm_mask_getmant_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_sd&expand=2897) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm_maskz_getmant_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -19058,15 +19394,373 @@ pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! 
call { + ($imm4:expr) => { + vaddss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_add_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vaddss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vaddsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_add_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_add_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vaddsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sub_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sub_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sub_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vaddss( + vmulss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19078,7 +19772,7 @@ pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { transmute(constify_imm4_round!(rounding, call)) } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19087,12 +19781,12 @@ pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_add_round_ss( +pub unsafe fn _mm_mask_mul_round_ss( src: __m128, k: __mmask8, a: __m128, @@ -19101,13 +19795,13 @@ pub unsafe fn _mm_mask_add_round_ss( ) -> __m128 { macro_rules! call { ($imm4:expr) => { - vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } transmute(constify_imm4_round!(rounding, call)) } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19116,15 +19810,15 @@ pub unsafe fn _mm_mask_add_round_ss( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vaddss( + vmulss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19136,7 +19830,7 @@ pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding transmute(constify_imm4_round!(rounding, call)) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19145,15 +19839,15 @@ pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vaddsd( + vmulsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19165,7 +19859,7 @@ pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d transmute(constify_imm4_round!(rounding, call)) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19174,12 +19868,12 @@ pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_add_round_sd( +pub unsafe fn _mm_mask_mul_round_sd( src: __m128d, k: __mmask8, a: __m128d, @@ -19188,13 +19882,13 @@ pub unsafe fn _mm_mask_add_round_sd( ) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } transmute(constify_imm4_round!(rounding, call)) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19203,12 +19897,12 @@ pub unsafe fn _mm_mask_add_round_sd( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_add_round_sd( +pub unsafe fn _mm_maskz_mul_round_sd( k: __mmask8, a: __m128d, b: __m128d, @@ -19216,7 +19910,7 @@ pub unsafe fn _mm_maskz_add_round_sd( ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vaddsd( + vmulsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19228,7 +19922,7 @@ pub unsafe fn _mm_maskz_add_round_sd( transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19237,15 +19931,15 @@ pub unsafe fn _mm_maskz_add_round_sd( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vsubss( + vdivss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19257,7 +19951,7 @@ pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19266,12 +19960,12 @@ pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sub_round_ss( +pub unsafe fn _mm_mask_div_round_ss( src: __m128, k: __mmask8, a: __m128, @@ -19280,13 +19974,13 @@ pub unsafe fn _mm_mask_sub_round_ss( ) -> __m128 { macro_rules! 
call { ($imm4:expr) => { - vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19295,15 +19989,15 @@ pub unsafe fn _mm_mask_sub_round_ss( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vsubss( + vdivss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19315,7 +20009,7 @@ pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19324,15 +20018,15 @@ pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vsubsd( + vdivsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19344,7 +20038,7 @@ pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19353,12 +20047,12 @@ pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sub_round_sd( +pub unsafe fn _mm_mask_div_round_sd( src: __m128d, k: __mmask8, a: __m128d, @@ -19367,13 +20061,13 @@ pub unsafe fn _mm_mask_sub_round_sd( ) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19382,12 +20076,12 @@ pub unsafe fn _mm_mask_sub_round_sd( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sub_round_sd( +pub unsafe fn _mm_maskz_div_round_sd( k: __mmask8, a: __m128d, b: __m128d, @@ -19395,7 +20089,7 @@ pub unsafe fn _mm_maskz_sub_round_sd( ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vsubsd( + vdivsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19407,24 +20101,18 @@ pub unsafe fn _mm_maskz_sub_round_sd( transmute(constify_imm4_round!(rounding, call)) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
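A minimal usage sketch for the _mm_div_round_ss / _mm_maskz_div_round_ss wrappers introduced above, assuming a nightly toolchain that exposes these AVX512F intrinsics and an avx512f-capable CPU; the helper name and input values are illustrative only:

#[target_feature(enable = "avx512f")]
unsafe fn div_round_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(3.0);
    // Low lane: 1.0 / 3.0 truncated toward zero; the upper three lanes are copied from `a`.
    let r = _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    // Zero-masking: mask bit 0 is clear, so the low lane of the result is zeroed out.
    let z = _mm_maskz_div_round_ss(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    let _ = (r, z);
}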
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vmulss( + vmaxss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19433,56 +20121,44 @@ pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_mul_round_ss( +pub unsafe fn _mm_mask_max_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, + sae: i32, ) -> __m128 { macro_rules! call { ($imm4:expr) => { - vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vmulss( + vmaxss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19491,27 +20167,21 @@ pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vmulsd( + vmaxsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19519,62 +20189,45 @@ pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d $imm4, ) }; - } - transmute(constify_imm4_round!(rounding, call)) -} - -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE + } + transmute(constify_imm4_sae!(sae, call)) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_mul_round_sd( +pub unsafe fn _mm_mask_max_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, + sae: i32, ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_mul_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, - rounding: i32, -) -> __m128d { +pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vmulsd( + vmaxsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19583,27 +20236,21 @@ pub unsafe fn _mm_maskz_mul_round_sd( ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[cfg_attr(test, assert_instr(vminss, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vdivss( + vminss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19612,56 +20259,44 @@ pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
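With _mm_maskz_max_round_sd the max_round family above is complete; unlike the arithmetic wrappers it goes through constify_imm4_sae!, so the immediate only controls exception suppression. A usage sketch under the same assumptions as the earlier one (values illustrative):

#[target_feature(enable = "avx512f")]
unsafe fn max_round_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set_sd(2.0);
    let b = _mm_set_sd(7.5);
    // Low lane: max(2.0, 7.5) = 7.5; the upper lane is copied from `a`.
    let r = _mm_max_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
    // Merge-masking: mask bit 0 is clear, so the low lane is taken from `src` instead.
    let src = _mm_set_sd(-1.0);
    let m = _mm_mask_max_round_sd(src, 0, a, b, _MM_FROUND_CUR_DIRECTION);
    let _ = (r, m);
}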
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[cfg_attr(test, assert_instr(vminss, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_div_round_ss( +pub unsafe fn _mm_mask_min_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, + sae: i32, ) -> __m128 { macro_rules! call { ($imm4:expr) => { - vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[cfg_attr(test, assert_instr(vminss, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vdivss( + vminss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19670,27 +20305,21 @@ pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vminsd, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vdivsd( + vminsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19699,61 +20328,44 @@ pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vminsd, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_div_round_sd( +pub unsafe fn _mm_mask_min_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, + sae: i32, ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vminsd, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_div_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, - rounding: i32, -) -> __m128d { +pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vdivsd( + vminsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19762,21 +20374,27 @@ pub unsafe fn _mm_maskz_div_round_sd( ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { +pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vmaxss( + vsqrtss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19785,44 +20403,56 @@ pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { ) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
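The min_round wrappers just above mirror the max ones exactly, before the conversion to the sqrt intrinsics begins below. A brief zero-masking sketch, under the same assumptions, with _MM_FROUND_NO_EXC passed as the sae value to suppress exceptions:

#[target_feature(enable = "avx512f")]
unsafe fn min_round_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(4.0);
    let b = _mm_set_ss(-0.5);
    // Mask bit 0 is set, so the low lane holds min(4.0, -0.5) = -0.5.
    let r = _mm_maskz_min_round_ss(0b1, a, b, _MM_FROUND_NO_EXC);
    let _ = r;
}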
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_max_round_ss( +pub unsafe fn _mm_mask_sqrt_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - sae: i32, + rounding: i32, ) -> __m128 { macro_rules! call { ($imm4:expr) => { - vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { +pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! 
call { ($imm4:expr) => { - vmaxss( + vsqrtss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19831,21 +20461,27 @@ pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32 ) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { +pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vmaxsd( + vsqrtsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19854,44 +20490,61 @@ pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { ) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
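The sqrt_round wrappers take a full rounding-mode immediate again (constify_imm4_round!), and the value being rooted comes from b while the untouched lanes come from a. A short sketch, same assumptions, with illustrative values:

#[target_feature(enable = "avx512f")]
unsafe fn sqrt_round_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(0.0);
    let b = _mm_set_ss(2.0);
    // Low lane: sqrt(2.0) rounded toward +infinity; the upper three lanes are copied from `a`.
    let r = _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    let _ = r;
}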
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_max_round_sd( +pub unsafe fn _mm_mask_sqrt_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - sae: i32, + rounding: i32, ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { +pub unsafe fn _mm_maskz_sqrt_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vmaxsd( + vsqrtsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19900,21 +20553,21 @@ pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i ) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { +pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vminss( + vgetexpss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19923,18 +20576,19 @@ pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { ) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_min_round_ss( +pub unsafe fn _mm_mask_getexp_round_ss( src: __m128, k: __mmask8, a: __m128, @@ -19943,24 +20597,25 @@ pub unsafe fn _mm_mask_min_round_ss( ) -> __m128 { macro_rules! 
call { ($imm4:expr) => { - vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vgetexpss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { +pub unsafe fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vminss( + vgetexpss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19969,21 +20624,22 @@ pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32 ) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst. +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { +pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vminsd( + vgetexpsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19992,18 +20648,19 @@ pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { ) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_min_round_sd( +pub unsafe fn _mm_mask_getexp_round_sd( src: __m128d, k: __mmask8, a: __m128d, @@ -20012,24 +20669,25 @@ pub unsafe fn _mm_mask_min_round_sd( ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vgetexpsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { +pub unsafe fn _mm_maskz_getexp_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vminsd( + vgetexpsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -20038,186 +20696,254 @@ pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i ) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_ss&expand=2892) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(2, 3, 4)] +pub unsafe fn _mm_getmant_round_ss( + a: __m128, + b: __m128, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128 { macro_rules! call { - ($imm4:expr) => { - vsqrtss( + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantss( a.as_f32x4(), b.as_f32x4(), + $imm2 << 2 | $imm4_1, _mm_setzero_ps().as_f32x4(), 0b1, - $imm4, + $imm4_2, ) }; } - transmute(constify_imm4_round!(rounding, call)) + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
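As the documentation above notes, getexp effectively computes floor(log2(x)) of the low element of b and returns it as a floating-point value. A sketch under the same assumptions (values illustrative):

#[target_feature(enable = "avx512f")]
unsafe fn getexp_round_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(0.0);
    let b = _mm_set_ss(10.0);
    // floor(log2(10.0)) = 3.0 lands in the low lane; the upper lanes come from `a`.
    let r = _mm_getexp_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
    let _ = r;
}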
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_ss&expand=2893) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sqrt_round_ss( +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(4, 5, 6)] +pub unsafe fn _mm_mask_getmant_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, ) -> __m128 { macro_rules! call { - ($imm4:expr) => { - vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantss( + a.as_f32x4(), + b.as_f32x4(), + $imm2 << 2 | $imm4_1, + src.as_f32x4(), + k, + $imm4_2, + ) }; } - transmute(constify_imm4_round!(rounding, call)) + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_ss&expand=2894) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(3, 4, 5)] +pub unsafe fn _mm_maskz_getmant_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128 { macro_rules! call { - ($imm4:expr) => { - vsqrtss( + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantss( a.as_f32x4(), b.as_f32x4(), + $imm2 << 2 | $imm4_1, _mm_setzero_ps().as_f32x4(), k, - $imm4, + $imm4_2, ) }; } - transmute(constify_imm4_round!(rounding, call)) + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
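The getmant wrappers fold the norm and sign selectors into a single immediate via the $imm2 << 2 | $imm4_1 packing in the call! macros above. A plain-Rust sketch of that packing follows; the numeric selector values are assumptions based on Intel's documented VGETMANT encoding (norm in imm8[1:0], sign in imm8[3:2]) and are not taken from this patch:

// Assumed selector encodings (hypothetical constants for illustration).
const MANT_NORM_P75_1P5: u8 = 0b11; // interval [0.75, 1.5)
const MANT_SIGN_ZERO: u8 = 0b01; // force the sign to 0

// Mirrors the `$imm2 << 2 | $imm4_1` packing used by the macros above.
fn getmant_imm8(norm: u8, sign: u8) -> u8 {
    (sign << 2) | norm
}
// e.g. getmant_imm8(MANT_NORM_P75_1P5, MANT_SIGN_ZERO) == 0b0111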
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(2, 3, 4)] +pub unsafe fn _mm_getmant_round_sd( + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128d { macro_rules! call { - ($imm4:expr) => { - vsqrtsd( + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( a.as_f64x2(), b.as_f64x2(), + $imm2 << 2 | $imm4_1, _mm_setzero_pd().as_f64x2(), 0b1, - $imm4, + $imm4_2, ) - }; - } - transmute(constify_imm4_round!(rounding, call)) -} - -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE + }; + } + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sqrt_round_sd( +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(4, 5, 6)] +pub unsafe fn _mm_mask_getmant_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, ) -> __m128d { macro_rules! call { - ($imm4:expr) => { - vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + src.as_f64x2(), + k, + $imm4_2, + ) }; } - transmute(constify_imm4_round!(rounding, call)) + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sqrt_round_sd( +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(3, 4, 5)] +pub unsafe fn _mm_maskz_getmant_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, ) -> __m128d { macro_rules! 
call { - ($imm4:expr) => { - vsqrtsd( + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( a.as_f64x2(), b.as_f64x2(), + $imm2 << 2 | $imm4_1, _mm_setzero_pd().as_f64x2(), k, - $imm4, + $imm4_2, ) }; } - transmute(constify_imm4_round!(rounding, call)) + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) } /// Equal @@ -20851,6 +21577,14 @@ extern "C" { fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.getexp.ss"] + fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.getexp.sd"] + fn vgetexpsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.getmant.ss"] + fn vgetmantss(a: f32x4, b: f32x4, mantissas: i32, src: f32x4, m: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.getmant.sd"] + fn vgetmantsd(a: f64x2, b: f64x2, mantissas: i32, src: f64x2, m: u8, sae: i32) -> f64x2; #[link_name = "llvm.x86.avx512.rsqrt14.ss"] fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; @@ -30326,6 +31060,138 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_getexp_ss(a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_getexp_ss(a, 0, a, b); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + let r = _mm_mask_getexp_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_getexp_ss(0, a, b); + let e = _mm_set_ps(2., 2., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getexp_ss(0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_getexp_sd(a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_getexp_sd(a, 0, a, b); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + let r = _mm_mask_getexp_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_getexp_sd(0, a, b); + let e = _mm_set_pd(2., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getexp_sd(0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_getmant_ss(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_mask_getmant_ss(a, 0, a, b, 
_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_ps(20., 20., 20., 20.); + assert_eq_m128(r, e); + let r = _mm_mask_getmant_ss(a, 0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_maskz_getmant_ss(0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_ps(20., 20., 20., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getmant_ss(0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_getmant_sd(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_mask_getmant_sd(a, 0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_pd(20., 20.); + assert_eq_m128d(r, e); + let r = _mm_mask_getmant_sd(a, 0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_maskz_getmant_sd(0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_pd(20., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getmant_sd(0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); @@ -30857,4 +31723,208 @@ mod tests { let e = _mm_set_pd(1., 2.); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_getexp_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_getexp_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + let r = _mm_mask_getexp_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_getexp_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2., 2., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getexp_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_getexp_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_getexp_round_sd(a, 0, a, b, 
_MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + let r = _mm_mask_getexp_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_getexp_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getexp_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_getmant_round_ss( + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_mask_getmant_round_ss( + a, + 0, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_ps(20., 20., 20., 20.); + assert_eq_m128(r, e); + let r = _mm_mask_getmant_round_ss( + a, + 0b11111111, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_maskz_getmant_round_ss( + 0, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_ps(20., 20., 20., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getmant_round_ss( + 0b11111111, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_getmant_round_sd( + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_mask_getmant_round_sd( + a, + 0, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_pd(20., 20.); + assert_eq_m128d(r, e); + let r = _mm_mask_getmant_round_sd( + a, + 0b11111111, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_maskz_getmant_round_sd( + 0, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_pd(20., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getmant_round_sd( + 0b11111111, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } } From 16d6dd60abb7f542b5f5d00a6541b47ee2268d10 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 20:24:22 +0000 Subject: [PATCH 
22/25] roundscale: ss,sd; roundscale_round: ss,sd; scalef: ss,sd; scalef_round: ss,sd; --- crates/core_arch/avx512f.md | 48 +- crates/core_arch/src/x86/avx512f.rs | 1107 +++++++++++++++++++++++++-- 2 files changed, 1064 insertions(+), 91 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 5b2aaa1c21..052395b275 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1300,16 +1300,16 @@ * [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236) * [x] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236) * [x] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236) - * [ ] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236) - * [ ] [`_mm_mask_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_ss&expand=5236) - * [ ] [`_mm_mask_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sd&expand=5236) - * [ ] [`_mm_mask_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ss&expand=5236) + * [x] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236) + * [x] [`_mm_mask_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_ss&expand=5236) + * [x] [`_mm_mask_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sd&expand=5236) + * [x] [`_mm_mask_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ss&expand=5236) * [x] [`_mm_mask_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_sd&expand=5236) * [x] [`_mm_mask_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ss&expand=5236) - * [ ] [`_mm_mask_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sd&expand=5236) - * [ ] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236) - * [ ] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236) - * [ ] [`_mm_mask_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ss&expand=5236) + * [x] [`_mm_mask_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sd&expand=5236) + * [x] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236) + * [x] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236) + * [x] [`_mm_mask_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ss&expand=5236) * [x] [`_mm_mask_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sd&expand=5236) * [x] 
[`_mm_mask_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_ss&expand=5236) * [x] [`_mm_mask_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sd&expand=5236) @@ -1378,16 +1378,16 @@ * [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236) * [x] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236) * [x] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236) - * [ ] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236) - * [ ] [`_mm_maskz_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_ss&expand=5236) - * [ ] [`_mm_maskz_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sd&expand=5236) - * [ ] [`_mm_maskz_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ss&expand=5236) + * [x] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236) + * [x] [`_mm_maskz_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_ss&expand=5236) + * [x] [`_mm_maskz_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sd&expand=5236) + * [x] [`_mm_maskz_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ss&expand=5236) * [x] [`_mm_maskz_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_sd&expand=5236) * [x] [`_mm_maskz_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ss&expand=5236) - * [ ] [`_mm_maskz_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sd&expand=5236) - * [ ] [`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236) - * [ ] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236) - * [ ] [`_mm_maskz_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ss&expand=5236) + * [x] [`_mm_maskz_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sd&expand=5236) + * [x] [`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236) + * [x] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236) + * [x] [`_mm_maskz_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ss&expand=5236) * [x] [`_mm_maskz_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sd&expand=5236) * [x] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236) * [x] 
[`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236) @@ -1404,16 +1404,16 @@ * [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236) * [x] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236) * [x] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236) - * [ ] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236) - * [ ] [`_mm_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_ss&expand=5236) - * [ ] [`_mm_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sd&expand=5236) - * [ ] [`_mm_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ss&expand=5236) + * [x] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236) + * [x] [`_mm_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_ss&expand=5236) + * [x] [`_mm_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sd&expand=5236) + * [x] [`_mm_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ss&expand=5236) * [x] [`_mm_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_sd&expand=5236) * [x] [`_mm_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ss&expand=5236) - * [ ] [`_mm_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sd&expand=5236) - * [ ] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236) - * [ ] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236) - * [ ] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236) + * [x] [`_mm_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sd&expand=5236) + * [x] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236) + * [x] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236) + * [x] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236) * [x] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236) * [x] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236) * [x] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 24b6245e17..a7e298c550 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -19385,6 +19385,294 @@ pub unsafe fn _mm_maskz_getmant_sd( transmute(r) } +/// Round the lower single-precision (32-bit) floating-point element in b 
to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_ss&expand=4802) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 255))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_roundscale_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { + macro_rules! call { + ($imm8:expr) => { + vrndscaless( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b11111111, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_ss&expand=4800) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_roundscale_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + imm8: i32, +) -> __m128 { + macro_rules! call { + ($imm8:expr) => { + vrndscaless( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_ss&expand=4801) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_roundscale_ss(k: __mmask8, a: __m128, b: __m128, imm8: i32) -> __m128 { + macro_rules! 
call { + ($imm8:expr) => { + vrndscaless( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_sd&expand=4799) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 255))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_roundscale_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { + macro_rules! call { + ($imm8:expr) => { + vrndscalesd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b11111111, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_sd&expand=4797) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_roundscale_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + imm8: i32, +) -> __m128d { + macro_rules! call { + ($imm8:expr) => { + vrndscalesd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
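A brief usage sketch of the scalar roundscale intrinsics added above (not part of the patch; the helper name is made up and an AVX-512F CPU is assumed). Per the VRNDSCALE encoding, imm8[2:0] selects one of the rounding modes listed in the doc comments and imm8[7:4] the number of fraction bits M to keep, so the low lane of b is rounded to a multiple of 2^-M:

#[target_feature(enable = "avx512f")]
unsafe fn roundscale_sd_sketch() {
    // Hypothetical example; the upper lane of each result is copied from a.
    use core::arch::x86_64::*;
    let a = _mm_set1_pd(10.);
    let b = _mm_set1_pd(2.7);
    // M = 0, round to nearest integer: 2.7 -> 3.0.
    assert_eq!(_mm_cvtsd_f64(_mm_roundscale_sd(a, b, 0)), 3.0);
    // M = 1 (imm8 = 0x10), round to the nearest multiple of 0.5: 2.7 -> 2.5.
    assert_eq!(_mm_cvtsd_f64(_mm_roundscale_sd(a, b, 0x10)), 2.5);
    // M = 0 with _MM_FROUND_TO_ZERO (imm8 = 3), truncate: 2.7 -> 2.0.
    assert_eq!(_mm_cvtsd_f64(_mm_roundscale_sd(a, b, 3)), 2.0);
}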
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_sd&expand=4798) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_roundscale_sd(k: __mmask8, a: __m128d, b: __m128d, imm8: i32) -> __m128d { + macro_rules! call { + ($imm8:expr) => { + vrndscalesd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_ss&expand=4901) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_ss&expand=4899) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_ss&expand=4900) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
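A short sketch of the scalar scalef intrinsics added above (not part of the patch; the helper name is made up and an AVX-512F CPU is assumed). The low lane becomes a[0] * 2^floor(b[0]) while the upper lanes are taken from a:

#[target_feature(enable = "avx512f")]
unsafe fn scalef_ss_sketch() {
    // Hypothetical example.
    use core::arch::x86_64::*;
    let a = _mm_set_ps(7., 7., 7., 3.);
    let b = _mm_set1_ps(2.5);
    // Low lane: 3.0 * 2^floor(2.5) = 3.0 * 4.0 = 12.0; upper lanes stay 7.0.
    assert_eq!(_mm_cvtss_f32(_mm_scalef_ss(a, b)), 12.0);
    // Zero-masked variant: with mask bit 0 clear the low lane is zeroed instead.
    assert_eq!(_mm_cvtss_f32(_mm_maskz_scalef_ss(0, a, b)), 0.0);
}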
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_sd&expand=4898) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_sd&expand=4896) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_sd&expand=4897) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -20835,114 +21123,504 @@ pub unsafe fn _mm_maskz_getmant_round_ss( /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(2, 3, 4)] +pub unsafe fn _mm_getmant_round_sd( + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4_2, + ) + }; + } + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(4, 5, 6)] +pub unsafe fn _mm_mask_getmant_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + src.as_f64x2(), + k, + $imm4_2, + ) + }; + } + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(3, 4, 5)] +pub unsafe fn _mm_maskz_getmant_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + _mm_setzero_pd().as_f64x2(), + k, + $imm4_2, + ) + }; + } + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_ss&expand=4796)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(2, 3)]
+pub unsafe fn _mm_roundscale_round_ss(a: __m128, b: __m128, imm8: i32, sae: i32) -> __m128 {
+    macro_rules! call {
+        ($imm8:expr, $imm4:expr) => {
+            vrndscaless(
+                a.as_f32x4(),
+                b.as_f32x4(),
+                _mm_setzero_ps().as_f32x4(),
+                0b11111111,
+                $imm8,
+                $imm4,
+            )
+        };
+    }
+    let r = constify_imm8_roundscale!(imm8, sae, call);
+    transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_ss&expand=4794)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(4, 5)]
+pub unsafe fn _mm_mask_roundscale_round_ss(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+    imm8: i32,
+    sae: i32,
+) -> __m128 {
+    macro_rules! call {
+        ($imm8:expr, $imm4:expr) => {
+            vrndscaless(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm8, $imm4)
+        };
+    }
+    let r = constify_imm8_roundscale!(imm8, sae, call);
+    transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
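A short sketch of the *_round_* roundscale form added here (not part of the patch; the helper name is made up and an AVX-512F CPU is assumed). It takes the same imm8 plus an sae argument, where _MM_FROUND_NO_EXC suppresses floating-point exceptions and _MM_FROUND_CUR_DIRECTION leaves the MXCSR behaviour unchanged:

#[target_feature(enable = "avx512f")]
unsafe fn roundscale_round_ss_sketch() {
    // Hypothetical example.
    use core::arch::x86_64::*;
    let a = _mm_set1_ps(8.);
    let b = _mm_set1_ps(1.1);
    // Round the low lane of b to the nearest integer (imm8 = 0) with exceptions
    // suppressed; the upper lanes are copied from a.
    let r = _mm_roundscale_round_ss(a, b, 0, _MM_FROUND_NO_EXC);
    assert_eq!(_mm_cvtss_f32(r), 1.0);
}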
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_ss&expand=4795) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm_maskz_roundscale_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + imm8: i32, + sae: i32, +) -> __m128 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscaless( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm8, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_sd&expand=4793) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(2, 3)] +pub unsafe fn _mm_roundscale_round_sd(a: __m128d, b: __m128d, imm8: i32, sae: i32) -> __m128d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscalesd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b11111111, + $imm8, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_sd&expand=4791) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm_mask_roundscale_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + imm8: i32, + sae: i32, +) -> __m128d { + macro_rules! 
call { + ($imm8:expr, $imm4:expr) => { + vrndscalesd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm8, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_sd&expand=4792) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm_maskz_roundscale_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + imm8: i32, + sae: i32, +) -> __m128d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscalesd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm8, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_ss&expand=4895) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vscalefss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b11111111, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_ss&expand=4893) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_scalef_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vscalefss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_ss&expand=4894) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_scalef_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vscalefss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
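For completeness, a sketch of the explicit-rounding scalef form (not part of the patch; the helper name is made up and an AVX-512F CPU is assumed). The product below is exact, so the rounding argument only selects how an inexact low-lane result would be rounded:

#[target_feature(enable = "avx512f")]
unsafe fn scalef_round_ss_sketch() {
    // Hypothetical example.
    use core::arch::x86_64::*;
    let a = _mm_set1_ps(1.5);
    let b = _mm_set1_ps(3.);
    // 1.5 * 2^3 = 12.0 in the low lane, rounded to nearest with exceptions suppressed.
    let r = _mm_scalef_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    assert_eq!(_mm_cvtss_f32(r), 12.0);
    // Write-masked variant: with mask bit 0 clear the low lane keeps src (99.0).
    let src = _mm_set1_ps(99.);
    let r = _mm_mask_scalef_round_ss(src, 0, a, b, _MM_FROUND_CUR_DIRECTION);
    assert_eq!(_mm_cvtss_f32(r), 99.0);
}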
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_sd&expand=4892) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] -#[rustc_args_required_const(2, 3, 4)] -pub unsafe fn _mm_getmant_round_sd( - a: __m128d, - b: __m128d, - norm: _MM_MANTISSA_NORM_ENUM, - sign: _MM_MANTISSA_SIGN_ENUM, - sae: i32, -) -> __m128d { +#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { macro_rules! call { - ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { - vgetmantsd( + ($imm4:expr) => { + vscalefsd( a.as_f64x2(), b.as_f64x2(), - $imm2 << 2 | $imm4_1, _mm_setzero_pd().as_f64x2(), - 0b1, - $imm4_2, + 0b11111111, + $imm4, ) }; } - let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + let r = constify_imm4_round!(rounding, call); transmute(r) } -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// The sign is determined by sc which can take the following values: -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_sd&expand=4890) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] -#[rustc_args_required_const(4, 5, 6)] -pub unsafe fn _mm_mask_getmant_round_sd( +#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_scalef_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - norm: _MM_MANTISSA_NORM_ENUM, - sign: _MM_MANTISSA_SIGN_ENUM, - sae: i32, + rounding: i32, ) -> __m128d { macro_rules! call { - ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { - vgetmantsd( - a.as_f64x2(), - b.as_f64x2(), - $imm2 << 2 | $imm4_1, - src.as_f64x2(), - k, - $imm4_2, - ) + ($imm4:expr) => { + vscalefsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } - let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + let r = constify_imm4_round!(rounding, call); transmute(r) } -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// The sign is determined by sc which can take the following values: -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
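The scalef family computes a * 2^floor(b) on the low lane and copies the upper lane(s) from a. A minimal sketch of the unmasked double-precision variant (not part of the patch; demo name hypothetical, nightly toolchain with avx512f support assumed):

    use core::arch::x86_64::*;

    // Hypothetical demo, not part of the patch: vscalefsd computes a * 2^floor(b)
    // on the low lane and copies the upper lane from a.
    #[target_feature(enable = "avx512f")]
    unsafe fn scalef_round_sd_demo() {
        let a = _mm_set_pd(10.0, 1.5);
        let b = _mm_set_pd(99.0, 3.0); // only the low lane of b participates
        let r = _mm_scalef_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        assert_eq!(_mm_cvtsd_f64(r), 12.0); // 1.5 * 2^3; upper lane stays 10.0
    }
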
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_sd&expand=4891) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] -#[rustc_args_required_const(3, 4, 5)] -pub unsafe fn _mm_maskz_getmant_round_sd( +#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_scalef_round_sd( k: __mmask8, a: __m128d, b: __m128d, - norm: _MM_MANTISSA_NORM_ENUM, - sign: _MM_MANTISSA_SIGN_ENUM, - sae: i32, + rounding: i32, ) -> __m128d { macro_rules! call { - ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { - vgetmantsd( + ($imm4:expr) => { + vscalefsd( a.as_f64x2(), b.as_f64x2(), - $imm2 << 2 | $imm4_1, _mm_setzero_pd().as_f64x2(), k, - $imm4_2, + $imm4, ) }; } - let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + let r = constify_imm4_round!(rounding, call); transmute(r) } @@ -21594,6 +22272,15 @@ extern "C" { fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; #[link_name = "llvm.x86.avx512.rcp14.sd"] fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.rndscale.ss"] + fn vrndscaless(a: f32x4, b: f32x4, src: f32x4, mask: u8, imm8: i32, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.rndscale.sd"] + fn vrndscalesd(a: f64x2, b: f64x2, src: f64x2, mask: u8, imm8: i32, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.scalef.ss"] + fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.scalef.sd"] + fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; } #[cfg(test)] @@ -31192,6 +31879,138 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_roundscale_ss(a, b, 0); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_mask_roundscale_ss(a, 0, a, b, 0); + let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2); + assert_eq_m128(r, e); + let r = _mm_mask_roundscale_ss(a, 0b11111111, a, b, 0); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_maskz_roundscale_ss(0, a, b, 0); + let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_roundscale_ss(0b11111111, a, b, 0); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_sd() { + let a = _mm_set1_pd(2.2); 
+ let b = _mm_set1_pd(1.1); + let r = _mm_roundscale_sd(a, b, 0); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_mask_roundscale_sd(a, 0, a, b, 0); + let e = _mm_set_pd(2.2, 2.2); + assert_eq_m128d(r, e); + let r = _mm_mask_roundscale_sd(a, 0b11111111, a, b, 0); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_maskz_roundscale_sd(0, a, b, 0); + let e = _mm_set_pd(2.2, 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_roundscale_sd(0b11111111, a, b, 0); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_scalef_ss(a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_scalef_ss(a, 0, a, b); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + let r = _mm_mask_scalef_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_scalef_ss(0, a, b); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_scalef_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_scalef_sd(a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_scalef_sd(a, 0, a, b); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_scalef_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_scalef_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_scalef_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); @@ -31927,4 +32746,158 @@ mod tests { let e = _mm_set_pd(20., 1.25); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_roundscale_round_ss(a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_mask_roundscale_round_ss(a, 0, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2); + assert_eq_m128(r, e); + let r = _mm_mask_roundscale_round_ss(a, 0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + 
assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_maskz_roundscale_round_ss(0, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_roundscale_round_ss(0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_roundscale_round_sd(a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_mask_roundscale_round_sd(a, 0, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2.2, 2.2); + assert_eq_m128d(r, e); + let r = _mm_mask_roundscale_round_sd(a, 0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_maskz_roundscale_round_sd(0, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2.2, 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_roundscale_round_sd(0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_scalef_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_scalef_round_ss(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + let r = _mm_mask_scalef_round_ss( + a, + 0b11111111, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_scalef_round_ss(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_scalef_round_ss( + 0b11111111, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_scalef_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_scalef_round_sd(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_scalef_round_sd( + a, + 0b11111111, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 8.); + 
assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_scalef_round_sd(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_scalef_round_sd( + 0b11111111, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } } From 8052b6430474af564f9ee40e25f58ca9384e8614 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 23:58:17 +0000 Subject: [PATCH 23/25] mask_move: ss,sd --- crates/core_arch/avx512f.md | 8 +- crates/core_arch/src/x86/avx512f.rs | 112 ++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 052395b275..ae452723b0 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1292,8 +1292,8 @@ * [x] [`_mm_mask_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_ss&expand=5236) * [x] [`_mm_mask_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sd&expand=5236) * [x] [`_mm_mask_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ss&expand=5236) - * [ ] [`_mm_mask_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sd&expand=5236) - * [ ] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236) + * [x] [`_mm_mask_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sd&expand=5236) + * [x] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236) * [x] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236) * [x] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236) * [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236) @@ -1370,8 +1370,8 @@ * [x] [`_mm_maskz_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_ss&expand=5236) * [x] [`_mm_maskz_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sd&expand=5236) * [x] [`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236) - * [ ] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236) - * [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236) + * [x] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236) + * [x] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236) * [x] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236) * [x] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236) * [x] 
[`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a7e298c550..270d8c5859 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18417,6 +18417,68 @@ pub unsafe fn _mm512_set_pd( _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) } +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_ss&expand=3832) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovss))] +pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut mov: f32 = extractsrc; + if (k & 0b00000001) != 0 { + mov = simd_extract(b, 0); + } + let r = simd_insert(a, 0, mov); + transmute(r) +} + +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_ss&expand=3833) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovss))] +pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut mov: f32 = 0.; + if (k & 0b00000001) != 0 { + mov = simd_extract(b, 0); + } + let r = simd_insert(a, 0, mov); + transmute(r) +} + +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_sd&expand=3829) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovsd))] +pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut mov: f64 = extractsrc; + if (k & 0b00000001) != 0 { + mov = simd_extract(b, 0); + } + let r = simd_insert(a, 0, mov); + transmute(r) +} + +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
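The mask_move/maskz_move intrinsics added here only consult mask bit 0: it selects whether the low lane comes from b, from src (writemask) or from zero (zeromask), and the upper lane is always copied from a. A hedged sketch mirroring the new tests (not part of the patch; demo name hypothetical):

    use core::arch::x86_64::*;

    // Hypothetical demo, not part of the patch: only mask bit 0 matters.
    #[target_feature(enable = "avx512f")]
    unsafe fn mask_move_sd_demo() {
        let src = _mm_set_pd(10.0, 11.0);
        let a = _mm_set_pd(1.0, 2.0);
        let b = _mm_set_pd(3.0, 4.0);
        assert_eq!(_mm_cvtsd_f64(_mm_mask_move_sd(src, 0, a, b)), 11.0); // low lane from src
        assert_eq!(_mm_cvtsd_f64(_mm_mask_move_sd(src, 1, a, b)), 4.0);  // low lane from b
        assert_eq!(_mm_cvtsd_f64(_mm_maskz_move_sd(0, a, b)), 0.0);      // low lane zeroed
    }
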
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_sd&expand=3830) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovsd))] +pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut mov: f64 = 0.; + if (k & 0b00000001) != 0 { + mov = simd_extract(b, 0); + } + let r = simd_insert(a, 0, mov); + transmute(r) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159) @@ -31265,6 +31327,56 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_move_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_move_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_move_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 40.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_move_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_move_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_move_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 40.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_move_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_move_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_move_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_move_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_move_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_move_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_mask_add_ss() { let src = _mm_set_ps(10., 11., 100., 110.); From 577cbc4169fd5d136a05095f7efed0022d2a2d1e Mon Sep 17 00:00:00 2001 From: jironglin Date: Fri, 16 Oct 2020 17:13:37 +0000 Subject: [PATCH 24/25] mask_fmadd: ss,sd; fmadd_round: ss,sd; --- crates/core_arch/avx512f.md | 28 +- crates/core_arch/src/x86/avx512f.rs | 587 ++++++++++++++++++++++++++++ 2 files changed, 601 insertions(+), 14 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index ae452723b0..75b8c725df 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1206,8 +1206,8 @@ * [ ] [`_mm_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_round_ss&expand=5236) * [ ] [`_mm_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_sd&expand=5236) * [ ] [`_mm_fixupimm_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_ss&expand=5236) - * [ ] 
[`_mm_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sd&expand=5236) - * [ ] [`_mm_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_ss&expand=5236) + * [x] [`_mm_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sd&expand=5236) + * [x] [`_mm_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_ss&expand=5236) * [ ] [`_mm_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sd&expand=5236) * [ ] [`_mm_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_ss&expand=5236) * [ ] [`_mm_fnmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sd&expand=5236) @@ -1222,10 +1222,10 @@ * [x] [`_mm_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_ss&expand=5236) * [x] [`_mm_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sd&expand=5236) * [x] [`_mm_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ss&expand=5236) - * [ ] [`_mm_mask3_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sd&expand=5236) - * [ ] [`_mm_mask3_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_ss&expand=5236) - * [ ] [`_mm_mask3_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sd&expand=5236) - * [ ] [`_mm_mask3_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ss&expand=5236) + * [x] [`_mm_mask3_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sd&expand=5236) + * [x] [`_mm_mask3_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_ss&expand=5236) + * [x] [`_mm_mask3_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sd&expand=5236) + * [x] [`_mm_mask3_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ss&expand=5236) * [ ] [`_mm_mask3_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sd&expand=5236) * [ ] [`_mm_mask3_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_ss&expand=5236) * [ ] [`_mm_mask3_fmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sd&expand=5236) @@ -1258,10 +1258,10 @@ * [ ] [`_mm_mask_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_ss&expand=5236) * [ ] [`_mm_mask_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_sd&expand=5236) * [ ] [`_mm_mask_fixupimm_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_ss&expand=5236) - * [ ] [`_mm_mask_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sd&expand=5236) - * [ ] [`_mm_mask_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_ss&expand=5236) - * [ ] 
[`_mm_mask_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sd&expand=5236) - * [ ] [`_mm_mask_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ss&expand=5236) + * [x] [`_mm_mask_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sd&expand=5236) + * [x] [`_mm_mask_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_ss&expand=5236) + * [x] [`_mm_mask_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sd&expand=5236) + * [x] [`_mm_mask_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ss&expand=5236) * [ ] [`_mm_mask_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sd&expand=5236) * [ ] [`_mm_mask_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_ss&expand=5236) * [ ] [`_mm_mask_fmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sd&expand=5236) @@ -1336,10 +1336,10 @@ * [ ] [`_mm_maskz_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_ss&expand=5236) * [ ] [`_mm_maskz_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_sd&expand=5236) * [ ] [`_mm_maskz_fixupimm_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_ss&expand=5236) - * [ ] [`_mm_maskz_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sd&expand=5236) - * [ ] [`_mm_maskz_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_ss&expand=5236) - * [ ] [`_mm_maskz_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sd&expand=5236) - * [ ] [`_mm_maskz_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ss&expand=5236) + * [x] [`_mm_maskz_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sd&expand=5236) + * [x] [`_mm_maskz_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_ss&expand=5236) + * [x] [`_mm_maskz_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sd&expand=5236) + * [x] [`_mm_maskz_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ss&expand=5236) * [ ] [`_mm_maskz_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sd&expand=5236) * [ ] [`_mm_maskz_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_ss&expand=5236) * [ ] [`_mm_maskz_fmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 270d8c5859..d231c42b19 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -19735,6 +19735,110 @@ pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128 )) } +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate 
result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_ss&expand=2582) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + let mut fmadd: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_ss&expand=2584) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_ss&expand=2583) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + let mut fmadd: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
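For the masked scalar fused multiply-add variants, the low lane becomes a*b + c when mask bit 0 is set; otherwise it is taken from a (mask), zeroed (maskz), or taken from c (mask3). A hedged sketch mirroring the new tests (not part of the patch; demo name hypothetical):

    use core::arch::x86_64::*;

    // Hypothetical demo, not part of the patch.
    #[target_feature(enable = "avx512f")]
    unsafe fn mask_fmadd_sd_demo() {
        let a = _mm_set1_pd(1.0);
        let b = _mm_set1_pd(2.0);
        let c = _mm_set1_pd(3.0);
        assert_eq!(_mm_cvtsd_f64(_mm_mask_fmadd_sd(a, 1, b, c)), 5.0);  // 1.0 * 2.0 + 3.0
        assert_eq!(_mm_cvtsd_f64(_mm_mask_fmadd_sd(a, 0, b, c)), 1.0);  // passthrough from a
        assert_eq!(_mm_cvtsd_f64(_mm_maskz_fmadd_sd(0, a, b, c)), 0.0); // zeroed
        assert_eq!(_mm_cvtsd_f64(_mm_mask3_fmadd_sd(a, b, c, 0)), 3.0); // passthrough from c
    }
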
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_sd&expand=2578) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + let mut fmadd: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_sd&expand=2580) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + let mut fmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_sd&expand=2579) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + let mut fmadd: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -21686,6 +21790,280 @@ pub unsafe fn _mm_maskz_scalef_round_sd( transmute(r) } +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_ss&expand=2573) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132ss(extracta, extractb, extractc, $imm4) + }; + } + let fmadd = constify_imm4_round!(rounding, call); + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_ss&expand=2574) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_fmadd_round_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128, + rounding: i32, +) -> __m128 { + let mut fmadd: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132ss(fmadd, extractb, extractc, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_ss&expand=2576) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_maskz_fmadd_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128, + rounding: i32, +) -> __m128 { + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132ss(extracta, extractb, extractc, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_ss&expand=2575) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask3_fmadd_round_ss( + a: __m128, + b: __m128, + c: __m128, + k: __mmask8, + rounding: i32, +) -> __m128 { + let mut fmadd: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132ss(extracta, extractb, fmadd, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_sd&expand=2569) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132sd(extracta, extractb, extractc, $imm4) + }; + } + let fmadd = constify_imm4_round!(rounding, call); + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_sd&expand=2570) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_fmadd_round_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128d, + rounding: i32, +) -> __m128d { + let mut fmadd: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132sd(fmadd, extractb, extractc, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
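The *_round variants take an extra rounding argument that must be a compile-time constant, either one of the _MM_FROUND_TO_* modes combined with _MM_FROUND_NO_EXC or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC. A hedged sketch (not part of the patch; demo name hypothetical):

    use core::arch::x86_64::*;

    // Hypothetical demo, not part of the patch.
    #[target_feature(enable = "avx512f")]
    unsafe fn fmadd_round_sd_demo() {
        let a = _mm_set1_pd(1.0);
        let b = _mm_set1_pd(2.0);
        let c = _mm_set1_pd(3.0);
        let r = _mm_fmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        assert_eq!(_mm_cvtsd_f64(r), 5.0); // low lane: 1.0 * 2.0 + 3.0
    }
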
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_sd&expand=2572) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_maskz_fmadd_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128d, + rounding: i32, +) -> __m128d { + let mut fmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132sd(extracta, extractb, extractc, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_Sd&expand=2571) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask3_fmadd_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, + k: __mmask8, + rounding: i32, +) -> __m128d { + let mut fmadd: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + macro_rules! 
call { + ($imm4:expr) => { + vfmadd132sd(extracta, extractb, fmadd, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + /// Equal pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; /// Less-than @@ -22343,6 +22721,11 @@ extern "C" { fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.scalef.sd"] fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.vfmadd.f32"] + fn vfmadd132ss(a: f32, b: f32, c: f32, rounding: i32) -> f32; + #[link_name = "llvm.x86.avx512.vfmadd.f64"] + fn vfmadd132sd(a: f64, b: f64, c: f64, rounding: i32) -> f64; } #[cfg(test)] @@ -32123,6 +32506,80 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fmadd_ss(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmadd_ss(a, 0b11111111, b, c); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmadd_ss(0, a, b, c); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmadd_ss(0b11111111, a, b, c); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmadd_ss(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmadd_ss(a, b, c, 0b11111111); + let e = _mm_set_ps(3., 3., 3., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmadd_sd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fmadd_sd(a, 0b11111111, b, c); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmadd_sd(0, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmadd_sd(0b11111111, a, b, c); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmadd_sd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmadd_sd(a, b, c, 0b11111111); + let e = _mm_set_pd(3., 5.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); @@ -33012,4 +33469,134 @@ mod tests { let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_fmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = 
_mm_set1_ps(3.); + let r = _mm_mask_fmadd_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m128(r, a); + let r = _mm_mask_fmadd_round_ss( + a, + 0b11111111, + b, + c, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmadd_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmadd_round_ss( + 0b11111111, + a, + b, + c, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmadd_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m128(r, c); + let r = _mm_mask3_fmadd_round_ss( + a, + b, + c, + 0b11111111, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(3., 3., 3., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_fmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmadd_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m128d(r, a); + let r = _mm_mask_fmadd_round_sd( + a, + 0b11111111, + b, + c, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmadd_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmadd_round_sd( + 0b11111111, + a, + b, + c, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmadd_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmadd_round_sd( + a, + b, + c, + 0b11111111, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(3., 5.); + assert_eq_m128d(r, e); + } } From 17659b26a6ea8debfec02536b42d05f4440d52e4 Mon Sep 17 00:00:00 2001 From: jironglin Date: Fri, 16 Oct 2020 17:42:48 +0000 Subject: [PATCH 25/25] fix duplicated comment, remove assert int2mask --- crates/core_arch/src/x86/avx512f.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index d231c42b19..7cea13c48c 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2632,8 +2632,6 @@ pub unsafe fn _mm512_maskz_ternarylogic_epi64( 
transmute(simd_select_bitmask(k, ternarylogic, zero)) } -/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. - /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. /// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -15910,7 +15908,6 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { - assert!(mask >= 0); let r: u16 = mask as u16; transmute(r) }
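
With the assert!(mask >= 0) removed, _mm512_int2mask is a plain truncation of the i32 to its low 16 mask bits, so negative inputs no longer panic. A hedged sketch of the resulting behaviour (not part of the patch; demo name hypothetical):

    use core::arch::x86_64::*;

    // Hypothetical demo, not part of the patch: -1 truncates to all 16 mask bits set.
    #[target_feature(enable = "avx512f")]
    unsafe fn int2mask_demo() {
        let m: __mmask16 = _mm512_int2mask(-1);
        assert_eq!(m, 0xFFFF);
    }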