From 972452ab43aed83e9812a3c5b15355c837ac1aa3 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Sun, 11 Oct 2020 13:21:20 +0000
Subject: [PATCH 01/25] roundscale_round: ps,pd
---
crates/core_arch/avx512f.md | 12 +-
crates/core_arch/src/x86/avx512f.rs | 226 +++++++++++
crates/core_arch/src/x86/macros.rs | 524 +++++++++++++++++++++++++
crates/core_arch/src/x86_64/avx512f.rs | 29 ++
4 files changed, 785 insertions(+), 6 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 110c31eeea..58be7ab632 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -609,8 +609,8 @@
* [x] [`_mm512_mask_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi64&expand=5236)
* [ ] [`_mm512_mask_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=5236)
* [ ] [`_mm512_mask_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=5236)
- * [ ] [`_mm512_mask_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=5236)
- * [ ] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236)
+ * [x] [`_mm512_mask_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=5236)
+ * [x] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236)
* [x] [`_mm512_mask_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=5236)
* [x] [`_mm512_mask_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_ps&expand=5236)
* [ ] [`_mm512_mask_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=5236)
@@ -881,8 +881,8 @@
* [x] [`_mm512_maskz_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi64&expand=5236)
* [ ] [`_mm512_maskz_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=5236)
* [ ] [`_mm512_maskz_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=5236)
- * [ ] [`_mm512_maskz_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=5236)
- * [ ] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236)
+ * [x] [`_mm512_maskz_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=5236)
+ * [x] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236)
* [x] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236)
* [x] [`_mm512_maskz_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_ps&expand=5236)
* [ ] [`_mm512_maskz_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=5236)
@@ -1019,8 +1019,8 @@
* [x] [`_mm512_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi64&expand=5236)
* [ ] [`_mm512_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=5236)
* [ ] [`_mm512_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=5236)
- * [ ] [`_mm512_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=5236)
- * [ ] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236)
+ * [x] [`_mm512_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=5236)
+ * [x] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236)
* [x] [`_mm512_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=5236)
* [x] [`_mm512_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_ps&expand=5236)
* [ ] [`_mm512_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 25b91f7d9e..90b707d93c 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -4845,6 +4845,196 @@ pub unsafe fn _mm512_maskz_getexp_round_pd(k: __mmask8, a: __m512d, sae: i32) ->
transmute(r)
}
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_round_ps&expand=4790)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(1, 2)]
+pub unsafe fn _mm512_roundscale_round_ps(a: __m512, imm8: i32, sae: i32) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscaleps(
+ a.as_f32x16(),
+ $imm8,
+ _mm512_setzero_ps().as_f32x16(),
+ 0b11111111_11111111,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
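+// Editorial sketch, not part of the original patch: one way a caller might compose the
+// imm8 operand, assuming the usual VRNDSCALE encoding in which bits [7:4] give the
+// number of fraction bits to keep and bits [2:0] give the rounding mode:
+//
+//     // Round every lane to the nearest multiple of 0.5 (keep one fraction bit),
+//     // suppressing floating-point exceptions via the sae parameter.
+//     let a = _mm512_set1_ps(1.26);
+//     let imm8 = (1 << 4) | _MM_FROUND_TO_NEAREST_INT;
+//     let r = _mm512_roundscale_round_ps(a, imm8, _MM_FROUND_NO_EXC);
+//     // every lane of r is expected to be 1.5
+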
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_round_ps&expand=4788)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(3, 4)]
+pub unsafe fn _mm512_mask_roundscale_round_ps(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ imm8: i32,
+ sae: i32,
+) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscaleps(a.as_f32x16(), $imm8, src.as_f32x16(), k, $imm4)
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_round_ps&expand=4789)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(2, 3)]
+pub unsafe fn _mm512_maskz_roundscale_round_ps(
+ k: __mmask16,
+ a: __m512,
+ imm8: i32,
+ sae: i32,
+) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscaleps(
+ a.as_f32x16(),
+ $imm8,
+ _mm512_setzero_ps().as_f32x16(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_round_pd&expand=4787)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(1, 2)]
+pub unsafe fn _mm512_roundscale_round_pd(a: __m512d, imm8: i32, sae: i32) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscalepd(
+ a.as_f64x8(),
+ $imm8,
+ _mm512_setzero_pd().as_f64x8(),
+ 0b11111111,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_round_pd&expand=4785)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(3, 4)]
+pub unsafe fn _mm512_mask_roundscale_round_pd(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ imm8: i32,
+ sae: i32,
+) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscalepd(a.as_f64x8(), $imm8, src.as_f64x8(), k, $imm4)
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_round_pd&expand=4786)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(2, 3)]
+pub unsafe fn _mm512_maskz_roundscale_round_pd(
+ k: __mmask8,
+ a: __m512d,
+ imm8: i32,
+ sae: i32,
+) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscalepd(
+ a.as_f64x8(),
+ $imm8,
+ _mm512_setzero_pd().as_f64x8(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
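+// Editorial sketch, not part of the original patch: the mask/maskz variants follow the
+// usual AVX-512 write-mask convention; with illustrative values:
+//
+//     let src = _mm512_set1_pd(9.0);
+//     let a = _mm512_set1_pd(1.1);
+//     // lanes whose mask bit is 0 keep the value from src ...
+//     let r = _mm512_mask_roundscale_round_pd(src, 0b00001111, a, 0, _MM_FROUND_CUR_DIRECTION);
+//     // ... whereas the maskz variant zeroes them instead
+//     let z = _mm512_maskz_roundscale_round_pd(0b00001111, a, 0, _MM_FROUND_CUR_DIRECTION);
+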
/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
/// _MM_MANT_NORM_1_2 // interval [1, 2)
@@ -17206,6 +17396,11 @@ extern "C" {
#[link_name = "llvm.x86.avx512.mask.getexp.pd.512"]
fn vgetexppd(a: f64x8, src: f64x8, m: u8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.rndscale.ps.512"]
+ fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"]
+ fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8;
+
#[link_name = "llvm.x86.avx512.mask.getmant.ps.512"]
fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16;
#[link_name = "llvm.x86.avx512.mask.getmant.pd.512"]
@@ -20630,6 +20825,37 @@ mod tests {
assert_eq_m512(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_roundscale_round_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_roundscale_round_ps(a, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_roundscale_round_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_mask_roundscale_round_ps(a, 0, a, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_ps(1.1);
+ assert_eq_m512(r, e);
+ let r =
+ _mm512_mask_roundscale_round_ps(a, 0b11111111_11111111, a, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_roundscale_round_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_maskz_roundscale_round_ps(0, a, 0, _MM_FROUND_CUR_DIRECTION);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r =
+ _mm512_maskz_roundscale_round_ps(0b11111111_11111111, a, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_round_ps() {
let a = _mm512_set1_ps(10.);
diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs
index 891286df4e..0f9d938eaa 100644
--- a/crates/core_arch/src/x86/macros.rs
+++ b/crates/core_arch/src/x86/macros.rs
@@ -733,6 +733,530 @@ macro_rules! constify_imm8_sae {
};
}
+// Constifies an 8-bit immediate plus an sae parameter, which must be
+// _MM_FROUND_CUR_DIRECTION (4) or _MM_FROUND_NO_EXC (8); this macro enforces that.
+#[allow(unused)]
+macro_rules! constify_imm8_roundscale {
+ ($imm8:expr, $imm4:expr, $expand:ident) => {
+ #[allow(overflowing_literals)]
+ match ($imm8 & 0b11111111, $imm4) {
+ (0, 4) => $expand!(0, 4),
+ (0, 8) => $expand!(0, 8),
+ (1, 4) => $expand!(1, 4),
+ (1, 8) => $expand!(1, 8),
+ (2, 4) => $expand!(2, 4),
+ (2, 8) => $expand!(2, 8),
+ (3, 4) => $expand!(3, 4),
+ (3, 8) => $expand!(3, 8),
+ (4, 4) => $expand!(4, 4),
+ (4, 8) => $expand!(4, 8),
+ (5, 4) => $expand!(5, 4),
+ (5, 8) => $expand!(5, 8),
+ (6, 4) => $expand!(6, 4),
+ (6, 8) => $expand!(6, 8),
+ (7, 4) => $expand!(7, 4),
+ (7, 8) => $expand!(7, 8),
+ (8, 4) => $expand!(8, 4),
+ (8, 8) => $expand!(8, 8),
+ (9, 4) => $expand!(9, 4),
+ (9, 8) => $expand!(9, 8),
+ (10, 4) => $expand!(10, 4),
+ (10, 8) => $expand!(10, 8),
+ (11, 4) => $expand!(11, 4),
+ (11, 8) => $expand!(11, 8),
+ (12, 4) => $expand!(12, 4),
+ (12, 8) => $expand!(12, 8),
+ (13, 4) => $expand!(13, 4),
+ (13, 8) => $expand!(13, 8),
+ (14, 4) => $expand!(14, 4),
+ (14, 8) => $expand!(14, 8),
+ (15, 4) => $expand!(15, 4),
+ (15, 8) => $expand!(15, 8),
+ (16, 4) => $expand!(16, 4),
+ (16, 8) => $expand!(16, 8),
+ (17, 4) => $expand!(17, 4),
+ (17, 8) => $expand!(17, 8),
+ (18, 4) => $expand!(18, 4),
+ (18, 8) => $expand!(18, 8),
+ (19, 4) => $expand!(19, 4),
+ (19, 8) => $expand!(19, 8),
+ (20, 4) => $expand!(20, 4),
+ (20, 8) => $expand!(20, 8),
+ (21, 4) => $expand!(21, 4),
+ (21, 8) => $expand!(21, 8),
+ (22, 4) => $expand!(22, 4),
+ (22, 8) => $expand!(22, 8),
+ (23, 4) => $expand!(23, 4),
+ (23, 8) => $expand!(23, 8),
+ (24, 4) => $expand!(24, 4),
+ (24, 8) => $expand!(24, 8),
+ (25, 4) => $expand!(25, 4),
+ (25, 8) => $expand!(25, 8),
+ (26, 4) => $expand!(26, 4),
+ (26, 8) => $expand!(26, 8),
+ (27, 4) => $expand!(27, 4),
+ (27, 8) => $expand!(27, 8),
+ (28, 4) => $expand!(28, 4),
+ (28, 8) => $expand!(28, 8),
+ (29, 4) => $expand!(29, 4),
+ (29, 8) => $expand!(29, 8),
+ (30, 4) => $expand!(30, 4),
+ (30, 8) => $expand!(30, 8),
+ (31, 4) => $expand!(31, 4),
+ (31, 8) => $expand!(31, 8),
+ (32, 4) => $expand!(32, 4),
+ (32, 8) => $expand!(32, 8),
+ (33, 4) => $expand!(33, 4),
+ (33, 8) => $expand!(33, 8),
+ (34, 4) => $expand!(34, 4),
+ (34, 8) => $expand!(34, 8),
+ (35, 4) => $expand!(35, 4),
+ (35, 8) => $expand!(35, 8),
+ (36, 4) => $expand!(36, 4),
+ (36, 8) => $expand!(36, 8),
+ (37, 4) => $expand!(37, 4),
+ (37, 8) => $expand!(37, 8),
+ (38, 4) => $expand!(38, 4),
+ (38, 8) => $expand!(38, 8),
+ (39, 4) => $expand!(39, 4),
+ (39, 8) => $expand!(39, 8),
+ (40, 4) => $expand!(40, 4),
+ (40, 8) => $expand!(40, 8),
+ (41, 4) => $expand!(41, 4),
+ (41, 8) => $expand!(41, 8),
+ (42, 4) => $expand!(42, 4),
+ (42, 8) => $expand!(42, 8),
+ (43, 4) => $expand!(43, 4),
+ (43, 8) => $expand!(43, 8),
+ (44, 4) => $expand!(44, 4),
+ (44, 8) => $expand!(44, 8),
+ (45, 4) => $expand!(45, 4),
+ (45, 8) => $expand!(45, 8),
+ (46, 4) => $expand!(46, 4),
+ (46, 8) => $expand!(46, 8),
+ (47, 4) => $expand!(47, 4),
+ (47, 8) => $expand!(47, 8),
+ (48, 4) => $expand!(48, 4),
+ (48, 8) => $expand!(48, 8),
+ (49, 4) => $expand!(49, 4),
+ (49, 8) => $expand!(49, 8),
+ (50, 4) => $expand!(50, 4),
+ (50, 8) => $expand!(50, 8),
+ (51, 4) => $expand!(51, 4),
+ (51, 8) => $expand!(51, 8),
+ (52, 4) => $expand!(52, 4),
+ (52, 8) => $expand!(52, 8),
+ (53, 4) => $expand!(53, 4),
+ (53, 8) => $expand!(53, 8),
+ (54, 4) => $expand!(54, 4),
+ (54, 8) => $expand!(54, 8),
+ (55, 4) => $expand!(55, 4),
+ (55, 8) => $expand!(55, 8),
+ (56, 4) => $expand!(56, 4),
+ (56, 8) => $expand!(56, 8),
+ (57, 4) => $expand!(57, 4),
+ (57, 8) => $expand!(57, 8),
+ (58, 4) => $expand!(58, 4),
+ (58, 8) => $expand!(58, 8),
+ (59, 4) => $expand!(59, 4),
+ (59, 8) => $expand!(59, 8),
+ (60, 4) => $expand!(60, 4),
+ (60, 8) => $expand!(60, 8),
+ (61, 4) => $expand!(61, 4),
+ (61, 8) => $expand!(61, 8),
+ (62, 4) => $expand!(62, 4),
+ (62, 8) => $expand!(62, 8),
+ (63, 4) => $expand!(63, 4),
+ (63, 8) => $expand!(63, 8),
+ (64, 4) => $expand!(64, 4),
+ (64, 8) => $expand!(64, 8),
+ (65, 4) => $expand!(65, 4),
+ (65, 8) => $expand!(65, 8),
+ (66, 4) => $expand!(66, 4),
+ (66, 8) => $expand!(66, 8),
+ (67, 4) => $expand!(67, 4),
+ (67, 8) => $expand!(67, 8),
+ (68, 4) => $expand!(68, 4),
+ (68, 8) => $expand!(68, 8),
+ (69, 4) => $expand!(69, 4),
+ (69, 8) => $expand!(69, 8),
+ (70, 4) => $expand!(70, 4),
+ (70, 8) => $expand!(70, 8),
+ (71, 4) => $expand!(71, 4),
+ (71, 8) => $expand!(71, 8),
+ (72, 4) => $expand!(72, 4),
+ (72, 8) => $expand!(72, 8),
+ (73, 4) => $expand!(73, 4),
+ (73, 8) => $expand!(73, 8),
+ (74, 4) => $expand!(74, 4),
+ (74, 8) => $expand!(74, 8),
+ (75, 4) => $expand!(75, 4),
+ (75, 8) => $expand!(75, 8),
+ (76, 4) => $expand!(76, 4),
+ (76, 8) => $expand!(76, 8),
+ (77, 4) => $expand!(77, 4),
+ (77, 8) => $expand!(77, 8),
+ (78, 4) => $expand!(78, 4),
+ (78, 8) => $expand!(78, 8),
+ (79, 4) => $expand!(79, 4),
+ (79, 8) => $expand!(79, 8),
+ (80, 4) => $expand!(80, 4),
+ (80, 8) => $expand!(80, 8),
+ (81, 4) => $expand!(81, 4),
+ (81, 8) => $expand!(81, 8),
+ (82, 4) => $expand!(82, 4),
+ (82, 8) => $expand!(82, 8),
+ (83, 4) => $expand!(83, 4),
+ (83, 8) => $expand!(83, 8),
+ (84, 4) => $expand!(84, 4),
+ (84, 8) => $expand!(84, 8),
+ (85, 4) => $expand!(85, 4),
+ (85, 8) => $expand!(85, 8),
+ (86, 4) => $expand!(86, 4),
+ (86, 8) => $expand!(86, 8),
+ (87, 4) => $expand!(87, 4),
+ (87, 8) => $expand!(87, 8),
+ (88, 4) => $expand!(88, 4),
+ (88, 8) => $expand!(88, 8),
+ (89, 4) => $expand!(89, 4),
+ (89, 8) => $expand!(89, 8),
+ (90, 4) => $expand!(90, 4),
+ (90, 8) => $expand!(90, 8),
+ (91, 4) => $expand!(91, 4),
+ (91, 8) => $expand!(91, 8),
+ (92, 4) => $expand!(92, 4),
+ (92, 8) => $expand!(92, 8),
+ (93, 4) => $expand!(93, 4),
+ (93, 8) => $expand!(93, 8),
+ (94, 4) => $expand!(94, 4),
+ (94, 8) => $expand!(94, 8),
+ (95, 4) => $expand!(95, 4),
+ (95, 8) => $expand!(95, 8),
+ (96, 4) => $expand!(96, 4),
+ (96, 8) => $expand!(96, 8),
+ (97, 4) => $expand!(97, 4),
+ (97, 8) => $expand!(97, 8),
+ (98, 4) => $expand!(98, 4),
+ (98, 8) => $expand!(98, 8),
+ (99, 4) => $expand!(99, 4),
+ (99, 8) => $expand!(99, 8),
+ (100, 4) => $expand!(100, 4),
+ (100, 8) => $expand!(100, 8),
+ (101, 4) => $expand!(101, 4),
+ (101, 8) => $expand!(101, 8),
+ (102, 4) => $expand!(102, 4),
+ (102, 8) => $expand!(102, 8),
+ (103, 4) => $expand!(103, 4),
+ (103, 8) => $expand!(103, 8),
+ (104, 4) => $expand!(104, 4),
+ (104, 8) => $expand!(104, 8),
+ (105, 4) => $expand!(105, 4),
+ (105, 8) => $expand!(105, 8),
+ (106, 4) => $expand!(106, 4),
+ (106, 8) => $expand!(106, 8),
+ (107, 4) => $expand!(107, 4),
+ (107, 8) => $expand!(107, 8),
+ (108, 4) => $expand!(108, 4),
+ (108, 8) => $expand!(108, 8),
+ (109, 4) => $expand!(109, 4),
+ (109, 8) => $expand!(109, 8),
+ (110, 4) => $expand!(110, 4),
+ (110, 8) => $expand!(110, 8),
+ (111, 4) => $expand!(111, 4),
+ (111, 8) => $expand!(111, 8),
+ (112, 4) => $expand!(112, 4),
+ (112, 8) => $expand!(112, 8),
+ (113, 4) => $expand!(113, 4),
+ (113, 8) => $expand!(113, 8),
+ (114, 4) => $expand!(114, 4),
+ (114, 8) => $expand!(114, 8),
+ (115, 4) => $expand!(115, 4),
+ (115, 8) => $expand!(115, 8),
+ (116, 4) => $expand!(116, 4),
+ (116, 8) => $expand!(116, 8),
+ (117, 4) => $expand!(117, 4),
+ (117, 8) => $expand!(117, 8),
+ (118, 4) => $expand!(118, 4),
+ (118, 8) => $expand!(118, 8),
+ (119, 4) => $expand!(119, 4),
+ (119, 8) => $expand!(119, 8),
+ (120, 4) => $expand!(120, 4),
+ (120, 8) => $expand!(120, 8),
+ (121, 4) => $expand!(121, 4),
+ (121, 8) => $expand!(121, 8),
+ (122, 4) => $expand!(122, 4),
+ (122, 8) => $expand!(122, 8),
+ (123, 4) => $expand!(123, 4),
+ (123, 8) => $expand!(123, 8),
+ (124, 4) => $expand!(124, 4),
+ (124, 8) => $expand!(124, 8),
+ (125, 4) => $expand!(125, 4),
+ (125, 8) => $expand!(125, 8),
+ (126, 4) => $expand!(126, 4),
+ (126, 8) => $expand!(126, 8),
+ (127, 4) => $expand!(127, 4),
+ (127, 8) => $expand!(127, 8),
+ (128, 4) => $expand!(128, 4),
+ (128, 8) => $expand!(128, 8),
+ (129, 4) => $expand!(129, 4),
+ (129, 8) => $expand!(129, 8),
+ (130, 4) => $expand!(130, 4),
+ (130, 8) => $expand!(130, 8),
+ (131, 4) => $expand!(131, 4),
+ (131, 8) => $expand!(131, 8),
+ (132, 4) => $expand!(132, 4),
+ (132, 8) => $expand!(132, 8),
+ (133, 4) => $expand!(133, 4),
+ (133, 8) => $expand!(133, 8),
+ (134, 4) => $expand!(134, 4),
+ (134, 8) => $expand!(134, 8),
+ (135, 4) => $expand!(135, 4),
+ (135, 8) => $expand!(135, 8),
+ (136, 4) => $expand!(136, 4),
+ (136, 8) => $expand!(136, 8),
+ (137, 4) => $expand!(137, 4),
+ (137, 8) => $expand!(137, 8),
+ (138, 4) => $expand!(138, 4),
+ (138, 8) => $expand!(138, 8),
+ (139, 4) => $expand!(139, 4),
+ (139, 8) => $expand!(139, 8),
+ (140, 4) => $expand!(140, 4),
+ (140, 8) => $expand!(140, 8),
+ (141, 4) => $expand!(141, 4),
+ (141, 8) => $expand!(141, 8),
+ (142, 4) => $expand!(142, 4),
+ (142, 8) => $expand!(142, 8),
+ (143, 4) => $expand!(143, 4),
+ (143, 8) => $expand!(143, 8),
+ (144, 4) => $expand!(144, 4),
+ (144, 8) => $expand!(144, 8),
+ (145, 4) => $expand!(145, 4),
+ (145, 8) => $expand!(145, 8),
+ (146, 4) => $expand!(146, 4),
+ (146, 8) => $expand!(146, 8),
+ (147, 4) => $expand!(147, 4),
+ (147, 8) => $expand!(147, 8),
+ (148, 4) => $expand!(148, 4),
+ (148, 8) => $expand!(148, 8),
+ (149, 4) => $expand!(149, 4),
+ (149, 8) => $expand!(149, 8),
+ (150, 4) => $expand!(150, 4),
+ (150, 8) => $expand!(150, 8),
+ (151, 4) => $expand!(151, 4),
+ (151, 8) => $expand!(151, 8),
+ (152, 4) => $expand!(152, 4),
+ (152, 8) => $expand!(152, 8),
+ (153, 4) => $expand!(153, 4),
+ (153, 8) => $expand!(153, 8),
+ (154, 4) => $expand!(154, 4),
+ (154, 8) => $expand!(154, 8),
+ (155, 4) => $expand!(155, 4),
+ (155, 8) => $expand!(155, 8),
+ (156, 4) => $expand!(156, 4),
+ (156, 8) => $expand!(156, 8),
+ (157, 4) => $expand!(157, 4),
+ (157, 8) => $expand!(157, 8),
+ (158, 4) => $expand!(158, 4),
+ (158, 8) => $expand!(158, 8),
+ (159, 4) => $expand!(159, 4),
+ (159, 8) => $expand!(159, 8),
+ (160, 4) => $expand!(160, 4),
+ (160, 8) => $expand!(160, 8),
+ (161, 4) => $expand!(161, 4),
+ (161, 8) => $expand!(161, 8),
+ (162, 4) => $expand!(162, 4),
+ (162, 8) => $expand!(162, 8),
+ (163, 4) => $expand!(163, 4),
+ (163, 8) => $expand!(163, 8),
+ (164, 4) => $expand!(164, 4),
+ (164, 8) => $expand!(164, 8),
+ (165, 4) => $expand!(165, 4),
+ (165, 8) => $expand!(165, 8),
+ (166, 4) => $expand!(166, 4),
+ (166, 8) => $expand!(166, 8),
+ (167, 4) => $expand!(167, 4),
+ (167, 8) => $expand!(167, 8),
+ (168, 4) => $expand!(168, 4),
+ (168, 8) => $expand!(168, 8),
+ (169, 4) => $expand!(169, 4),
+ (169, 8) => $expand!(169, 8),
+ (170, 4) => $expand!(170, 4),
+ (170, 8) => $expand!(170, 8),
+ (171, 4) => $expand!(171, 4),
+ (171, 8) => $expand!(171, 8),
+ (172, 4) => $expand!(172, 4),
+ (172, 8) => $expand!(172, 8),
+ (173, 4) => $expand!(173, 4),
+ (173, 8) => $expand!(173, 8),
+ (174, 4) => $expand!(174, 4),
+ (174, 8) => $expand!(174, 8),
+ (175, 4) => $expand!(175, 4),
+ (175, 8) => $expand!(175, 8),
+ (176, 4) => $expand!(176, 4),
+ (176, 8) => $expand!(176, 8),
+ (177, 4) => $expand!(177, 4),
+ (177, 8) => $expand!(177, 8),
+ (178, 4) => $expand!(178, 4),
+ (178, 8) => $expand!(178, 8),
+ (179, 4) => $expand!(179, 4),
+ (179, 8) => $expand!(179, 8),
+ (180, 4) => $expand!(180, 4),
+ (180, 8) => $expand!(180, 8),
+ (181, 4) => $expand!(181, 4),
+ (181, 8) => $expand!(181, 8),
+ (182, 4) => $expand!(182, 4),
+ (182, 8) => $expand!(182, 8),
+ (183, 4) => $expand!(183, 4),
+ (183, 8) => $expand!(183, 8),
+ (184, 4) => $expand!(184, 4),
+ (184, 8) => $expand!(184, 8),
+ (185, 4) => $expand!(185, 4),
+ (185, 8) => $expand!(185, 8),
+ (186, 4) => $expand!(186, 4),
+ (186, 8) => $expand!(186, 8),
+ (187, 4) => $expand!(187, 4),
+ (187, 8) => $expand!(187, 8),
+ (188, 4) => $expand!(188, 4),
+ (188, 8) => $expand!(188, 8),
+ (189, 4) => $expand!(189, 4),
+ (189, 8) => $expand!(189, 8),
+ (190, 4) => $expand!(190, 4),
+ (190, 8) => $expand!(190, 8),
+ (191, 4) => $expand!(191, 4),
+ (191, 8) => $expand!(191, 8),
+ (192, 4) => $expand!(192, 4),
+ (192, 8) => $expand!(192, 8),
+ (193, 4) => $expand!(193, 4),
+ (193, 8) => $expand!(193, 8),
+ (194, 4) => $expand!(194, 4),
+ (194, 8) => $expand!(194, 8),
+ (195, 4) => $expand!(195, 4),
+ (195, 8) => $expand!(195, 8),
+ (196, 4) => $expand!(196, 4),
+ (196, 8) => $expand!(196, 8),
+ (197, 4) => $expand!(197, 4),
+ (197, 8) => $expand!(197, 8),
+ (198, 4) => $expand!(198, 4),
+ (198, 8) => $expand!(198, 8),
+ (199, 4) => $expand!(199, 4),
+ (199, 8) => $expand!(199, 8),
+ (200, 4) => $expand!(200, 4),
+ (200, 8) => $expand!(200, 8),
+ (201, 4) => $expand!(201, 4),
+ (201, 8) => $expand!(201, 8),
+ (202, 4) => $expand!(202, 4),
+ (202, 8) => $expand!(202, 8),
+ (203, 4) => $expand!(203, 4),
+ (203, 8) => $expand!(203, 8),
+ (204, 4) => $expand!(204, 4),
+ (204, 8) => $expand!(204, 8),
+ (205, 4) => $expand!(205, 4),
+ (205, 8) => $expand!(205, 8),
+ (206, 4) => $expand!(206, 4),
+ (206, 8) => $expand!(206, 8),
+ (207, 4) => $expand!(207, 4),
+ (207, 8) => $expand!(207, 8),
+ (208, 4) => $expand!(208, 4),
+ (208, 8) => $expand!(208, 8),
+ (209, 4) => $expand!(209, 4),
+ (209, 8) => $expand!(209, 8),
+ (210, 4) => $expand!(210, 4),
+ (210, 8) => $expand!(210, 8),
+ (211, 4) => $expand!(211, 4),
+ (211, 8) => $expand!(211, 8),
+ (212, 4) => $expand!(212, 4),
+ (212, 8) => $expand!(212, 8),
+ (213, 4) => $expand!(213, 4),
+ (213, 8) => $expand!(213, 8),
+ (214, 4) => $expand!(214, 4),
+ (214, 8) => $expand!(214, 8),
+ (215, 4) => $expand!(215, 4),
+ (215, 8) => $expand!(215, 8),
+ (216, 4) => $expand!(216, 4),
+ (216, 8) => $expand!(216, 8),
+ (217, 4) => $expand!(217, 4),
+ (217, 8) => $expand!(217, 8),
+ (218, 4) => $expand!(218, 4),
+ (218, 8) => $expand!(218, 8),
+ (219, 4) => $expand!(219, 4),
+ (219, 8) => $expand!(219, 8),
+ (220, 4) => $expand!(220, 4),
+ (220, 8) => $expand!(220, 8),
+ (221, 4) => $expand!(221, 4),
+ (221, 8) => $expand!(221, 8),
+ (222, 4) => $expand!(222, 4),
+ (222, 8) => $expand!(222, 8),
+ (223, 4) => $expand!(223, 4),
+ (223, 8) => $expand!(223, 8),
+ (224, 4) => $expand!(224, 4),
+ (224, 8) => $expand!(224, 8),
+ (225, 4) => $expand!(225, 4),
+ (225, 8) => $expand!(225, 8),
+ (226, 4) => $expand!(226, 4),
+ (226, 8) => $expand!(226, 8),
+ (227, 4) => $expand!(227, 4),
+ (227, 8) => $expand!(227, 8),
+ (228, 4) => $expand!(228, 4),
+ (228, 8) => $expand!(228, 8),
+ (229, 4) => $expand!(229, 4),
+ (229, 8) => $expand!(229, 8),
+ (230, 4) => $expand!(230, 4),
+ (230, 8) => $expand!(230, 8),
+ (231, 4) => $expand!(231, 4),
+ (231, 8) => $expand!(231, 8),
+ (232, 4) => $expand!(232, 4),
+ (232, 8) => $expand!(232, 8),
+ (233, 4) => $expand!(233, 4),
+ (233, 8) => $expand!(233, 8),
+ (234, 4) => $expand!(234, 4),
+ (234, 8) => $expand!(234, 8),
+ (235, 4) => $expand!(235, 4),
+ (235, 8) => $expand!(235, 8),
+ (236, 4) => $expand!(236, 4),
+ (236, 8) => $expand!(236, 8),
+ (237, 4) => $expand!(237, 4),
+ (237, 8) => $expand!(237, 8),
+ (238, 4) => $expand!(238, 4),
+ (238, 8) => $expand!(238, 8),
+ (239, 4) => $expand!(239, 4),
+ (239, 8) => $expand!(239, 8),
+ (240, 4) => $expand!(240, 4),
+ (240, 8) => $expand!(240, 8),
+ (241, 4) => $expand!(241, 4),
+ (241, 8) => $expand!(241, 8),
+ (242, 4) => $expand!(242, 4),
+ (242, 8) => $expand!(242, 8),
+ (243, 4) => $expand!(243, 4),
+ (243, 8) => $expand!(243, 8),
+ (244, 4) => $expand!(244, 4),
+ (244, 8) => $expand!(244, 8),
+ (245, 4) => $expand!(245, 4),
+ (245, 8) => $expand!(245, 8),
+ (246, 4) => $expand!(246, 4),
+ (246, 8) => $expand!(246, 8),
+ (247, 4) => $expand!(247, 4),
+ (247, 8) => $expand!(247, 8),
+ (248, 4) => $expand!(248, 4),
+ (248, 8) => $expand!(248, 8),
+ (249, 4) => $expand!(249, 4),
+ (249, 8) => $expand!(249, 8),
+ (250, 4) => $expand!(250, 4),
+ (250, 8) => $expand!(250, 8),
+ (251, 4) => $expand!(251, 4),
+ (251, 8) => $expand!(251, 8),
+ (252, 4) => $expand!(252, 4),
+ (252, 8) => $expand!(252, 8),
+ (253, 4) => $expand!(253, 4),
+ (253, 8) => $expand!(253, 8),
+ (254, 4) => $expand!(254, 4),
+ (254, 8) => $expand!(254, 8),
+ (255, 4) => $expand!(255, 4),
+ (255, 8) => $expand!(255, 8),
+ (_, _) => panic!("Invalid sae value"),
+ }
+ };
+}
+
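+// Editorial sketch, not part of the original patch: callers pair this macro with a
+// local `call!` macro, schematically:
+//
+//     macro_rules! call {
+//         ($imm8:expr, $imm4:expr) => {
+//             vrndscaleps(a.as_f32x16(), $imm8, src, mask, $imm4)
+//         };
+//     }
+//     let r = constify_imm8_roundscale!(imm8, sae, call); // dispatches to call!(imm8, sae)
+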
#[cfg(test)]
macro_rules! assert_approx_eq {
($a:expr, $b:expr, $eps:expr) => {{
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 1eca091c6d..6efe80b62a 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -2686,6 +2686,35 @@ mod tests {
assert_eq_m512d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_roundscale_round_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_roundscale_round_pd(a, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_roundscale_round_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_mask_roundscale_round_pd(a, 0, a, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_pd(1.1);
+ assert_eq_m512d(r, e);
+ let r = _mm512_mask_roundscale_round_pd(a, 0b11111111, a, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_roundscale_round_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_maskz_roundscale_round_pd(0, a, 0, _MM_FROUND_CUR_DIRECTION);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_roundscale_round_pd(0b11111111, a, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_round_pd() {
let a = _mm512_set1_pd(10.);
From 751acda5253fc0dcf3e20c1c6cf1927bb5c0c0bd Mon Sep 17 00:00:00 2001
From: jironglin
Date: Sun, 11 Oct 2020 13:37:24 +0000
Subject: [PATCH 02/25] roundscale: ps,pd
---
crates/core_arch/avx512f.md | 12 +-
crates/core_arch/src/x86/avx512f.rs | 208 +++++++++++++++++++++++++
crates/core_arch/src/x86_64/avx512f.rs | 29 ++++
3 files changed, 243 insertions(+), 6 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 58be7ab632..50f8179955 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -607,8 +607,8 @@
* [x] [`_mm512_mask_ror_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi64&expand=5236)
* [x] [`_mm512_mask_rorv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi32&expand=5236)
* [x] [`_mm512_mask_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi64&expand=5236)
- * [ ] [`_mm512_mask_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=5236)
- * [ ] [`_mm512_mask_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=5236)
+ * [x] [`_mm512_mask_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=5236)
+ * [x] [`_mm512_mask_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=5236)
* [x] [`_mm512_mask_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=5236)
* [x] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236)
* [x] [`_mm512_mask_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=5236)
@@ -879,8 +879,8 @@
* [x] [`_mm512_maskz_ror_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi64&expand=5236)
* [x] [`_mm512_maskz_rorv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi32&expand=5236)
* [x] [`_mm512_maskz_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi64&expand=5236)
- * [ ] [`_mm512_maskz_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=5236)
- * [ ] [`_mm512_maskz_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=5236)
+ * [x] [`_mm512_maskz_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=5236)
+ * [x] [`_mm512_maskz_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=5236)
* [x] [`_mm512_maskz_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=5236)
* [x] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236)
* [x] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236)
@@ -1017,8 +1017,8 @@
* [x] [`_mm512_ror_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi64&expand=5236)
* [x] [`_mm512_rorv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi32&expand=5236)
* [x] [`_mm512_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi64&expand=5236)
- * [ ] [`_mm512_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=5236)
- * [ ] [`_mm512_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=5236)
+ * [x] [`_mm512_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=5236)
+ * [x] [`_mm512_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=5236)
* [x] [`_mm512_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=5236)
* [x] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236)
* [x] [`_mm512_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 90b707d93c..1077f18c44 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -1969,6 +1969,185 @@ pub unsafe fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d {
))
}
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_ps&expand=4784)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm512_roundscale_ps(a: __m512, imm8: i32) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscaleps(
+ a.as_f32x16(),
+ $imm8,
+ _mm512_setzero_ps().as_f32x16(),
+ 0b11111111_11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
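+// Editorial sketch, not part of the original patch: imm8[7:4] selects how many fraction
+// bits are preserved, so the rounding granularity is 2^-imm8[7:4] (assuming the usual
+// VRNDSCALE encoding), e.g.:
+//
+//     let a = _mm512_set1_ps(2.7);
+//     let whole = _mm512_roundscale_ps(a, 0);          // granularity 1.0  -> 3.0
+//     let halves = _mm512_roundscale_ps(a, 1 << 4);    // granularity 0.5  -> 2.5
+//     let quarters = _mm512_roundscale_ps(a, 2 << 4);  // granularity 0.25 -> 2.75
+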
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_ps&expand=4782)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_mask_roundscale_ps(src: __m512, k: __mmask16, a: __m512, imm8: i32) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscaleps(
+ a.as_f32x16(),
+ $imm8,
+ src.as_f32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_ps&expand=4783)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_maskz_roundscale_ps(k: __mmask16, a: __m512, imm8: i32) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscaleps(
+ a.as_f32x16(),
+ $imm8,
+ _mm512_setzero_ps().as_f32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_pd&expand=4775)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))]
+#[rustc_args_required_const(1)]
+pub unsafe fn _mm512_roundscale_pd(a: __m512d, imm8: i32) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscalepd(
+ a.as_f64x8(),
+ $imm8,
+ _mm512_setzero_pd().as_f64x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_pd&expand=4773)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_mask_roundscale_pd(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ imm8: i32,
+) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscalepd(
+ a.as_f64x8(),
+ $imm8,
+ src.as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_pd&expand=4774)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscalepd(
+ a.as_f64x8(),
+ $imm8,
+ _mm512_setzero_pd().as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
/// _MM_MANT_NORM_1_2 // interval [1, 2)
@@ -19470,6 +19649,35 @@ mod tests {
assert_eq_m512(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_roundscale_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_roundscale_ps(a, 0);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_roundscale_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_mask_roundscale_ps(a, 0, a, 0);
+ let e = _mm512_set1_ps(1.1);
+ assert_eq_m512(r, e);
+ let r = _mm512_mask_roundscale_ps(a, 0b11111111_11111111, a, 0);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_roundscale_ps() {
+ let a = _mm512_set1_ps(1.1);
+ let r = _mm512_maskz_roundscale_ps(0, a, 0);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_roundscale_ps(0b11111111_11111111, a, 0);
+ let e = _mm512_set1_ps(1.0);
+ assert_eq_m512(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_ps() {
let a = _mm512_set1_ps(10.);
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 6efe80b62a..b653a55bcb 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -995,6 +995,35 @@ mod tests {
assert_eq_m512d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_roundscale_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_roundscale_pd(a, 0);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_roundscale_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_mask_roundscale_pd(a, 0, a, 0);
+ let e = _mm512_set1_pd(1.1);
+ assert_eq_m512d(r, e);
+ let r = _mm512_mask_roundscale_pd(a, 0b11111111, a, 0);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_roundscale_pd() {
+ let a = _mm512_set1_pd(1.1);
+ let r = _mm512_maskz_roundscale_pd(0, a, 0);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_roundscale_pd(0b11111111, a, 0);
+ let e = _mm512_set1_pd(1.0);
+ assert_eq_m512d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_pd() {
let a = _mm512_set1_pd(10.);
From b9800ad39407eeb105fa5cb5f9475ae3af5ed0c7 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Sun, 11 Oct 2020 14:07:20 +0000
Subject: [PATCH 03/25] scalef_round: ps,pd; scalef: ps,pd
---
crates/core_arch/avx512f.md | 24 +-
crates/core_arch/src/x86/avx512f.rs | 373 +++++++++++++++++++++++++
crates/core_arch/src/x86_64/avx512f.rs | 75 +++++
3 files changed, 460 insertions(+), 12 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 50f8179955..ab9e4d9545 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -613,10 +613,10 @@
* [x] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236)
* [x] [`_mm512_mask_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=5236)
* [x] [`_mm512_mask_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_ps&expand=5236)
- * [ ] [`_mm512_mask_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=5236)
- * [ ] [`_mm512_mask_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=5236)
- * [ ] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236)
- * [ ] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236)
+ * [x] [`_mm512_mask_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=5236)
+ * [x] [`_mm512_mask_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=5236)
+ * [x] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236)
+ * [x] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236)
* [ ] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236)
* [ ] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236)
* [x] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236)
@@ -885,10 +885,10 @@
* [x] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236)
* [x] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236)
* [x] [`_mm512_maskz_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_ps&expand=5236)
- * [ ] [`_mm512_maskz_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=5236)
- * [ ] [`_mm512_maskz_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=5236)
- * [ ] [`_mm512_maskz_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=5236)
- * [ ] [`_mm512_maskz_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=5236)
+ * [x] [`_mm512_maskz_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=5236)
+ * [x] [`_mm512_maskz_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=5236)
+ * [x] [`_mm512_maskz_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=5236)
+ * [x] [`_mm512_maskz_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=5236)
* [ ] [`_mm512_maskz_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=5236)
* [ ] [`_mm512_maskz_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=5236)
* [x] [`_mm512_maskz_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5236)
@@ -1023,10 +1023,10 @@
* [x] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236)
* [x] [`_mm512_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=5236)
* [x] [`_mm512_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_ps&expand=5236)
- * [ ] [`_mm512_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=5236)
- * [ ] [`_mm512_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ps&expand=5236)
- * [ ] [`_mm512_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_pd&expand=5236)
- * [ ] [`_mm512_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ps&expand=5236)
+ * [x] [`_mm512_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=5236)
+ * [x] [`_mm512_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ps&expand=5236)
+ * [x] [`_mm512_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_pd&expand=5236)
+ * [x] [`_mm512_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ps&expand=5236)
* [x] [`_mm512_set1_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi16&expand=5236)
* [x] [`_mm512_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi32&expand=5236)
* [x] [`_mm512_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi64&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 1077f18c44..0a91c92706 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -2148,6 +2148,102 @@ pub unsafe fn _mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d, imm8: i32) ->
transmute(r)
}
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_ps&expand=4883)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 {
+ transmute(vscalefps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ 0b11111111_11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
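+// Editorial sketch, not part of the original patch: per lane, scalef computes
+// a * 2^floor(b) (the VSCALEF operation), e.g.:
+//
+//     let a = _mm512_set1_ps(3.0);
+//     let b = _mm512_set1_ps(2.0);
+//     let r = _mm512_scalef_ps(a, b); // every lane is expected to be 3.0 * 2^2 = 12.0
+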
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_ps&expand=4881)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ transmute(vscalefps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ src.as_f32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_ps&expand=4882)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps))]
+pub unsafe fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
+ transmute(vscalefps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_pd&expand=4874)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d {
+ transmute(vscalefpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_pd&expand=4872)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ transmute(vscalefpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ src.as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_pd&expand=4873)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd))]
+pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
+ transmute(vscalefpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
/// _MM_MANT_NORM_1_2 // interval [1, 2)
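Illustrative usage sketch for the scalef intrinsics added above (not taken from the patch; assumes the intrinsics are reachable via core::arch::x86_64 and that AVX512F is available). Per lane, scalef computes a * 2^floor(b), and the mask variant copies lanes from the src operand where the mask bit is clear:

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    unsafe fn scalef_sketch() -> (__m512, __m512) {
        let a = _mm512_set1_ps(1.);
        let b = _mm512_set1_ps(3.);
        let all = _mm512_scalef_ps(a, b); // every lane: 1.0 * 2^3 = 8.0
        // Lanes 0..8 are scaled, lanes 8..16 are copied from `a` (the src operand).
        let low = _mm512_mask_scalef_ps(a, 0b00000000_11111111, a, b);
        (all, low)
    }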
@@ -5214,6 +5310,196 @@ pub unsafe fn _mm512_maskz_roundscale_round_pd(
transmute(r)
}
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_round_ps&expand=4889)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_scalef_round_ps(a: __m512, b: __m512, rounding: i32) -> __m512 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vscalefps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ 0b11111111_11111111,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm4_round!(rounding, call);
+ transmute(r)
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_round_ps&expand=4887)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_scalef_round_ps(
+ src: __m512,
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ rounding: i32,
+) -> __m512 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vscalefps(a.as_f32x16(), b.as_f32x16(), src.as_f32x16(), k, $imm4)
+ };
+ }
+ let r = constify_imm4_round!(rounding, call);
+ transmute(r)
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_round_ps&expand=4888)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefps, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_scalef_round_ps(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ rounding: i32,
+) -> __m512 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vscalefps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ _mm512_setzero_ps().as_f32x16(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm4_round!(rounding, call);
+ transmute(r)
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_round_pd&expand=4886)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm512_scalef_round_pd(a: __m512d, b: __m512d, rounding: i32) -> __m512d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vscalefpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ 0b11111111,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm4_round!(rounding, call);
+ transmute(r)
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_round_pd&expand=4884)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_scalef_round_pd(
+ src: __m512d,
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ rounding: i32,
+) -> __m512d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vscalefpd(a.as_f64x8(), b.as_f64x8(), src.as_f64x8(), k, $imm4)
+ };
+ }
+ let r = constify_imm4_round!(rounding, call);
+ transmute(r)
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_round_pd&expand=4885)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_maskz_scalef_round_pd(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ rounding: i32,
+) -> __m512d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vscalefpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ _mm512_setzero_pd().as_f64x8(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm4_round!(rounding, call);
+ transmute(r)
+}
+
/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
/// _MM_MANT_NORM_1_2 // interval [1, 2)
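A short sketch (illustrative, assuming the same module paths as above) of how the _round variants take their rounding control: because of #[rustc_args_required_const], the last argument must be a compile-time constant built from the _MM_FROUND_* flags:

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    unsafe fn scalef_round_sketch() -> __m512d {
        let a = _mm512_set1_pd(1.);
        let b = _mm512_set1_pd(3.);
        // Round to nearest and suppress exceptions; _MM_FROUND_CUR_DIRECTION
        // would instead use the rounding mode currently set in MXCSR.
        _mm512_scalef_round_pd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // all lanes 8.0
    }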
@@ -17579,6 +17865,10 @@ extern "C" {
fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16;
#[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"]
fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.scalef.ps.512"]
+ fn vscalefps(a: f32x16, b: f32x16, src: f32x16, mask: u16, rounding: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"]
+ fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8;
#[link_name = "llvm.x86.avx512.mask.getmant.ps.512"]
fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16;
@@ -19678,6 +19968,41 @@ mod tests {
assert_eq_m512(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_scalef_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r = _mm512_scalef_ps(a, b);
+ let e = _mm512_set1_ps(8.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_scalef_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r = _mm512_mask_scalef_ps(a, 0, a, b);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_scalef_ps(a, 0b11111111_00000000, a, b);
+ let e = _mm512_set_ps(
+ 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_scalef_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r = _mm512_maskz_scalef_ps(0, a, b);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_scalef_ps(0b11111111_00000000, a, b);
+ let e = _mm512_set_ps(
+ 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_ps() {
let a = _mm512_set1_ps(10.);
@@ -21064,6 +21389,54 @@ mod tests {
assert_eq_m512(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_scalef_round_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r = _mm512_scalef_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm512_set1_ps(8.);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_scalef_round_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r =
+ _mm512_mask_scalef_round_ps(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ assert_eq_m512(r, a);
+ let r = _mm512_mask_scalef_round_ps(
+ a,
+ 0b11111111_00000000,
+ a,
+ b,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm512_set_ps(
+ 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_scalef_round_ps() {
+ let a = _mm512_set1_ps(1.);
+ let b = _mm512_set1_ps(3.);
+ let r =
+ _mm512_maskz_scalef_round_ps(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_scalef_round_ps(
+ 0b11111111_00000000,
+ a,
+ b,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm512_set_ps(
+ 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_round_ps() {
let a = _mm512_set1_ps(10.);
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index b653a55bcb..f6f80a323c 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -1024,6 +1024,37 @@ mod tests {
assert_eq_m512d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_scalef_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_scalef_pd(a, b);
+ let e = _mm512_set1_pd(8.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_scalef_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_mask_scalef_pd(a, 0, a, b);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_scalef_pd(a, 0b11110000, a, b);
+ let e = _mm512_set_pd(8., 8., 8., 8., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_scalef_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_maskz_scalef_pd(0, a, b);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_scalef_pd(0b11110000, a, b);
+ let e = _mm512_set_pd(8., 8., 8., 8., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_pd() {
let a = _mm512_set1_pd(10.);
@@ -2744,6 +2775,50 @@ mod tests {
assert_eq_m512d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_scalef_round_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r = _mm512_scalef_round_pd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm512_set1_pd(8.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_scalef_round_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r =
+ _mm512_mask_scalef_round_pd(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ assert_eq_m512d(r, a);
+ let r = _mm512_mask_scalef_round_pd(
+ a,
+ 0b11110000,
+ a,
+ b,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm512_set_pd(8., 8., 8., 8., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_scalef_round_pd() {
+ let a = _mm512_set1_pd(1.);
+ let b = _mm512_set1_pd(3.);
+ let r =
+ _mm512_maskz_scalef_round_pd(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_scalef_round_pd(
+ 0b11110000,
+ a,
+ b,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm512_set_pd(8., 8., 8., 8., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_round_pd() {
let a = _mm512_set1_pd(10.);
From a6274653d84895583c23f9e84ef853144ef59b67 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Sun, 11 Oct 2020 15:49:39 +0000
Subject: [PATCH 04/25] reduce_mul: epi64, reduce_max: epi64,epu64, reduce_min:
epi64,epu64, reduce_and: epi64, reduce_or: epi64
---
crates/core_arch/avx512f.md | 32 ++--
crates/core_arch/src/x86/avx512f.rs | 178 ++++++++++++++++++++++-
crates/core_arch/src/x86_64/avx512f.rs | 105 +++++++++++++
crates/stdarch-verify/tests/x86-intel.rs | 17 ++-
4 files changed, 310 insertions(+), 22 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index ab9e4d9545..4184997d23 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -576,29 +576,29 @@
* [x] [`_mm512_mask_rcp14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_pd&expand=5236)
* [x] [`_mm512_mask_rcp14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_ps&expand=5236)
* [x] [`_mm512_mask_reduce_add_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi32&expand=5236)
- * [ ] [`_mm512_mask_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi64&expand=5236)
+ * [x] [`_mm512_mask_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi64&expand=5236)
* [x] [`_mm512_mask_reduce_add_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_pd&expand=5236)
* [x] [`_mm512_mask_reduce_add_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_ps&expand=5236)
* [x] [`_mm512_mask_reduce_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi32&expand=5236)
- * [ ] [`_mm512_mask_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi64&expand=5236)
+ * [x] [`_mm512_mask_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi64&expand=5236)
* [x] [`_mm512_mask_reduce_max_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi32&expand=5236)
- * [ ] [`_mm512_mask_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi64&expand=5236)
+ * [x] [`_mm512_mask_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi64&expand=5236)
* [x] [`_mm512_mask_reduce_max_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu32&expand=5236)
- * [ ] [`_mm512_mask_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu64&expand=5236)
+ * [x] [`_mm512_mask_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu64&expand=5236)
* [x] [`_mm512_mask_reduce_max_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_pd&expand=5236)
* [x] [`_mm512_mask_reduce_max_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_ps&expand=5236)
* [x] [`_mm512_mask_reduce_min_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi32&expand=5236)
- * [ ] [`_mm512_mask_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi64&expand=5236)
+ * [x] [`_mm512_mask_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi64&expand=5236)
* [x] [`_mm512_mask_reduce_min_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu32&expand=5236)
- * [ ] [`_mm512_mask_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu64&expand=5236)
+ * [x] [`_mm512_mask_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu64&expand=5236)
* [x] [`_mm512_mask_reduce_min_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_pd&expand=5236)
* [x] [`_mm512_mask_reduce_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_ps&expand=5236)
* [x] [`_mm512_mask_reduce_mul_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi32&expand=5236)
- * [ ] [`_mm512_mask_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi64&expand=5236)
+ * [x] [`_mm512_mask_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi64&expand=5236)
* [x] [`_mm512_mask_reduce_mul_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_pd&expand=5236)
* [x] [`_mm512_mask_reduce_mul_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_ps&expand=5236)
* [x] [`_mm512_mask_reduce_or_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi32&expand=5236)
- * [ ] [`_mm512_mask_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi64&expand=5236)
+ * [x] [`_mm512_mask_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi64&expand=5236)
* [x] [`_mm512_mask_rol_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi32&expand=5236)
* [x] [`_mm512_mask_rol_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi64&expand=5236)
* [x] [`_mm512_mask_rolv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rolv_epi32&expand=5236)
@@ -986,29 +986,29 @@
* [x] [`_mm512_rcp14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_pd&expand=5236)
* [x] [`_mm512_rcp14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_ps&expand=5236)
* [x] [`_mm512_reduce_add_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi32&expand=5236)
- * [ ] [`_mm512_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi64&expand=5236)
+ * [x] [`_mm512_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi64&expand=5236)
* [x] [`_mm512_reduce_add_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_pd&expand=5236)
* [x] [`_mm512_reduce_add_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ps&expand=5236)
* [x] [`_mm512_reduce_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi32&expand=5236)
- * [ ] [`_mm512_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi64&expand=5236)
+ * [x] [`_mm512_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi64&expand=5236)
* [x] [`_mm512_reduce_max_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi32&expand=5236)
- * [ ] [`_mm512_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi64&expand=5236)
+ * [x] [`_mm512_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi64&expand=5236)
* [x] [`_mm512_reduce_max_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu32&expand=5236)
- * [ ] [`_mm512_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu64&expand=5236)
+ * [x] [`_mm512_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu64&expand=5236)
* [x] [`_mm512_reduce_max_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_pd&expand=5236)
* [x] [`_mm512_reduce_max_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ps&expand=5236)
* [x] [`_mm512_reduce_min_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi32&expand=5236)
- * [ ] [`_mm512_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi64&expand=5236)
+ * [x] [`_mm512_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi64&expand=5236)
* [x] [`_mm512_reduce_min_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu32&expand=5236)
- * [ ] [`_mm512_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu64&expand=5236)
+ * [x] [`_mm512_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu64&expand=5236)
* [x] [`_mm512_reduce_min_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_pd&expand=5236)
* [x] [`_mm512_reduce_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ps&expand=5236)
* [x] [`_mm512_reduce_mul_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi32&expand=5236)
- * [ ] [`_mm512_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi64&expand=5236)
+ * [x] [`_mm512_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi64&expand=5236)
* [x] [`_mm512_reduce_mul_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_pd&expand=5236)
* [x] [`_mm512_reduce_mul_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ps&expand=5236)
* [x] [`_mm512_reduce_or_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi32&expand=5236)
- * [ ] [`_mm512_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi64&expand=5236)
+ * [x] [`_mm512_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi64&expand=5236)
* [x] [`_mm512_rol_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi32&expand=5236)
* [x] [`_mm512_rol_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi64&expand=5236)
* [x] [`_mm512_rolv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rolv_epi32&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 0a91c92706..95c3794b09 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -16843,6 +16843,19 @@ pub unsafe fn _mm512_reduce_add_epi32(a: __m512i) -> i32 {
simd_reduce_add_unordered(a.as_i32x16())
}
+/// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi32&expand=4555)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 {
+ simd_reduce_add_unordered(simd_select_bitmask(
+ k,
+ a.as_i32x16(),
+ _mm512_setzero_si512().as_i32x16(),
+ ))
+}
+
/// Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_add_epi64&expand=4558)
@@ -16852,16 +16865,16 @@ pub unsafe fn _mm512_reduce_add_epi64(a: __m512i) -> i64 {
simd_reduce_add_unordered(a.as_i64x8())
}
-/// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
+/// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi32&expand=4555)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi64&expand=4557)
#[inline]
#[target_feature(enable = "avx512f")]
-pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 {
+pub unsafe fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 {
simd_reduce_add_unordered(simd_select_bitmask(
k,
- a.as_i32x16(),
- _mm512_setzero_si512().as_i32x16(),
+ a.as_i64x8(),
+ _mm512_setzero_si512().as_i64x8(),
))
}
@@ -16931,6 +16944,28 @@ pub unsafe fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 {
))
}
+/// Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_mul_epi64&expand=4602)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 {
+ simd_reduce_mul_unordered(a.as_i64x8())
+}
+
+/// Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_mul_epi64&expand=4601)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 {
+ simd_reduce_mul_unordered(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_set1_epi64(1).as_i64x8(),
+ ))
+}
+
/// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_mul_ps&expand=4606)
@@ -16997,6 +17032,28 @@ pub unsafe fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 {
))
}
+/// Reduce the packed signed 64-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epi64&expand=4578)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_max_epi64(a: __m512i) -> i64 {
+ simd_reduce_max(a.as_i64x8())
+}
+
+/// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_epi64&expand=4577)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 {
+ simd_reduce_max(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_set1_epi64(0).as_i64x8(),
+ ))
+}
+
/// Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epu32&expand=4580)
@@ -17019,6 +17076,28 @@ pub unsafe fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 {
))
}
+/// Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epu64&expand=4582)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_max_epu64(a: __m512i) -> u64 {
+ simd_reduce_max(a.as_u64x8())
+}
+
+/// Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_epu64&expand=4581)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 {
+ simd_reduce_max(simd_select_bitmask(
+ k,
+ a.as_u64x8(),
+ _mm512_set1_epi64(0).as_u64x8(),
+ ))
+}
+
/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_ps&expand=4586)
@@ -17085,6 +17164,28 @@ pub unsafe fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 {
))
}
+/// Reduce the packed signed 64-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epi64&expand=4590)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_min_epi64(a: __m512i) -> i64 {
+ simd_reduce_min(a.as_i64x8())
+}
+
+/// Reduce the packed signed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_epi64&expand=4589)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 {
+ simd_reduce_min(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_set1_epi64(0).as_i64x8(),
+ ))
+}
+
/// Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epu32&expand=4592)
@@ -17107,6 +17208,28 @@ pub unsafe fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 {
))
}
+/// Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epu64&expand=4594)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_min_epu64(a: __m512i) -> u64 {
+ simd_reduce_min(a.as_u64x8())
+}
+
+/// Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_epu64&expand=4593)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 {
+ simd_reduce_min(simd_select_bitmask(
+ k,
+ a.as_u64x8(),
+ _mm512_set1_epi64(0).as_u64x8(),
+ ))
+}
+
/// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_ps&expand=4598)
@@ -17191,6 +17314,29 @@ pub unsafe fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 {
))
}
+/// Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_and_epi64&expand=4566)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_and_epi64(a: __m512i) -> i64 {
+ simd_reduce_and(a.as_i64x8())
+}
+
+/// Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_and_epi64&expand=4565)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 {
+ simd_reduce_and(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_set1_epi64(1 << 0 | 1 << 1 | 1 << 2 | 1 << 3 | 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7)
+ .as_i64x8(),
+ ))
+}
+
/// Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_or_epi32&expand=4608)
@@ -17213,6 +17359,28 @@ pub unsafe fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 {
))
}
+/// Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_or_epi64&expand=4610)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_reduce_or_epi64(a: __m512i) -> i64 {
+ simd_reduce_or(a.as_i64x8())
+}
+
+/// Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_or_epi64&expand=4609)
+#[inline]
+#[target_feature(enable = "avx512f")]
+pub unsafe fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 {
+ simd_reduce_or(simd_select_bitmask(
+ k,
+ a.as_i64x8(),
+ _mm512_setzero_si512().as_i64x8(),
+ ))
+}
+
/// Returns vector of type `__m512d` with undefined elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd)
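A minimal sketch (illustrative only, same assumptions as the sketches above) of the masked reductions: only lanes whose mask bit is set take part in the horizontal fold, the remaining lanes being replaced by a neutral value for the operation:

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    unsafe fn masked_reduce_sketch() -> (i64, i64) {
        let a = _mm512_set1_epi64(2);
        let sum = _mm512_mask_reduce_add_epi64(0b11110000, a); // 4 active lanes of 2 -> 8
        let product = _mm512_mask_reduce_mul_epi64(0b11110000, a); // 2 * 2 * 2 * 2 -> 16
        (sum, product)
    }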
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index f6f80a323c..92c300ca17 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -5814,6 +5814,13 @@ mod tests {
assert_eq!(8, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_add_epi64() {
+ let a = _mm512_set1_epi64(1);
+ let e: i64 = _mm512_mask_reduce_add_epi64(0b11110000, a);
+ assert_eq!(4, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_reduce_add_pd() {
let a = _mm512_set1_pd(1.);
@@ -5828,6 +5835,20 @@ mod tests {
assert_eq!(4., e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_mul_epi64() {
+ let a = _mm512_set1_epi64(2);
+ let e: i64 = _mm512_reduce_mul_epi64(a);
+ assert_eq!(256, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_mul_epi64() {
+ let a = _mm512_set1_epi64(2);
+ let e: i64 = _mm512_mask_reduce_mul_epi64(0b11110000, a);
+ assert_eq!(16, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_reduce_mul_pd() {
let a = _mm512_set1_pd(2.);
@@ -5842,6 +5863,34 @@ mod tests {
assert_eq!(16., e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_max_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i64 = _mm512_reduce_max_epi64(a);
+ assert_eq!(7, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_max_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i64 = _mm512_mask_reduce_max_epi64(0b11110000, a);
+ assert_eq!(3, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_max_epu64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u64 = _mm512_reduce_max_epu64(a);
+ assert_eq!(7, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_max_epu64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u64 = _mm512_mask_reduce_max_epu64(0b11110000, a);
+ assert_eq!(3, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_reduce_max_pd() {
let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
@@ -5856,6 +5905,34 @@ mod tests {
assert_eq!(3., e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_min_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i64 = _mm512_reduce_min_epi64(a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_min_epi64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: i64 = _mm512_mask_reduce_min_epi64(0b11110000, a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_min_epu64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u64 = _mm512_reduce_min_epu64(a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_min_epu64() {
+ let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+ let e: u64 = _mm512_mask_reduce_min_epu64(0b11110000, a);
+ assert_eq!(0, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_reduce_min_pd() {
let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
@@ -5870,6 +5947,34 @@ mod tests {
assert_eq!(0., e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_and_epi64() {
+ let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2);
+ let e: i64 = _mm512_reduce_and_epi64(a);
+ assert_eq!(0, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_and_epi64() {
+ let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2);
+ let e: i64 = _mm512_mask_reduce_and_epi64(0b11110000, a);
+ assert_eq!(1, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_reduce_or_epi64() {
+ let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2);
+ let e: i64 = _mm512_reduce_or_epi64(a);
+ assert_eq!(3, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_reduce_or_epi64() {
+ let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2);
+ let e: i64 = _mm512_mask_reduce_or_epi64(0b11110000, a);
+ assert_eq!(1, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_extractf64x4_pd() {
let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
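For the bitwise reductions, a small sketch (illustrative, reusing the values from the tests above) showing the unmasked AND/OR folds over all eight 64-bit lanes:

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    unsafe fn bitwise_reduce_sketch() -> (i64, i64) {
        let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2);
        let and_all = _mm512_reduce_and_epi64(a); // 1 & 2 == 0
        let or_all = _mm512_reduce_or_epi64(a); // 1 | 2 == 3
        (and_all, or_all)
    }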
diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs
index b9aba9461a..d5096a6903 100644
--- a/crates/stdarch-verify/tests/x86-intel.rs
+++ b/crates/stdarch-verify/tests/x86-intel.rs
@@ -577,7 +577,22 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
| "_mm512_setr4_epi64"
| "_mm512_set_epi64"
| "_mm512_setr_epi64"
- | "_mm512_reduce_add_epi64" => true,
+ | "_mm512_reduce_add_epi64"
+ | "_mm512_mask_reduce_add_epi64"
+ | "_mm512_reduce_mul_epi64"
+ | "_mm512_mask_reduce_mul_epi64"
+ | "_mm512_reduce_max_epi64"
+ | "_mm512_mask_reduce_max_epi64"
+ | "_mm512_reduce_max_epu64"
+ | "_mm512_mask_reduce_max_epu64"
+ | "_mm512_reduce_min_epi64"
+ | "_mm512_mask_reduce_min_epi64"
+ | "_mm512_reduce_min_epu64"
+ | "_mm512_mask_reduce_min_epu64"
+ | "_mm512_reduce_and_epi64"
+ | "_mm512_mask_reduce_and_epi64"
+ | "_mm512_reduce_or_epi64"
+ | "_mm512_mask_reduce_or_epi64" => true,
// These return a 64-bit argument but they're assembled from other
// 32-bit registers, so these work on 32-bit just fine. See #308 for
From 1f662e9c324a330b44b5af2080ef4037e890cc0b Mon Sep 17 00:00:00 2001
From: jironglin
Date: Sun, 11 Oct 2020 17:41:25 +0000
Subject: [PATCH 05/25] fixupimm_round_ps
---
crates/core_arch/avx512f.md | 2 +-
crates/core_arch/src/x86/avx512f.rs | 176 ++++++++++++++++++++++++++++
2 files changed, 177 insertions(+), 1 deletion(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 4184997d23..742fd7bda1 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -162,7 +162,7 @@
* [ ] [`_mm512_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=5236)
* [ ] [`_mm512_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=5236)
* [ ] [`_mm512_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=5236)
- * [ ] [`_mm512_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=5236)
+ * [x] [`_mm512_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=5236)
* [x] [`_mm512_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pd&expand=5236)
* [x] [`_mm512_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ps&expand=5236)
* [x] [`_mm512_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 95c3794b09..38b3177e56 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -5500,6 +5500,95 @@ pub unsafe fn _mm512_maskz_scalef_round_pd(
transmute(r)
}
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_round_ps&expand=2505)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(3, 4)]
+pub unsafe fn _mm512_fixupimm_round_ps(
+ a: __m512,
+ b: __m512,
+ c: __m512i,
+ imm8: i32,
+ sae: i32,
+) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vfixupimmps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ c.as_i32x16(),
+ $imm8,
+ 0b11111111_11111111,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_round_ps&expand=2506)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(4, 5)]
+pub unsafe fn _mm512_mask_fixupimm_round_ps(
+ a: __m512,
+ k: __mmask16,
+ b: __m512,
+ c: __m512i,
+ imm8: i32,
+ sae: i32,
+) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vfixupimmps(a.as_f32x16(), b.as_f32x16(), c.as_i32x16(), $imm8, k, $imm4)
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_round_ps&expand=2507)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(4, 5)]
+pub unsafe fn _mm512_maskz_fixupimm_round_ps(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ c: __m512i,
+ imm8: i32,
+ sae: i32,
+) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vfixupimmps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ c.as_i32x16(),
+ $imm8,
+ 0b11111111_11111111,
+ $imm4,
+ )
+ };
+ }
+ let r: f32x16 = constify_imm8_roundscale!(imm8, sae, call);
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
/// _MM_MANT_NORM_1_2 // interval [1, 2)
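A hedged usage sketch for the fixupimm family (illustrative only, mirroring the test arguments): roughly speaking, each element of c acts as a 32-bit table whose 4-bit nibbles select the fix-up response for one input classification of b (NaN, zero, one, ±infinity, negative, positive), while imm8 only controls which floating-point exception flags may be signalled:

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx512f")]
    unsafe fn fixupimm_round_sketch() -> __m512 {
        let a = _mm512_set1_ps(f32::NAN);
        let b = _mm512_set1_ps(f32::MAX);
        let c = _mm512_set1_epi32(i32::MAX);
        // With this table every lane is fixed up to zero, as in the tests below;
        // imm8 = 5 and _MM_FROUND_CUR_DIRECTION follow the test arguments.
        _mm512_fixupimm_round_ps(a, b, c, 5, _MM_FROUND_CUR_DIRECTION)
    }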
@@ -18038,6 +18127,11 @@ extern "C" {
#[link_name = "llvm.x86.avx512.mask.scalef.pd.512"]
fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.512"]
+ fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"]
+ fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8;
+
#[link_name = "llvm.x86.avx512.mask.getmant.ps.512"]
fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16;
#[link_name = "llvm.x86.avx512.mask.getmant.pd.512"]
@@ -21605,6 +21699,88 @@ mod tests {
assert_eq_m512(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fixupimm_round_ps() {
+ let a = _mm512_set1_ps(f32::NAN);
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_fixupimm_round_ps(a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_ps(0.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fixupimm_round_ps() {
+ let a = _mm512_set_ps(
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ );
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_mask_fixupimm_round_ps(
+ a,
+ 0b11111111_00000000,
+ b,
+ c,
+ 5,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fixupimm_round_ps() {
+ let a = _mm512_set_ps(
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ );
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_maskz_fixupimm_round_ps(
+ 0b11111111_00000000,
+ a,
+ b,
+ c,
+ 5,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_round_ps() {
let a = _mm512_set1_ps(10.);
From 942481b419976f19dbb61ba9c86c2b2b742a1d88 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Sun, 11 Oct 2020 20:19:48 +0000
Subject: [PATCH 06/25] fixupimm_round_pd
---
crates/core_arch/avx512f.md | 2 +-
crates/core_arch/src/x86/avx512f.rs | 93 +++++++++++++++++++++++---
crates/core_arch/src/x86_64/avx512f.rs | 20 ++++++
3 files changed, 106 insertions(+), 9 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 742fd7bda1..ab7519d643 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -778,7 +778,7 @@
* [ ] [`_mm512_maskz_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=5236)
* [ ] [`_mm512_maskz_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=5236)
* [ ] [`_mm512_maskz_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=5236)
- * [ ] [`_mm512_maskz_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=5236)
+ * [x] [`_mm512_maskz_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=5236)
* [x] [`_mm512_maskz_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pd&expand=5236)
* [x] [`_mm512_maskz_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ps&expand=5236)
* [x] [`_mm512_maskz_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 38b3177e56..a08845476d 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -5574,19 +5574,92 @@ pub unsafe fn _mm512_maskz_fixupimm_round_ps(
) -> __m512 {
macro_rules! call {
($imm8:expr, $imm4:expr) => {
- vfixupimmps(
- a.as_f32x16(),
- b.as_f32x16(),
- c.as_i32x16(),
+ vfixupimmpsz(a.as_f32x16(), b.as_f32x16(), c.as_i32x16(), $imm8, k, $imm4)
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_round_pd&expand=2502)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(3, 4)]
+pub unsafe fn _mm512_fixupimm_round_pd(
+ a: __m512d,
+ b: __m512d,
+ c: __m512i,
+ imm8: i32,
+ sae: i32,
+) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vfixupimmpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ c.as_i64x8(),
$imm8,
- 0b11111111_11111111,
+ 0b11111111,
$imm4,
)
};
}
- let r: f32x16 = constify_imm8_roundscale!(imm8, sae, call);
- let zero = _mm512_setzero_ps().as_f32x16();
- transmute(simd_select_bitmask(k, r, zero))
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_round_pd&expand=2503)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(4, 5)]
+pub unsafe fn _mm512_mask_fixupimm_round_pd(
+ a: __m512d,
+ k: __mmask8,
+ b: __m512d,
+ c: __m512i,
+ imm8: i32,
+ sae: i32,
+) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vfixupimmpd(a.as_f64x8(), b.as_f64x8(), c.as_i64x8(), $imm8, k, $imm4)
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_round_pd&expand=2504)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(4, 5)]
+pub unsafe fn _mm512_maskz_fixupimm_round_pd(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ c: __m512i,
+ imm8: i32,
+ sae: i32,
+) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vfixupimmpdz(a.as_f64x8(), b.as_f64x8(), c.as_i64x8(), $imm8, k, $imm4)
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
}
/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
@@ -18131,6 +18204,10 @@ extern "C" {
fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16;
#[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"]
fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.512"]
+ fn vfixupimmpsz(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16;
+ #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"]
+ fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8;
#[link_name = "llvm.x86.avx512.mask.getmant.ps.512"]
fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16;
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 92c300ca17..bb359f1e38 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -1055,6 +1055,26 @@ mod tests {
assert_eq_m512d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fixupimm_round_pd() {
+ let a = _mm512_set1_pd(f64::NAN);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_fixupimm_round_pd(a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_pd(0.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fixupimm_round_pd() {
+ let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_mask_fixupimm_round_pd(a, 0b11110000, b, c, 5, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_pd() {
let a = _mm512_set1_pd(10.);
From 66e2d9b59d18fd6fe8cb533bb9fc603f2a92114c Mon Sep 17 00:00:00 2001
From: jironglin
Date: Sun, 11 Oct 2020 22:35:17 +0000
Subject: [PATCH 07/25] fixupimm: ps,pd
---
crates/core_arch/avx512f.md | 20 +--
crates/core_arch/src/x86/avx512f.rs | 236 +++++++++++++++++++++++++
crates/core_arch/src/x86_64/avx512f.rs | 48 ++++-
3 files changed, 290 insertions(+), 14 deletions(-)
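Usage note for the fixupimm family: the _mask form takes (a, k, b, c, imm8) and copies from a where the mask bit is clear, the _maskz form takes (k, a, b, c, imm8) and zeroes those lanes, and the _round forms add a trailing sae argument (_MM_FROUND_NO_EXC suppresses exception reporting, _MM_FROUND_CUR_DIRECTION leaves it enabled). A rough call-shape sketch, not taken from the patch itself, with imm8 = 0 chosen arbitrarily:

    #[target_feature(enable = "avx512f")]
    unsafe fn fixupimm_call_shapes(a: __m512d, b: __m512d, c: __m512i, k: __mmask8) -> __m512d {
        let r0 = _mm512_fixupimm_pd(a, b, c, 0);                          // unmasked
        let r1 = _mm512_mask_fixupimm_pd(a, k, b, c, 0);                  // keep a where k bit is 0
        let r2 = _mm512_maskz_fixupimm_pd(k, a, b, c, 0);                 // zero where k bit is 0
        let r3 = _mm512_fixupimm_round_pd(a, b, c, 0, _MM_FROUND_NO_EXC); // suppress exceptions
        _mm512_add_pd(_mm512_add_pd(r0, r1), _mm512_add_pd(r2, r3))
    }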
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index ab7519d643..eafcd9291b 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -159,9 +159,9 @@
* [x] [`_mm512_extractf64x4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extractf64x4_pd&expand=5236)
* [x] [`_mm512_extracti32x4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti32x4_epi32&expand=5236)
* [x] [`_mm512_extracti64x4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti64x4_epi64&expand=5236)
- * [ ] [`_mm512_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=5236)
- * [ ] [`_mm512_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=5236)
- * [ ] [`_mm512_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=5236)
+ * [x] [`_mm512_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=5236)
+ * [x] [`_mm512_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=5236)
+ * [x] [`_mm512_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=5236)
* [x] [`_mm512_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=5236)
* [x] [`_mm512_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pd&expand=5236)
* [x] [`_mm512_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ps&expand=5236)
@@ -450,10 +450,10 @@
* [x] [`_mm512_mask_extractf64x4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extractf64x4_pd&expand=5236)
* [x] [`_mm512_mask_extracti32x4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti32x4_epi32&expand=5236)
* [x] [`_mm512_mask_extracti64x4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti64x4_epi64&expand=5236)
- * [ ] [`_mm512_mask_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_pd&expand=5236)
- * [ ] [`_mm512_mask_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_ps&expand=5236)
- * [ ] [`_mm512_mask_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_pd&expand=5236)
- * [ ] [`_mm512_mask_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_ps&expand=5236)
+ * [x] [`_mm512_mask_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_pd&expand=5236)
+ * [x] [`_mm512_mask_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_ps&expand=5236)
+ * [x] [`_mm512_mask_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_pd&expand=5236)
+ * [x] [`_mm512_mask_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_ps&expand=5236)
* [x] [`_mm512_mask_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pd&expand=5236)
* [x] [`_mm512_mask_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ps&expand=5236)
* [x] [`_mm512_mask_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pd&expand=5236)
@@ -775,9 +775,9 @@
* [x] [`_mm512_maskz_extractf64x4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extractf64x4_pd&expand=5236)
* [x] [`_mm512_maskz_extracti32x4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti32x4_epi32&expand=5236)
* [x] [`_mm512_maskz_extracti64x4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti64x4_epi64&expand=5236)
- * [ ] [`_mm512_maskz_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=5236)
- * [ ] [`_mm512_maskz_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=5236)
- * [ ] [`_mm512_maskz_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=5236)
+ * [x] [`_mm512_maskz_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=5236)
+ * [x] [`_mm512_maskz_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=5236)
+ * [x] [`_mm512_maskz_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=5236)
* [x] [`_mm512_maskz_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=5236)
* [x] [`_mm512_maskz_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pd&expand=5236)
* [x] [`_mm512_maskz_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ps&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index a08845476d..164630ff51 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -2244,6 +2244,174 @@ pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m
))
}
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_ps&expand=2499)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_fixupimm_ps(a: __m512, b: __m512, c: __m512i, imm8: i32) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vfixupimmps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ c.as_i32x16(),
+ $imm8,
+ 0b11111111_11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_ps&expand=2500)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_fixupimm_ps(
+ a: __m512,
+ k: __mmask16,
+ b: __m512,
+ c: __m512i,
+ imm8: i32,
+) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vfixupimmps(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ c.as_i32x16(),
+ $imm8,
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_ps&expand=2501)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_maskz_fixupimm_ps(
+ k: __mmask16,
+ a: __m512,
+ b: __m512,
+ c: __m512i,
+ imm8: i32,
+) -> __m512 {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vfixupimmpsz(
+ a.as_f32x16(),
+ b.as_f32x16(),
+ c.as_i32x16(),
+ $imm8,
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_pd&expand=2490)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_fixupimm_pd(a: __m512d, b: __m512d, c: __m512i, imm8: i32) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vfixupimmpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ c.as_i64x8(),
+ $imm8,
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_pd&expand=2491)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_fixupimm_pd(
+ a: __m512d,
+ k: __mmask8,
+ b: __m512d,
+ c: __m512i,
+ imm8: i32,
+) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vfixupimmpd(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ c.as_i64x8(),
+ $imm8,
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_pd&expand=2492)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_maskz_fixupimm_pd(
+ k: __mmask8,
+ a: __m512d,
+ b: __m512d,
+ c: __m512i,
+ imm8: i32,
+) -> __m512d {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vfixupimmpdz(
+ a.as_f64x8(),
+ b.as_f64x8(),
+ c.as_i64x8(),
+ $imm8,
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
/// _MM_MANT_NORM_1_2 // interval [1, 2)
@@ -20342,6 +20510,74 @@ mod tests {
assert_eq_m512(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fixupimm_ps() {
+ let a = _mm512_set1_ps(f32::NAN);
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_fixupimm_ps(a, b, c, 5);
+ let e = _mm512_set1_ps(0.0);
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fixupimm_ps() {
+ let a = _mm512_set_ps(
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ );
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_mask_fixupimm_ps(a, 0b11111111_00000000, b, c, 5);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
+ );
+ assert_eq_m512(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fixupimm_ps() {
+ let a = _mm512_set_ps(
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ f32::NAN,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ 1.,
+ );
+ let b = _mm512_set1_ps(f32::MAX);
+ let c = _mm512_set1_epi32(i32::MAX);
+ let r = _mm512_maskz_fixupimm_ps(0b11111111_00000000, a, b, c, 5);
+ let e = _mm512_set_ps(
+ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
+ );
+ assert_eq_m512(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_ps() {
let a = _mm512_set1_ps(10.);
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index bb359f1e38..734470a9bb 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -1056,25 +1056,35 @@ mod tests {
}
#[simd_test(enable = "avx512f")]
- unsafe fn test_mm512_fixupimm_round_pd() {
+ unsafe fn test_mm512_fixupimm_pd() {
let a = _mm512_set1_pd(f64::NAN);
let b = _mm512_set1_pd(f64::MAX);
let c = _mm512_set1_epi64(i32::MAX as i64);
- let r = _mm512_fixupimm_round_pd(a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
+ let r = _mm512_fixupimm_pd(a, b, c, 5);
let e = _mm512_set1_pd(0.0);
assert_eq_m512d(r, e);
}
#[simd_test(enable = "avx512f")]
- unsafe fn test_mm512_mask_fixupimm_round_pd() {
+ unsafe fn test_mm512_mask_fixupimm_pd() {
let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
let b = _mm512_set1_pd(f64::MAX);
let c = _mm512_set1_epi64(i32::MAX as i64);
- let r = _mm512_mask_fixupimm_round_pd(a, 0b11110000, b, c, 5, _MM_FROUND_CUR_DIRECTION);
+ let r = _mm512_mask_fixupimm_pd(a, 0b11110000, b, c, 5);
let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.);
assert_eq_m512d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fixupimm_pd() {
+ let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_maskz_fixupimm_pd(0b11110000, a, b, c, 5);
+ let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_pd() {
let a = _mm512_set1_pd(10.);
@@ -2839,6 +2849,36 @@ mod tests {
assert_eq_m512d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_fixupimm_round_pd() {
+ let a = _mm512_set1_pd(f64::NAN);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_fixupimm_round_pd(a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set1_pd(0.0);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_fixupimm_round_pd() {
+ let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_mask_fixupimm_round_pd(a, 0b11110000, b, c, 5, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.);
+ assert_eq_m512d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_fixupimm_round_pd() {
+ let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.);
+ let b = _mm512_set1_pd(f64::MAX);
+ let c = _mm512_set1_epi64(i32::MAX as i64);
+ let r = _mm512_maskz_fixupimm_round_pd(0b11110000, a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.);
+ assert_eq_m512d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_round_pd() {
let a = _mm512_set1_pd(10.);
From aa32a1f233dbba9b789388ce1f1cfa2ae602a427 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Sun, 11 Oct 2020 23:51:44 +0000
Subject: [PATCH 08/25] ternarylogic_epi32
---
crates/core_arch/avx512f.md | 6 +-
crates/core_arch/src/x86/avx512f.rs | 103 ++++++++++++++++++++++++++++
2 files changed, 106 insertions(+), 3 deletions(-)
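How the vpternlog imm8 works: for every bit position, the bits of a, b and c form a 3-bit index (a supplies the high bit) and the bit of imm8 at that index becomes the output bit. A scalar sketch of one 32-bit lane, not part of the patch; 0x96 encodes a ^ b ^ c, 0xE8 the bitwise majority, and the 8 used in the tests encodes !a & b & c:

    // Illustrative scalar model of one 32-bit lane of vpternlogd.
    fn ternlog32(a: u32, b: u32, c: u32, imm8: u8) -> u32 {
        let mut out = 0u32;
        for bit in 0..32 {
            // Truth-table index built from the corresponding bits of a, b and c.
            let idx = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
            out |= (((imm8 as u32) >> idx) & 1) << bit;
        }
        out
    }

    fn main() {
        assert_eq!(ternlog32(0b1100, 0b1010, 0b0110, 0x96), 0b1100 ^ 0b1010 ^ 0b0110); // three-way XOR
        assert_eq!(ternlog32(0b1100, 0b1010, 0b0110, 0xE8), 0b1110);                   // bitwise majority
    }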
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index eafcd9291b..055e3df356 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -662,7 +662,7 @@
* [x] [`_mm512_mask_sub_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ps&expand=5236)
* [x] [`_mm512_mask_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_pd&expand=5236)
* [x] [`_mm512_mask_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5236)
- * [ ] [`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236)
+ * [x] [`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236)
* [ ] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236)
* [ ] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236)
* [ ] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236)
@@ -926,7 +926,7 @@
* [x] [`_mm512_maskz_sub_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ps&expand=5236)
* [x] [`_mm512_maskz_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_pd&expand=5236)
* [x] [`_mm512_maskz_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ps&expand=5236)
- * [ ] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236)
+ * [x] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236)
* [ ] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236)
* [x] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236)
* [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236)
@@ -1106,7 +1106,7 @@
* [x] [`_mm512_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_pd&expand=5236)
* [x] [`_mm512_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ps&expand=5236)
* [ ] [`_mm512_svml_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_svml_round_pd&expand=5236)
- * [ ] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236)
+ * [x] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236)
* [ ] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236)
* [ ] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236)
* [ ] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 164630ff51..080de6a1cc 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -2412,6 +2412,70 @@ pub unsafe fn _mm512_maskz_fixupimm_pd(
transmute(r)
}
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_ternarylogic_epi32&expand=5867)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_ternarylogic_epi32(a: __m512i, b: __m512i, c: __m512i, imm8: i32) -> __m512i {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vpternlogd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16(), $imm8)
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_ternarylogic_epi32&expand=5865)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_ternarylogic_epi32(
+ src: __m512i,
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+ imm8: i32,
+) -> __m512i {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vpternlogd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16(), $imm8)
+ };
+ }
+ let ternarylogic = constify_imm8_sae!(imm8, call);
+ transmute(simd_select_bitmask(k, ternarylogic, src.as_i32x16()))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_ternarylogic_epi32&expand=5866)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_maskz_ternarylogic_epi32(
+ k: __mmask16,
+ a: __m512i,
+ b: __m512i,
+ c: __m512i,
+ imm8: i32,
+) -> __m512i {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vpternlogd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16(), $imm8)
+ };
+ }
+ let ternarylogic = constify_imm8_sae!(imm8, call);
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, ternarylogic, zero))
+}
+
/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
/// _MM_MANT_NORM_1_2 // interval [1, 2)
@@ -18377,6 +18441,11 @@ extern "C" {
#[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"]
fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8;
+ #[link_name = "llvm.x86.avx512.pternlog.d.512"]
+ fn vpternlogd(a: i32x16, b: i32x16, c: i32x16, sae: i32) -> i32x16;
+ #[link_name = "llvm.x86.avx512.pternlog.q.512"]
+ fn vpternlogq(a: i64x8, b: i64x8, c: i64x8, sae: i32) -> i64x8;
+
#[link_name = "llvm.x86.avx512.mask.getmant.ps.512"]
fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16;
#[link_name = "llvm.x86.avx512.mask.getmant.pd.512"]
@@ -20578,6 +20647,40 @@ mod tests {
assert_eq_m512(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_ternarylogic_epi32() {
+ let a = _mm512_set1_epi32(1 << 2);
+ let b = _mm512_set1_epi32(1 << 1);
+ let c = _mm512_set1_epi32(1 << 0);
+ let r = _mm512_ternarylogic_epi32(a, b, c, 8);
+ let e = _mm512_set1_epi32(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_ternarylogic_epi32() {
+ let src = _mm512_set1_epi32(1 << 2);
+ let a = _mm512_set1_epi32(1 << 1);
+ let b = _mm512_set1_epi32(1 << 0);
+ let r = _mm512_mask_ternarylogic_epi32(src, 0, a, b, 8);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_ternarylogic_epi32(src, 0b11111111_11111111, a, b, 8);
+ let e = _mm512_set1_epi32(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_ternarylogic_epi32() {
+ let a = _mm512_set1_epi32(1 << 2);
+ let b = _mm512_set1_epi32(1 << 1);
+ let c = _mm512_set1_epi32(1 << 0);
+ let r = _mm512_maskz_ternarylogic_epi32(0, a, b, c, 9);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_ternarylogic_epi32(0b11111111_11111111, a, b, c, 8);
+ let e = _mm512_set1_epi32(0);
+ assert_eq_m512i(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_ps() {
let a = _mm512_set1_ps(10.);
From a25eaf3022bf1f9283a44a7a2c5b5caaf2c31680 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Mon, 12 Oct 2020 00:03:59 +0000
Subject: [PATCH 09/25] ternarylogic_epi64
---
crates/core_arch/avx512f.md | 6 +--
crates/core_arch/src/x86/avx512f.rs | 66 ++++++++++++++++++++++++++
crates/core_arch/src/x86_64/avx512f.rs | 34 +++++++++++++
3 files changed, 103 insertions(+), 3 deletions(-)
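The epi64 variant applies the same per-bit truth table, with masking at 64-bit lane granularity. A test-style sketch, not part of the patch, using imm8 = 0x96 (three-way XOR) and the helpers already used in this test file:

    #[simd_test(enable = "avx512f")]
    unsafe fn ternarylogic_epi64_xor3_sketch() {
        let a = _mm512_set1_epi64(0b1100);
        let b = _mm512_set1_epi64(0b1010);
        let c = _mm512_set1_epi64(0b0001);
        // Output bit is 1 where an odd number of the three input bits are 1.
        let r = _mm512_ternarylogic_epi64(a, b, c, 0x96);
        assert_eq_m512i(r, _mm512_set1_epi64(0b0111));
    }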
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 055e3df356..443adee4b9 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -663,7 +663,7 @@
* [x] [`_mm512_mask_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_pd&expand=5236)
* [x] [`_mm512_mask_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5236)
* [x] [`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236)
- * [ ] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236)
+ * [x] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236)
* [ ] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236)
* [ ] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236)
* [ ] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236)
@@ -927,7 +927,7 @@
* [x] [`_mm512_maskz_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_pd&expand=5236)
* [x] [`_mm512_maskz_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ps&expand=5236)
* [x] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236)
- * [ ] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236)
+ * [x] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236)
* [x] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236)
* [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236)
* [x] [`_mm512_maskz_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=5236)
@@ -1107,7 +1107,7 @@
* [x] [`_mm512_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ps&expand=5236)
* [ ] [`_mm512_svml_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_svml_round_pd&expand=5236)
* [x] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236)
- * [ ] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236)
+ * [x] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236)
* [ ] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236)
* [ ] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236)
* [ ] [`_mm512_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 080de6a1cc..c7156b6d4c 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -2476,6 +2476,72 @@ pub unsafe fn _mm512_maskz_ternarylogic_epi32(
transmute(simd_select_bitmask(k, ternarylogic, zero))
}
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_ternarylogic_epi64&expand=5876)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm512_ternarylogic_epi64(a: __m512i, b: __m512i, c: __m512i, imm8: i32) -> __m512i {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vpternlogq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8(), $imm8)
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_ternarylogic_epi64&expand=5874)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_mask_ternarylogic_epi64(
+ src: __m512i,
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+ imm8: i32,
+) -> __m512i {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vpternlogq(src.as_i64x8(), a.as_i64x8(), b.as_i64x8(), $imm8)
+ };
+ }
+ let ternarylogic = constify_imm8_sae!(imm8, call);
+ transmute(simd_select_bitmask(k, ternarylogic, src.as_i64x8()))
+}
+
+/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_ternarylogic_epi64&expand=5875)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm512_maskz_ternarylogic_epi64(
+ k: __mmask8,
+ a: __m512i,
+ b: __m512i,
+ c: __m512i,
+ imm8: i32,
+) -> __m512i {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vpternlogq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8(), $imm8)
+ };
+ }
+ let ternarylogic = constify_imm8_sae!(imm8, call);
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, ternarylogic, zero))
+}
+
+
/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
/// _MM_MANT_NORM_1_2 // interval [1, 2)
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 734470a9bb..3f7d67dc04 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -1085,6 +1085,40 @@ mod tests {
assert_eq_m512d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_ternarylogic_epi64() {
+ let a = _mm512_set1_epi64(1 << 2);
+ let b = _mm512_set1_epi64(1 << 1);
+ let c = _mm512_set1_epi64(1 << 0);
+ let r = _mm512_ternarylogic_epi64(a, b, c, 8);
+ let e = _mm512_set1_epi64(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_ternarylogic_epi64() {
+ let src = _mm512_set1_epi64(1 << 2);
+ let a = _mm512_set1_epi64(1 << 1);
+ let b = _mm512_set1_epi64(1 << 0);
+ let r = _mm512_mask_ternarylogic_epi64(src, 0, a, b, 8);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_ternarylogic_epi64(src, 0b11111111, a, b, 8);
+ let e = _mm512_set1_epi64(0);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_ternarylogic_epi64() {
+ let a = _mm512_set1_epi64(1 << 2);
+ let b = _mm512_set1_epi64(1 << 1);
+ let c = _mm512_set1_epi64(1 << 0);
+ let r = _mm512_maskz_ternarylogic_epi64(0, a, b, c, 9);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_ternarylogic_epi64(0b11111111, a, b, c, 8);
+ let e = _mm512_set1_epi64(0);
+ assert_eq_m512i(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_getmant_pd() {
let a = _mm512_set1_pd(10.);
From 7624f5bfa772518bcf4ab8edd2e0fe236a4d4765 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Mon, 12 Oct 2020 21:16:37 +0000
Subject: [PATCH 10/25] int2mask, mask2int, stream: ps,pd,si512
---
crates/core_arch/avx512f.md | 10 +--
crates/core_arch/src/x86/avx512f.rs | 86 ++++++++++++++++++++++++++
crates/core_arch/src/x86/test.rs | 15 +++++
crates/core_arch/src/x86_64/avx512f.rs | 30 +++++++++
4 files changed, 136 insertions(+), 5 deletions(-)
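_mm512_int2mask and _mm512_mask2int are plain integer/mask conversions, and the stream intrinsics are non-temporal stores, so the destination genuinely needs the 64-byte alignment the doc comments call out. A small sketch, not part of the patch, combining both:

    #[simd_test(enable = "avx512f")]
    unsafe fn int2mask_and_stream_sketch() {
        // Round trip: the low 16 bits of the integer become the mask and come back unchanged.
        let k = _mm512_int2mask(0b1010_1010_1010_1010);
        assert_eq!(_mm512_mask2int(k), 0b1010_1010_1010_1010);

        // Non-temporal stores bypass the cache and require a 64-byte-aligned destination.
        #[repr(align(64))]
        struct Aligned([f32; 16]);
        let mut buf = Aligned([0.0; 16]);
        _mm512_stream_ps(buf.0.as_mut_ptr(), _mm512_set1_ps(1.0));
        assert_eq!(buf.0, [1.0; 16]);
    }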
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 443adee4b9..184fb92aa8 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -226,7 +226,7 @@
* [x] [`_mm512_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=5236)
* [x] [`_mm512_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=5236)
* [x] [`_mm512_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=5236)
- * [ ] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236)
+ * [x] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236)
* [x] [`_mm512_kand`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kand&expand=5236)
* [x] [`_mm512_kandn`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kandn&expand=5236)
* [x] [`_mm512_kmov`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kmov&expand=5236)
@@ -251,7 +251,7 @@
* [x] [`_mm512_mask2_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi64&expand=5236)
* [x] [`_mm512_mask2_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_pd&expand=5236)
* [x] [`_mm512_mask2_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_ps&expand=5236)
- * [ ] [`_mm512_mask2int`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2int&expand=5236)
+ * [x] [`_mm512_mask2int`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2int&expand=5236)
* [x] [`_mm512_mask3_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pd&expand=5236)
* [x] [`_mm512_mask3_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ps&expand=5236)
* [x] [`_mm512_mask3_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pd&expand=5236)
@@ -1096,9 +1096,9 @@
* [x] [`_mm512_storeu_epi64`]
* [x] [`_mm512_storeu_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_si512&expand=5236)
* [ ] [`_mm512_stream_load_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_load_si512&expand=5236)
- * [ ] [`_mm512_stream_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_pd&expand=5236)
- * [ ] [`_mm512_stream_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_ps&expand=5236)
- * [ ] [`_mm512_stream_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_si512&expand=5236)
+ * [x] [`_mm512_stream_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_pd&expand=5236)
+ * [x] [`_mm512_stream_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_ps&expand=5236)
+ * [x] [`_mm512_stream_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_si512&expand=5236)
* [x] [`_mm512_sub_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi32&expand=5236)
* [x] [`_mm512_sub_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi64&expand=5236)
* [x] [`_mm512_sub_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_pd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index c7156b6d4c..c5bfc267db 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -15812,6 +15812,61 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
transmute(r)
}
+/// Converts integer mask into bitmask, storing the result in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_int2mask&expand=3189)
+#[inline]
+#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw
+pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
+ assert!(mask >= 0);
+ let r: u16 = mask as u16;
+ transmute(r)
+}
+
+/// Converts bit mask k1 into an integer value, storing the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2int&expand=3544)
+#[inline]
+#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw
+#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
+pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
+ let r: i32 = k1 as i32;
+ transmute(r)
+}
+
+/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_ps&expand=5671)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovntps))]
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
+ intrinsics::nontemporal_store(mem_addr as *mut __m512, a);
+}
+
+/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_pd&expand=5667)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovntps))] // should be vmovntpd
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
+ intrinsics::nontemporal_store(mem_addr as *mut __m512d, a);
+}
+
+/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_si512&expand=5675)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovntps))] // should be vmovntdq
+#[allow(clippy::cast_ptr_alignment)]
+pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) {
+ intrinsics::nontemporal_store(mem_addr as *mut __m512i, a);
+}
+
/// Sets packed 32-bit integers in `dst` with the supplied values.
///
/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps)
@@ -27213,6 +27268,37 @@ mod tests {
assert_eq!(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_int2mask() {
+ let a: i32 = 0b11001100_00110011;
+ let r = _mm512_int2mask(a);
+ let e: u16 = 0b11001100_00110011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask2int() {
+ let k1: __mmask16 = 0b11001100_00110011;
+ let r = _mm512_mask2int(k1);
+ let e: i32 = 0b11001100_00110011;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_stream_ps() {
+        #[repr(align(64))]
+ struct Memory {
+ pub data: [f32; 16],
+ }
+ let a = _mm512_set1_ps(7.0);
+ let mut mem = Memory { data: [-1.0; 16] };
+
+ _mm512_stream_ps(&mut mem.data[0] as *mut f32, a);
+ for i in 0..16 {
+ assert_eq!(mem.data[i], get_m512(a, i));
+ }
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_reduce_add_epi32() {
let a = _mm512_set1_epi32(1);
diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs
index 594f680b73..9014d66fd0 100644
--- a/crates/core_arch/src/x86/test.rs
+++ b/crates/core_arch/src/x86/test.rs
@@ -71,6 +71,21 @@ pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 {
transmute::<_, [f32; 8]>(a)[idx]
}
+#[target_feature(enable = "avx512f")]
+pub unsafe fn get_m512(a: __m512, idx: usize) -> f32 {
+ transmute::<_, [f32; 16]>(a)[idx]
+}
+
+#[target_feature(enable = "avx512f")]
+pub unsafe fn get_m512d(a: __m512d, idx: usize) -> f64 {
+ transmute::<_, [f64; 8]>(a)[idx]
+}
+
+#[target_feature(enable = "avx512f")]
+pub unsafe fn get_m512i(a: __m512i, idx: usize) -> i64 {
+ transmute::<_, [i64; 8]>(a)[idx]
+}
+
// These intrinsics doesn't exist on x86 b/c it requires a 64-bit register,
// which doesn't exist on x86!
#[cfg(target_arch = "x86")]
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 3f7d67dc04..3d5b22c64b 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -6257,4 +6257,34 @@ mod tests {
_mm512_store_pd(&mut r as *mut _ as *mut f64, a);
assert_eq_m512d(r, a);
}
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_stream_pd() {
+ #[repr(align(64))]
+ struct Memory {
+ pub data: [f64; 8],
+ }
+ let a = _mm512_set1_pd(7.0);
+ let mut mem = Memory { data: [-1.0; 8] };
+
+ _mm512_stream_pd(&mut mem.data[0] as *mut f64, a);
+ for i in 0..8 {
+ assert_eq!(mem.data[i], get_m512d(a, i));
+ }
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_stream_si512() {
+ #[repr(align(64))]
+ struct Memory {
+ pub data: [i64; 8],
+ }
+ let a = _mm512_set1_epi64(7);
+ let mut mem = Memory { data: [-1; 8] };
+
+ _mm512_stream_si512(&mut mem.data[0] as *mut i64, a);
+ for i in 0..8 {
+ assert_eq!(mem.data[i], get_m512i(a, i));
+ }
+ }
}
From c0051783d8bb30d5f301d1d6fc2f694a91ff113f Mon Sep 17 00:00:00 2001
From: jironglin
Date: Mon, 12 Oct 2020 22:03:55 +0000
Subject: [PATCH 11/25] mask_set1: epi32,epi64, maskz_set1: epi32,epi64
---
crates/core_arch/avx512f.md | 6 +--
crates/core_arch/src/x86/avx512f.rs | 67 ++++++++++++++++++++++++
crates/core_arch/src/x86_64/avx512f.rs | 21 ++++++++
crates/stdarch-verify/tests/x86-intel.rs | 4 +-
4 files changed, 94 insertions(+), 4 deletions(-)
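The new set1 tests only exercise the all-zeros and all-ones masks; with a partial mask the broadcast lands only in the selected lanes while the rest come from src (or are zeroed in the maskz form). A sketch, not part of the patch, assuming mask bit 0 maps to the last argument of _mm512_set_epi64 as in the surrounding tests:

    #[simd_test(enable = "avx512f")]
    unsafe fn mask_set1_epi64_partial_mask_sketch() {
        let src = _mm512_set1_epi64(2);
        // Only lanes 0..4 (the four low mask bits) receive the broadcast value 11.
        let r = _mm512_mask_set1_epi64(src, 0b0000_1111, 11);
        let e = _mm512_set_epi64(2, 2, 2, 2, 11, 11, 11, 11);
        assert_eq_m512i(r, e);
    }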
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 184fb92aa8..2534fb7a5f 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -617,7 +617,7 @@
* [x] [`_mm512_mask_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=5236)
* [x] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236)
* [x] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236)
- * [ ] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236)
+ * [x] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236)
* [ ] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236)
* [x] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236)
* [x] [`_mm512_mask_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5236)
@@ -889,8 +889,8 @@
* [x] [`_mm512_maskz_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=5236)
* [x] [`_mm512_maskz_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=5236)
* [x] [`_mm512_maskz_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=5236)
- * [ ] [`_mm512_maskz_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=5236)
- * [ ] [`_mm512_maskz_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=5236)
+ * [x] [`_mm512_maskz_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=5236)
+ * [x] [`_mm512_maskz_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=5236)
* [x] [`_mm512_maskz_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5236)
* [x] [`_mm512_maskz_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32x4&expand=5236)
* [x] [`_mm512_maskz_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index c5bfc267db..18b0568f08 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -15990,6 +15990,29 @@ pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i {
transmute(i32x16::splat(a))
}
+/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_set1_epi32&expand=4951)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcastd))]
+pub unsafe fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i {
+ let r = _mm512_set1_epi32(a).as_i32x16();
+ transmute(simd_select_bitmask(k, r, src.as_i32x16()))
+}
+
+/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_set1_epi32&expand=4952)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcastd))]
+pub unsafe fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i {
+ let r = _mm512_set1_epi32(a).as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
/// Broadcast 64-bit integer `a` to all elements of `dst`.
#[inline]
#[target_feature(enable = "avx512f")]
@@ -15997,6 +16020,29 @@ pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i {
transmute(i64x8::splat(a))
}
+/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_set1_epi64&expand=4959)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcastq))]
+pub unsafe fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i {
+ let r = _mm512_set1_epi64(a).as_i64x8();
+ transmute(simd_select_bitmask(k, r, src.as_i64x8()))
+}
+
+/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_set1_epi64&expand=4960)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vpbroadcastq))]
+pub unsafe fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i {
+ let r = _mm512_set1_epi64(a).as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, r, zero))
+}
+
/// Set packed 64-bit integers in dst with the repeated 4 element sequence.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set4_epi64&expand=4983)
@@ -27669,4 +27715,25 @@ mod tests {
_mm512_store_ps(&mut r as *mut _ as *mut f32, a);
assert_eq_m512(r, a);
}
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_set1_epi32() {
+ let src = _mm512_set1_epi32(2);
+ let a: i32 = 11;
+ let r = _mm512_mask_set1_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_set1_epi32(src, 0b11111111_11111111, a);
+ let e = _mm512_set1_epi32(11);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_set1_epi32() {
+ let a: i32 = 11;
+ let r = _mm512_maskz_set1_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_set1_epi32(0b11111111_11111111, a);
+ let e = _mm512_set1_epi32(11);
+ assert_eq_m512i(r, e);
+ }
}
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 3d5b22c64b..3dbbc77a89 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -6287,4 +6287,25 @@ mod tests {
assert_eq!(mem.data[i], get_m512i(a, i));
}
}
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_set1_epi64() {
+ let src = _mm512_set1_epi64(2);
+ let a: i64 = 11;
+ let r = _mm512_mask_set1_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_set1_epi64(src, 0b11111111, a);
+ let e = _mm512_set1_epi64(11);
+ assert_eq_m512i(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_set1_epi64() {
+ let a: i64 = 11;
+ let r = _mm512_maskz_set1_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_set1_epi64(0b11111111, a);
+ let e = _mm512_set1_epi64(11);
+ assert_eq_m512i(r, e);
+ }
}
diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs
index d5096a6903..0e44f73a7f 100644
--- a/crates/stdarch-verify/tests/x86-intel.rs
+++ b/crates/stdarch-verify/tests/x86-intel.rs
@@ -592,7 +592,9 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> {
| "_mm512_reduce_and_epi64"
| "_mm512_mask_reduce_and_epi64"
| "_mm512_reduce_or_epi64"
- | "_mm512_mask_reduce_or_epi64" => true,
+ | "_mm512_mask_reduce_or_epi64"
+ | "_mm512_mask_set1_epi64"
+ | "_mm512_maskz_set1_epi64" => true,
// These return a 64-bit argument but they're assembled from other
// 32-bit registers, so these work on 32-bit just fine. See #308 for
From 4091be50ab71f34ae4f44813f40ecddf9e9315a2 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Mon, 12 Oct 2020 23:03:04 +0000
Subject: [PATCH 12/25] test_epi32_mask, test_epi64_mask, testn_epi32_mask,
testn_epi64_mask
---
crates/core_arch/avx512f.md | 16 +--
crates/core_arch/src/x86/avx512f.rs | 138 ++++++++++++++++++++++++-
crates/core_arch/src/x86_64/avx512f.rs | 40 +++++++
3 files changed, 185 insertions(+), 9 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 2534fb7a5f..43b2597510 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -664,10 +664,10 @@
* [x] [`_mm512_mask_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5236)
* [x] [`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236)
* [x] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236)
- * [ ] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236)
- * [ ] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236)
- * [ ] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236)
- * [ ] [`_mm512_mask_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5236)
+ * [x] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236)
+ * [x] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236)
+ * [x] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236)
+ * [x] [`_mm512_mask_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5236)
* [x] [`_mm512_mask_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=5236)
* [x] [`_mm512_mask_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=5236)
* [x] [`_mm512_mask_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=5236)
@@ -1108,10 +1108,10 @@
* [ ] [`_mm512_svml_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_svml_round_pd&expand=5236)
* [x] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236)
* [x] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236)
- * [ ] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236)
- * [ ] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236)
- * [ ] [`_mm512_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5236)
- * [ ] [`_mm512_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi64_mask&expand=5236)
+ * [x] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236)
+ * [x] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236)
+ * [x] [`_mm512_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5236)
+ * [x] [`_mm512_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi64_mask&expand=5236)
* [x] [`_mm512_undefined_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_epi32&expand=5236)
* [x] [`_mm512_undefined_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd&expand=5236)
* [x] [`_mm512_undefined_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ps&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 18b0568f08..1ef726fba5 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -15827,13 +15827,109 @@ pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2int&expand=3544)
#[inline]
-#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw
+#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
let r: i32 = k1 as i32;
transmute(r)
}
+/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_test_epi32_mask&expand=5890)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestmd))]
+pub unsafe fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ let and = _mm512_and_epi32(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpneq_epi32_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_test_epi32_mask&expand=5889)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestmd))]
+pub unsafe fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ let and = _mm512_and_epi32(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpneq_epi32_mask(k, and, zero)
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_test_epi64_mask&expand=5896)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestmq))]
+pub unsafe fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ let and = _mm512_and_epi64(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpneq_epi64_mask(and, zero)
+}
+
+/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_test_epi64_mask&expand=5895)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestmq))]
+pub unsafe fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ let and = _mm512_and_epi64(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpneq_epi64_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_testn_epi32_mask&expand=5921)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestnmd))]
+pub unsafe fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
+ let and = _mm512_and_epi32(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpeq_epi32_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_testn_epi32_mask&expand=5920)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestnmd))]
+pub unsafe fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
+ let and = _mm512_and_epi32(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpeq_epi32_mask(k, and, zero)
+}
+
+/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_testn_epi64_mask&expand=5927)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestnmq))]
+pub unsafe fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
+ let and = _mm512_and_epi64(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_cmpeq_epi64_mask(and, zero)
+}
+
+/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_testn_epi64_mask&expand=5926)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestnmq))]
+pub unsafe fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+ let and = _mm512_and_epi64(a, b);
+ let zero = _mm512_setzero_si512();
+ _mm512_mask_cmpeq_epi64_mask(k, and, zero)
+}
+
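The eight intrinsics above all reduce to the same per-lane predicate; a minimal scalar model of one 32-bit lane (illustrative only, helper names are hypothetical), where the mask_ variants additionally AND the resulting bit with the incoming mask k:

fn test_lane(a: i32, b: i32) -> bool {
    (a & b) != 0 // vptestmd: mask bit set when the AND is non-zero
}

fn testn_lane(a: i32, b: i32) -> bool {
    (a & b) == 0 // vptestnmd: mask bit set when the AND is zero
}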
/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_ps&expand=5671)
@@ -27330,6 +27426,46 @@ mod tests {
assert_eq!(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_test_epi32_mask() {
+ let a = _mm512_set1_epi32(1 << 0);
+ let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+ let r = _mm512_test_epi32_mask(a, b);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_test_epi32_mask() {
+ let a = _mm512_set1_epi32(1 << 0);
+ let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+ let r = _mm512_mask_test_epi32_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_test_epi32_mask(0b11111111_11111111, a, b);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_testn_epi32_mask() {
+ let a = _mm512_set1_epi32(1 << 0);
+ let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+ let r = _mm512_testn_epi32_mask(a, b);
+ let e: __mmask16 = 0b00000000_00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_testn_epi32_mask() {
+ let a = _mm512_set1_epi32(1 << 0);
+ let b = _mm512_set1_epi32(1 << 1);
+ let r = _mm512_mask_testn_epi32_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_testn_epi32_mask(0b11111111_11111111, a, b);
+ let e: __mmask16 = 0b11111111_11111111;
+ assert_eq!(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_stream_ps() {
#[repr(align(32))]
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 3dbbc77a89..758a2b563c 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -6258,6 +6258,46 @@ mod tests {
assert_eq_m512d(r, a);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_test_epi64_mask() {
+ let a = _mm512_set1_epi64(1 << 0);
+ let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+ let r = _mm512_test_epi64_mask(a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_test_epi64_mask() {
+ let a = _mm512_set1_epi64(1 << 0);
+ let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+ let r = _mm512_mask_test_epi64_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_test_epi64_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_testn_epi64_mask() {
+ let a = _mm512_set1_epi64(1 << 0);
+ let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+ let r = _mm512_testn_epi64_mask(a, b);
+ let e: __mmask8 = 0b00000000;
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_testn_epi64_mask() {
+ let a = _mm512_set1_epi64(1 << 0);
+ let b = _mm512_set1_epi64(1 << 1);
+ let r = _mm512_mask_testn_epi64_mask(0, a, b);
+ assert_eq!(r, 0);
+ let r = _mm512_mask_testn_epi64_mask(0b11111111, a, b);
+ let e: __mmask8 = 0b11111111;
+ assert_eq!(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_stream_pd() {
#[repr(align(64))]
From 71764a1817bee74ca790cf5257d431f3dcee52e4 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Mon, 12 Oct 2020 23:37:07 +0000
Subject: [PATCH 13/25] mask_mov: epi32,epi64,ps,pd; maskz_mov:
epi32,epi64,ps,pd
---
crates/core_arch/avx512f.md | 18 ++--
crates/core_arch/src/x86/avx512f.rs | 130 +++++++++++++++++++++++++
crates/core_arch/src/x86_64/avx512f.rs | 38 ++++++++
3 files changed, 177 insertions(+), 9 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 43b2597510..57b3266a0d 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -541,10 +541,10 @@
* [x] [`_mm512_mask_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ps&expand=5236)
* [x] [`_mm512_mask_min_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_pd&expand=5236)
* [x] [`_mm512_mask_min_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ps&expand=5236)
- * [ ] [`_mm512_mask_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi32&expand=5236)
- * [ ] [`_mm512_mask_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi64&expand=5236)
- * [ ] [`_mm512_mask_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_pd&expand=5236)
- * [ ] [`_mm512_mask_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_ps&expand=5236)
+ * [x] [`_mm512_mask_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi32&expand=5236)
+ * [x] [`_mm512_mask_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi64&expand=5236)
+ * [x] [`_mm512_mask_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_pd&expand=5236)
+ * [x] [`_mm512_mask_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_ps&expand=5236)
* [x] [`_mm512_mask_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movedup_pd&expand=5236)
* [x] [`_mm512_mask_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movehdup_ps&expand=5236)
* [x] [`_mm512_mask_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_moveldup_ps&expand=5236)
@@ -618,7 +618,7 @@
* [x] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236)
* [x] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236)
* [x] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236)
- * [ ] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236)
+ * [x] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236)
* [x] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236)
* [x] [`_mm512_mask_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5236)
* [x] [`_mm512_mask_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5236)
@@ -839,10 +839,10 @@
* [x] [`_mm512_maskz_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ps&expand=5236)
* [x] [`_mm512_maskz_min_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_pd&expand=5236)
* [x] [`_mm512_maskz_min_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ps&expand=5236)
- * [ ] [`_mm512_maskz_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi32&expand=5236)
- * [ ] [`_mm512_maskz_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi64&expand=5236)
- * [ ] [`_mm512_maskz_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_pd&expand=5236)
- * [ ] [`_mm512_maskz_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_ps&expand=5236)
+ * [x] [`_mm512_maskz_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi32&expand=5236)
+ * [x] [`_mm512_maskz_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi64&expand=5236)
+ * [x] [`_mm512_maskz_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_pd&expand=5236)
+ * [x] [`_mm512_maskz_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_ps&expand=5236)
* [x] [`_mm512_maskz_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movedup_pd&expand=5236)
* [x] [`_mm512_maskz_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movehdup_ps&expand=5236)
* [x] [`_mm512_maskz_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_moveldup_ps&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 1ef726fba5..2ebfcca5b9 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -135,6 +135,98 @@ pub unsafe fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m5
transmute(simd_select_bitmask(k, abs, src.as_f64x8()))
}
+/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_epi32&expand=3801)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqa32))]
+pub unsafe fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
+ let mov = a.as_i32x16();
+ transmute(simd_select_bitmask(k, mov, src.as_i32x16()))
+}
+
+/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_epi32&expand=3802)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqa32))]
+pub unsafe fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i {
+ let mov = a.as_i32x16();
+ let zero = _mm512_setzero_si512().as_i32x16();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_epi64&expand=3807)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqa64))]
+pub unsafe fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
+ let mov = a.as_i64x8();
+ transmute(simd_select_bitmask(k, mov, src.as_i64x8()))
+}
+
+/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_epi64&expand=3808)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovdqa64))]
+pub unsafe fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i {
+ let mov = a.as_i64x8();
+ let zero = _mm512_setzero_si512().as_i64x8();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_ps&expand=3825)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+pub unsafe fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
+ let mov = a.as_f32x16();
+ transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
+}
+
+/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_ps&expand=3826)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovaps))]
+pub unsafe fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 {
+ let mov = a.as_f32x16();
+ let zero = _mm512_setzero_ps().as_f32x16();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
+/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_pd&expand=3819)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovapd))]
+pub unsafe fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
+ let mov = a.as_f64x8();
+ transmute(simd_select_bitmask(k, mov, src.as_f64x8()))
+}
+
+/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_pd&expand=3820)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovapd))]
+pub unsafe fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d {
+ let mov = a.as_f64x8();
+ let zero = _mm512_setzero_pd().as_f64x8();
+ transmute(simd_select_bitmask(k, mov, zero))
+}
+
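One way to read mask_mov is as a lane-wise select; a hedged usage sketch, assuming the compare intrinsic _mm512_cmplt_epi32_mask from the same module (the helper name is hypothetical):

#[target_feature(enable = "avx512f")]
unsafe fn min_epi32_via_mask_mov(a: __m512i, b: __m512i) -> __m512i {
    let k = _mm512_cmplt_epi32_mask(a, b); // bit i set where a[i] < b[i]
    _mm512_mask_mov_epi32(b, k, a)         // take a where the bit is set, keep b elsewhere
}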
/// Add packed 32-bit integers in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_add_epi32&expand=100)
@@ -19110,6 +19202,44 @@ mod tests {
assert_eq_m512(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mov_epi32() {
+ let src = _mm512_set1_epi32(1);
+ let a = _mm512_set1_epi32(2);
+ let r = _mm512_mask_mov_epi32(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_mov_epi32(src, 0b11111111_11111111, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mov_epi32() {
+ let a = _mm512_set1_epi32(2);
+ let r = _mm512_maskz_mov_epi32(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mov_epi32(0b11111111_11111111, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mov_ps() {
+ let src = _mm512_set1_ps(1.);
+ let a = _mm512_set1_ps(2.);
+ let r = _mm512_mask_mov_ps(src, 0, a);
+ assert_eq_m512(r, src);
+ let r = _mm512_mask_mov_ps(src, 0b11111111_11111111, a);
+ assert_eq_m512(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mov_ps() {
+ let a = _mm512_set1_ps(2.);
+ let r = _mm512_maskz_mov_ps(0, a);
+ assert_eq_m512(r, _mm512_setzero_ps());
+ let r = _mm512_maskz_mov_ps(0b11111111_11111111, a);
+ assert_eq_m512(r, a);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_add_epi32() {
let a = _mm512_setr_epi32(
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 758a2b563c..1b7c4ee088 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -68,6 +68,44 @@ mod tests {
assert_eq_m512i(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mov_epi64() {
+ let src = _mm512_set1_epi64(1);
+ let a = _mm512_set1_epi64(2);
+ let r = _mm512_mask_mov_epi64(src, 0, a);
+ assert_eq_m512i(r, src);
+ let r = _mm512_mask_mov_epi64(src, 0b11111111, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mov_epi64() {
+ let a = _mm512_set1_epi64(2);
+ let r = _mm512_maskz_mov_epi64(0, a);
+ assert_eq_m512i(r, _mm512_setzero_si512());
+ let r = _mm512_maskz_mov_epi64(0b11111111, a);
+ assert_eq_m512i(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_mask_mov_pd() {
+ let src = _mm512_set1_pd(1.);
+ let a = _mm512_set1_pd(2.);
+ let r = _mm512_mask_mov_pd(src, 0, a);
+ assert_eq_m512d(r, src);
+ let r = _mm512_mask_mov_pd(src, 0b11111111, a);
+ assert_eq_m512d(r, a);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm512_maskz_mov_pd() {
+ let a = _mm512_set1_pd(2.);
+ let r = _mm512_maskz_mov_pd(0, a);
+ assert_eq_m512d(r, _mm512_setzero_pd());
+ let r = _mm512_maskz_mov_pd(0b11111111, a);
+ assert_eq_m512d(r, a);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm512_mask_add_epi64() {
let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32);
From be334bb202593eaf848600ee7fb49ab715ce6cd2 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Thu, 15 Oct 2020 13:31:26 +0000
Subject: [PATCH 14/25] mask_add: ss,sd
---
crates/core_arch/avx512f.md | 8 +-
crates/core_arch/src/x86/avx512f.rs | 120 ++
examples/connect5_14_avx512.rs | 2100 +++++++++++++++++++++++++++
3 files changed, 2224 insertions(+), 4 deletions(-)
create mode 100644 examples/connect5_14_avx512.rs
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 57b3266a0d..01328a43f8 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -1240,8 +1240,8 @@
* [ ] [`_mm_mask3_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ss&expand=5236)
* [ ] [`_mm_mask_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sd&expand=5236)
* [ ] [`_mm_mask_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_ss&expand=5236)
- * [ ] [`_mm_mask_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sd&expand=5236)
- * [ ] [`_mm_mask_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ss&expand=5236)
+ * [x] [`_mm_mask_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sd&expand=5236)
+ * [x] [`_mm_mask_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ss&expand=5236)
* [x] [`_mm_mask_cmp_round_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=5236)
* [x] [`_mm_mask_cmp_round_ss_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_ss_mask&expand=5236)
* [x] [`_mm_mask_cmp_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sd_mask&expand=5236)
@@ -1322,8 +1322,8 @@
* [ ] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236)
* [ ] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236)
* [ ] [`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236)
- * [ ] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236)
- * [ ] [`_mm_maskz_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ss&expand=5236)
+ * [x] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236)
+ * [x] [`_mm_maskz_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ss&expand=5236)
* [ ] [`_mm_maskz_cvt_roundsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_ss&expand=5236)
* [ ] [`_mm_maskz_cvt_roundss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sd&expand=5236)
* [ ] [`_mm_maskz_cvtsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_ss&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 2ebfcca5b9..7804a8ce44 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -18417,6 +18417,76 @@ pub unsafe fn _mm512_set_pd(
_mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0)
}
+/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss))]
+pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let extractsrc: f32 = simd_extract(src, 0);
+ let mut add: f32 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ add = extracta + extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_ss&expand=160)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss))]
+pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mut add: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ add = extracta + extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_sd&expand=155)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd))]
+pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let extractsrc: f64 = simd_extract(src, 0);
+ let mut add: f64 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ add = extracta + extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_sd&expand=156)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd))]
+pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mut add: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ add = extracta + extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
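For these scalar forms only bit 0 of k participates and the upper lanes are always copied from a; a minimal model of lane 0 (illustrative only, the helper name is hypothetical):

fn mask_add_lane0(src0: f32, k: u8, a0: f32, b0: f32) -> f32 {
    if k & 1 != 0 { a0 + b0 } else { src0 } // the maskz_ form uses 0.0 in place of src0
}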
/// Equal
pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00;
/// Less-than
@@ -28002,4 +28072,54 @@ mod tests {
let e = _mm512_set1_epi32(11);
assert_eq_m512i(r, e);
}
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_add_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_add_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_add_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 60.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_add_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_add_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_add_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 60.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_add_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_add_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_add_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 6.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_add_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_add_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_add_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 6.);
+ assert_eq_m128d(r, e);
+ }
}
diff --git a/examples/connect5_14_avx512.rs b/examples/connect5_14_avx512.rs
new file mode 100644
index 0000000000..696adfc70c
--- /dev/null
+++ b/examples/connect5_14_avx512.rs
@@ -0,0 +1,2100 @@
+#![feature(stdsimd, avx512_target_feature)]
+
+#[cfg(target_arch = "x86")]
+use {core_arch::arch::x86::*};
+#[cfg(target_arch = "x86_64")]
+use {core_arch::arch::x86_64::*};
+
+
+use rand::seq::SliceRandom;
+use rand::thread_rng;
+use rand::Rng;
+
+use std::cmp;
+
+use std::time::{Duration, Instant};
+
+// types
+
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum Color {
+ Black,
+ White,
+ Empty,
+}
+
+type Square = i32;
+type Move = i32;
+type Side = Color;
+type Piece = Color;
+
+// constants
+
+const FILE_SIZE: i32 = 15;
+const RANK_SIZE: i32 = 15;
+const SQUARE_SIZE: i32 = (FILE_SIZE + 4) * (FILE_SIZE + 4 * 2 ) + 4;
+
+const EVAL_INF: i32 = (FILE_SIZE * RANK_SIZE * 100);
+const MOVE_NONE: Move = -1;
+const SCORE_NONE: i32 = -EVAL_INF - 1;
+
+const ENDCHECK: [[i32; 4]; 20] = [ [-4, -3, -2, -1],
+ [-3, -2, -1, 1],
+ [-2, -1, 1, 2],
+ [-1, 1, 2, 3],
+ [ 1, 2, 3, 4],
+
+ [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 4 * (-FILE_SIZE - 4)],
+ [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4)],
+ [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4)],
+ [1 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4)],
+ [1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4), 4 * ( FILE_SIZE + 4)],
+
+ [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 4 * (-FILE_SIZE - 5)],
+ [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5)],
+ [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5)],
+ [1 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5)],
+ [1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5), 4 * ( FILE_SIZE + 5)],
+
+ [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 4 * (-FILE_SIZE - 3)],
+ [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3)],
+ [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3)],
+ [1 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3)],
+ [1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3), 4 * ( FILE_SIZE + 3)]
+ ];
+
+const PATTERNFILE4: [i32; 7] = [0, 1, 2, 3, 4, 5, 6];
+const PATTERNRANK4: [i32; 7] = [0, 1 * (FILE_SIZE + 4), 2 * (FILE_SIZE + 4), 3 * (FILE_SIZE + 4), 4 * (FILE_SIZE + 4), 5 * (FILE_SIZE + 4), 6 * (FILE_SIZE + 4)];
+const PATTERNDIAL4: [i32; 7] = [0, 1 * (FILE_SIZE + 5), 2 * (FILE_SIZE + 5), 3 * (FILE_SIZE + 5), 4 * (FILE_SIZE + 5), 5 * (FILE_SIZE + 5), 6 * (FILE_SIZE + 5)];
+const PATTERNDIAR4: [i32; 7] = [0, 1 * (FILE_SIZE + 3), 2 * (FILE_SIZE + 3), 3 * (FILE_SIZE + 3), 4 * (FILE_SIZE + 3), 5 * (FILE_SIZE + 3), 6 * (FILE_SIZE + 3)];
+
+const MAPMOVEVALUE: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17],
+
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31,
+ 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30,
+ 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29,
+ 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28,
+ 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27,
+ 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26,
+ 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25,
+ 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24,
+ 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23,
+ 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22,
+ 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21,
+ 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20,
+ 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19,
+ 0, 0, 0, 0, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18,
+ 0, 0, 0, 0, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17],
+
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 0, 0, 0,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 0, 0,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 0,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22,
+ 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21,
+ 0, 0, 0, 0, 0, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<20, 1<<20, 1<<20,
+ 0, 0, 0, 0, 0, 0, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<19, 1<<19,
+ 0, 0, 0, 0, 0, 0, 0, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<18,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17],
+
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31,
+ 0, 0, 0, 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<31,
+ 0, 0, 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<30, 1<<31,
+ 0, 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<29, 1<<30, 1<<31,
+ 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
+ 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
+ 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
+ 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
+ 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
+ 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
+ 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
+ 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 0,
+ 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 0, 0,
+ 0, 0, 0, 0, 1<<18, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 0, 0, 0,
+ 0, 0, 0, 0, 1<<17, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 0, 0, 0, 0]
+ ];
+
+const MAPMOVEIDX: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14],
+
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
+
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0,
+ 0, 0, 0, 0, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0,
+ 0, 0, 0, 0, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0,
+ 0, 0, 0, 0, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ 0, 0, 0, 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+ 0, 0, 0, 0, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3,
+ 0, 0, 0, 0, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
+ 0, 0, 0, 0, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
+ 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
+ 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
+ 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8,
+ 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9,
+ 0, 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10],
+
+ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 0, 0, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 0, 0, 0, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+ 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 0, 0, 0, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0,
+ 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0,
+ 0, 0, 0, 0, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0,
+ 0, 0, 0, 0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, 0]
+ ];
+
+// variables
+
+static mut Endgame: bool = false;
+
+// structures
+
+pub struct Pos { // position
+ state: [Color; SQUARE_SIZE as usize],
+ p_turn: Side,
+ p_last: Move,
+
+ bitboard: [[[i32; 20]; 4]; 3],
+
+}
+
+impl Pos {
+
+ pub fn init(&mut self) { // starting position
+ for i in 0..SQUARE_SIZE as usize {
+ self.state[i] = Color::Empty;
+ }
+
+ self.p_turn = Color::Black;
+ self.p_last = square_make(0, 0);
+
+ //--------------------------------------------
+
+ for i in 0..4 {
+ for j in 0..20 {
+ self.bitboard[Color::Black as usize][i][j] = 0;
+ }
+ }
+
+ for i in 0..4 {
+ for j in 0..20 {
+ self.bitboard[Color::White as usize][i][j] = 0;
+ }
+ }
+
+ for i in 0..2 {
+ for j in 0..20 {
+ self.bitboard[Color::Empty as usize][i][j] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17);
+ }
+ }
+
+ self.bitboard[Color::Empty as usize][2][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27);
+ self.bitboard[Color::Empty as usize][2][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26);
+ self.bitboard[Color::Empty as usize][2][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25);
+ self.bitboard[Color::Empty as usize][2][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24);
+ self.bitboard[Color::Empty as usize][2][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23);
+ self.bitboard[Color::Empty as usize][2][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22);
+ self.bitboard[Color::Empty as usize][2][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21);
+ self.bitboard[Color::Empty as usize][2][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20);
+ self.bitboard[Color::Empty as usize][2][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19);
+ self.bitboard[Color::Empty as usize][2][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18);
+ self.bitboard[Color::Empty as usize][2][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17);
+ self.bitboard[Color::Empty as usize][2][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18);
+ self.bitboard[Color::Empty as usize][2][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20);
+ self.bitboard[Color::Empty as usize][2][13] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21);
+ self.bitboard[Color::Empty as usize][2][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22);
+ self.bitboard[Color::Empty as usize][2][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23);
+ self.bitboard[Color::Empty as usize][2][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24);
+ self.bitboard[Color::Empty as usize][2][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25);
+ self.bitboard[Color::Empty as usize][2][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26);
+ self.bitboard[Color::Empty as usize][2][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27);
+
+ self.bitboard[Color::Empty as usize][3][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27);
+ self.bitboard[Color::Empty as usize][3][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26);
+ self.bitboard[Color::Empty as usize][3][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25);
+ self.bitboard[Color::Empty as usize][3][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24);
+ self.bitboard[Color::Empty as usize][3][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23);
+ self.bitboard[Color::Empty as usize][3][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22);
+ self.bitboard[Color::Empty as usize][3][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21);
+ self.bitboard[Color::Empty as usize][3][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20);
+ self.bitboard[Color::Empty as usize][3][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19);
+ self.bitboard[Color::Empty as usize][3][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18);
+ self.bitboard[Color::Empty as usize][3][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17);
+ self.bitboard[Color::Empty as usize][3][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18);
+ self.bitboard[Color::Empty as usize][3][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20);
+ self.bitboard[Color::Empty as usize][3][13] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21);
+ self.bitboard[Color::Empty as usize][3][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22);
+ self.bitboard[Color::Empty as usize][3][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23);
+ self.bitboard[Color::Empty as usize][3][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24);
+ self.bitboard[Color::Empty as usize][3][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25);
+ self.bitboard[Color::Empty as usize][3][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26);
+ self.bitboard[Color::Empty as usize][3][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27);
+ }
+
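+    // Play a stone for the side to move: update the square array, then keep the
+    // per-direction bitboards in sync by setting the move's bit in the mover's
+    // board and clearing the same bit in the Empty board. Finally record the move
+    // and hand the turn to the opponent.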
+ pub fn do_move(&mut self, mv: Move) {
+
+ let atk: Side = self.p_turn;
+ let def: Side = side_opp(atk);
+
+ match self.p_turn {
+ Color::Black => { self.state[mv as usize] = Color::Black;
+
+ for i in 0..4 {
+ self.bitboard[Color::Black as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= MAPMOVEVALUE[i][mv as usize];
+ self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize];
+ }
+ },
+
+ Color::White => { self.state[mv as usize] = Color::White;
+
+ for i in 0..4 {
+ self.bitboard[Color::White as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= MAPMOVEVALUE[i][mv as usize];
+ self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize];
+ }
+ },
+
+ Color::Empty => {},
+ }
+
+ self.p_last = mv;
+
+ self.p_turn = def;
+ }
+
+ fn turn(&self) -> Side {
+ self.p_turn
+ }
+
+ pub fn can_play(&self, from: Square) -> bool {
+
+        self.state[from as usize] == Color::Empty
+ }
+
+ pub fn count(&self, pc: Piece) -> i32 {
+
+ let mut n: i32 = 0;
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..FILE_SIZE {
+ let sq: Square = square_make(fl, rk);
+ if self.state[sq as usize] == pc { n += 1; }
+ }
+ }
+ n
+ }
+}
+
+pub struct List { // legal move list
+ p_move: [Move; (FILE_SIZE * RANK_SIZE) as usize],
+ p_size: i32,
+}
+
+impl List {
+
+ pub fn clear(&mut self) {
+ self.p_size = 0;
+ }
+
+ pub fn add(&mut self, mv: Move) {
+ self.p_move[self.p_size as usize] = mv;
+ self.p_size += 1;
+ }
+
+ pub fn size(&self) -> i32 {
+ self.p_size
+ }
+
+ pub fn shuffle(&mut self) {
+
+ let mut rng = thread_rng();
+
+ let num = self.p_size;
+
+        let mut new_move: Vec<Move> = vec![];
+
+ for x in 0..(num as usize) {
+ new_move.push(self.p_move[x]);
+ }
+
+ new_move.shuffle(&mut rng);
+
+ for x in 0..(self.p_size as usize) {
+ self.p_move[x] = new_move[x];
+ }
+ }
+
+ //pub fn move(&self, i: i32) -> Move {
+ // self.p_move[i as usize]
+ //}
+}
+
+// functions
+//
+fn square_make(fl: i32, rk: i32) -> Square {
+ (rk + 4) * (FILE_SIZE + 4) + (fl + 4)
+}
+
+fn square_file(sq: Square) -> i32 {
+ sq % (FILE_SIZE + 4) - 4
+}
+
+fn square_rank(sq: Square) -> i32 {
+ sq / (FILE_SIZE + 4) - 4
+}
+
+fn side_opp(sd: Side) -> Side {
+
+    match sd {
+        Side::White => Side::Black,
+        Side::Black => Side::White,
+        Side::Empty => panic!("side_opp called on an empty square"),
+    }
+}
+
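+// Check whether the side that just moved has five in a row. ENDCHECK holds, for
+// each of the four directions, the five windows of four offsets around p_last;
+// the last move itself is implicitly the fifth square of every window.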
+fn pos_is_winner(pos : &Pos) -> bool {
+
+ let current_side = side_opp(pos.p_turn);
+
+ let mut found : bool = true;
+
+ for x in 0..20 {
+ for y in 0..4 {
+
+ found = true;
+
+ let adj = pos.p_last + ENDCHECK[x][y];
+
+ if pos.state[adj as usize] != current_side { found = false; break }
+ }
+        if found { break; }
+ }
+
+ found
+}
+
+fn pos_is_winner_scan(pos : &Pos) -> bool {
+
+ let current_side = side_opp(pos.p_turn);
+
+ if check_patternfile5(&pos, current_side) ||
+ check_patternrank5(&pos, current_side) ||
+ check_patterndial5(&pos, current_side) ||
+ check_patterndiar5(&pos, current_side) { return true }
+
+ false
+}
+
+fn pos_is_draw(pos : &Pos) -> bool {
+
+
+    let mut found: bool = true;
+
+    'scan: for rk in 0..RANK_SIZE {
+        for fl in 0..FILE_SIZE {
+
+            let sq: Square = square_make(fl, rk);
+            if pos.can_play(sq) {
+                found = false;
+                break 'scan;
+            }
+        }
+    }
+/*
+
+ let mut test: bool = false;
+
+ if pos.bitboard[Color::Empty as usize][0][0] == 0 &&
+ pos.bitboard[Color::Empty as usize][0][1] == 0 &&
+ pos.bitboard[Color::Empty as usize][0][2] == 0 &&
+ pos.bitboard[Color::Empty as usize][0][3] == 0 &&
+ pos.bitboard[Color::Empty as usize][0][4] == 0 &&
+ pos.bitboard[Color::Empty as usize][0][5] == 0 &&
+ pos.bitboard[Color::Empty as usize][0][6] == 0 &&
+ pos.bitboard[Color::Empty as usize][0][7] == 0 &&
+ pos.bitboard[Color::Empty as usize][0][8] == 0 { test = true; } else { test = false; }
+*/
+ //if test != found { println!("bitboard!!!!!!!!!!!!!!!!!!!! pos_is_draw wrong!!!!!!!!!!!!!!"); }
+
+ let mut out: bool = false;
+
+ //if test && unsafe {!pos_is_winner_avx512(pos)} { out = true; }
+    if found && !pos_is_winner_scan(pos) { out = true; }
+
+ out
+}
+
+fn pos_is_end(pos : &Pos) -> bool {
+
+    pos_is_winner_scan(pos) || pos_is_draw(pos)
+}
+
+fn pos_disp(pos: &Pos) {
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..FILE_SIZE {
+
+ let sq: Square = square_make(fl, rk);
+
+ match pos.state[sq as usize] {
+ Color::Black => print!("# "),
+ Color::White => print!("O "),
+ Color::Empty => print!("- "),
+ }
+ }
+
+ println!("");
+ }
+
+ match pos.turn() {
+ Color::Black => println!("black to play"),
+ Color::White => println!("white to play"),
+ _ => (),
+ }
+}
+
+fn gen_moves(list : &mut List, pos: &Pos) {
+
+ list.clear();
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+ if pos.can_play(sq) { list.add(sq); }
+ }
+ }
+
+}
+
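+// Top-level search: the depth is clamped to the number of empty squares so the
+// final moves are searched exhaustively, and the global Endgame flag is raised
+// once the search reaches the end of the game.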
+fn search(pos : &Pos, depth: i32, endgame: i32) -> Move {
+
+ //println!("call search");
+
+ let mut new_depth = depth;
+
+ let empties: i32 = pos.count(Color::Empty);
+    if empties <= endgame || new_depth > empties { new_depth = empties; }
+
+    if new_depth == empties { unsafe { Endgame = true; } }
+
+ search_real(pos, -EVAL_INF, EVAL_INF, new_depth, 0)
+
+}
+
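+// Negamax over all legal moves with a simple beta cutoff. A position already won
+// by the player who just moved scores -EVAL_INF + ply (so quicker wins score
+// better), a full board scores 0, and other leaves are scored by eval(). At the
+// root (ply == 0) the best move is returned instead of the best score.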
+fn search_real(pos: &Pos, alpha: i32, beta: i32, depth: i32, ply: i32) -> i32 {
+
+
+ assert!(-EVAL_INF <= alpha && alpha < beta && beta <= EVAL_INF);
+ //println!("call search_real");
+ //println!("depth = {}", depth);
+ //println!("ply = {}", ply);
+ // leaf?
+
+ //if unsafe { pos_is_winner_avx512(&pos) } != pos_is_winner(&pos) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!!"); }
+ if pos_is_winner_scan(&pos) { return -EVAL_INF + ply }
+ //if unsafe { pos_is_winner_avx512(&pos) } { return -EVAL_INF + ply }
+
+
+ if pos_is_draw(&pos) { return 0 }
+
+ if depth == 0 {
+ return eval(&pos, ply)
+ }
+
+ let p_move_new : [Move; (FILE_SIZE * RANK_SIZE) as usize] = [0; (FILE_SIZE * RANK_SIZE) as usize];
+
+ let mut list = List {
+ p_move: p_move_new,
+ p_size: 0,
+ };
+
+ let mut bm: Move = MOVE_NONE;
+ let mut bs: i32 = SCORE_NONE;
+
+ gen_moves(&mut list, &pos);
+
+ // move loop
+
+ if ply == 0 { list.shuffle(); }
+
+ for i in 0..list.size() {
+
+ if bs < beta {
+
+ let mv: Move = list.p_move[i as usize];
+
+ let mut new_pos = Pos {
+ state: pos.state,
+ p_turn: pos.p_turn,
+ p_last: pos.p_last,
+
+ bitboard: pos.bitboard,
+ };
+
+ //println!("p_last = {}", new_pos.p_last);
+
+ new_pos.do_move(mv);
+
+ //println!("After do _move p_last = {}", new_pos.p_last);
+
+ let sc: i32 = -search_real(&new_pos, -beta, -cmp::max(alpha, bs), depth - 1, ply + 1);
+
+
+ //if sc >= 410 || sc <= -410 {
+ //println!("sc = {} depth = {}-------------------------------", sc, depth);
+
+ //pos_disp(&new_pos);
+ //}
+
+
+ if sc > bs { bm = mv; bs = sc; }
+
+ }
+ }
+
+ assert!(bm != MOVE_NONE);
+ assert!(bs >= -EVAL_INF && bs <= EVAL_INF);
+
+ if ply == 0 { bm } else { bs } //best move at the root node, best score elsewhere
+ //bs
+}
+
+fn result(pos: &Pos) -> i32 {
+
+    if pos_is_winner_scan(pos) {
+ -(FILE_SIZE*RANK_SIZE*100)
+ } else {
+ 0
+ }
+}
+
+
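+// Pattern-based static evaluation. The scalar scans below look for open fours,
+// "dead" fours (four stones plus one empty in a five-square window) and open
+// threes for both sides; forcing patterns return fixed scores and everything
+// else evaluates to 0. The commented-out blocks cross-check the AVX-512 scans
+// against the scalar ones.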
+fn eval(pos: &Pos, ply: i32) -> i32 {
+
+ let atk: Side = pos.turn();
+ let def: Side = side_opp(atk);
+
+ //let mut sc: i32 = 0;
+
+ let check_live4: Side = def;
+ let check_live4_opp: Side = atk;
+
+ //if ply % 2 == 1 { check_live4 = def; check_live4_opp = atk; } else { check_live4 = atk; check_live4_opp = def; }
+ //if ply % 2 == 0 { check_live4 = def; check_live4_opp = atk; } else { check_live4 = atk; check_live4_opp = def; }
+/*
+ if unsafe { check_pattern4_once_avx512(&pos, check_live4) } != (check_patternfile4_once(&pos, check_live4) || check_patternrank4_once(&pos, check_live4) || check_patterndial4_once(&pos, check_live4) || check_patterndiar4_once(&pos, check_live4) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! self "); pos_disp(&pos); }
+ if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } != (check_patternfile4_once(&pos, check_live4_opp) || check_patternrank4_once(&pos, check_live4_opp) || check_patterndial4_once(&pos, check_live4_opp) || check_patterndiar4_once(&pos, check_live4_opp) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! opp "); pos_disp(&pos); }
+
+ #[target_feature(enable = "avx512f")]
+ unsafe {
+ let result = check_pattern4_dead_avx512(&pos, check_live4_opp);
+
+ let mut temp_check: [__mmask16; 5] = [0; 5];
+
+ for i in 0..5 {
+
+ let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]);
+ let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]);
+ let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]);
+ let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]);
+ let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]);
+ let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]);
+ let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]);
+ let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]);
+ let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]);
+ let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]);
+
+ let check_mask10 = _kor_mask16(check_mask0, check_mask1);
+ let check_mask11 = _kor_mask16(check_mask2, check_mask3);
+ let check_mask12 = _kor_mask16(check_mask4, check_mask5);
+ let check_mask13 = _kor_mask16(check_mask6, check_mask7);
+ let check_mask14 = _kor_mask16(check_mask8, check_mask9);
+
+ let check_mask16 = _kor_mask16(check_mask10, check_mask11);
+ let check_mask17 = _kor_mask16(check_mask12, check_mask13);
+ let check_mask18 = _kor_mask16(check_mask16, check_mask17);
+ temp_check[i] = _kor_mask16(check_mask18, check_mask14);
+
+ }
+
+ let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]);
+ let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]);
+ let check_mask2 = _kor_mask16(check_mask0, check_mask1);
+ let check_mask3 = _kor_mask16(check_mask2, temp_check[4]);
+
+ let test1: bool = check_patternfile4_dead(&pos, check_live4_opp) || check_patternrank4_dead(&pos, check_live4_opp) || check_patterndial4_dead(&pos, check_live4_opp) || check_patterndiar4_dead(&pos, check_live4_opp);
+
+ let mut test2: bool = true;
+
+ if check_mask3 > 0 { test2 = true; } else { test2 = false; }
+
+ if test1 != test2 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead !!!!!! opp "); pos_disp(&pos); }
+ }
+
+ #[target_feature(enable = "avx512f")]
+ unsafe {
+ let result = check_pattern4_dead_avx512(&pos, check_live4);
+
+ let mut count: i32 = 0;
+
+ for i in 0..5 {
+ for j in 0..4 {
+ for k in 0..5 {
+ count += _popcnt32(result[i][j][k] as i32);
+ }
+ }
+ }
+
+ let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4);
+ let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4);
+ let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4);
+ let c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4);
+
+
+ if (c4f+c4r+c4dl+c4dr) != count { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead_count !!!!!! opp org = {}, new = {}", c4f+c4r+c4dl+c4dr, count); pos_disp(&pos); }
+ }
+
+ #[target_feature(enable = "avx512f")]
+ unsafe {
+ let result = check_pattern3_live_avx512(&pos, check_live4);
+
+ let mut count: i32 = 0;
+
+ for i in 0..3 {
+ for j in 0..4 {
+ for k in 0..5 {
+ count += _popcnt32(result[i][j][k] as i32);
+ }
+ }
+ }
+
+ let c3f: i32 = check_patternfile3_live_n(&pos, check_live4);
+ let c3r: i32 = check_patternrank3_live_n(&pos, check_live4);
+ let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4);
+ let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4);
+
+ let mut count1: i32 = 0;
+
+ count1 = c3f+c3r+c3dl+c3dr;
+
+ if count != count1 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! live3_dead !!!!!! self org = {}, new = {}", count1, count); pos_disp(&pos); }
+ }
+*/
+
+ if check_patternfile4_once(&pos, check_live4) ||
+ check_patternrank4_once(&pos, check_live4) ||
+ check_patterndial4_once(&pos, check_live4) ||
+ check_patterndiar4_once(&pos, check_live4) { return -4096 }
+
+ //if unsafe { check_pattern4_once_avx512(&pos, check_live4) } { return -4096 }
+
+ if check_patternfile4_once(&pos, check_live4_opp) ||
+ check_patternrank4_once(&pos, check_live4_opp) ||
+ check_patterndial4_once(&pos, check_live4_opp) ||
+ check_patterndiar4_once(&pos, check_live4_opp) { return 2560 }
+
+ //if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } { return 2560 }
+
+ if check_patternfile4_dead(&pos, check_live4_opp) ||
+ check_patternrank4_dead(&pos, check_live4_opp) ||
+ check_patterndial4_dead(&pos, check_live4_opp) ||
+ check_patterndiar4_dead(&pos, check_live4_opp) { return 2560 }
+
+ /*#[target_feature(enable = "avx512f")]
+ unsafe {
+ let result = check_pattern4_dead_avx512(&pos, check_live4_opp);
+
+ let mut temp_check: [__mmask16; 5] = [0; 5];
+
+ for i in 0..5 {
+ let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]);
+ let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]);
+ let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]);
+ let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]);
+ let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]);
+ let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]);
+ let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]);
+ let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]);
+ let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]);
+ let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]);
+
+ let check_mask10 = _kor_mask16(check_mask0, check_mask1);
+ let check_mask11 = _kor_mask16(check_mask2, check_mask3);
+ let check_mask12 = _kor_mask16(check_mask4, check_mask5);
+ let check_mask13 = _kor_mask16(check_mask6, check_mask7);
+ let check_mask14 = _kor_mask16(check_mask8, check_mask9);
+
+ let check_mask16 = _kor_mask16(check_mask10, check_mask11);
+ let check_mask17 = _kor_mask16(check_mask12, check_mask13);
+ let check_mask18 = _kor_mask16(check_mask16, check_mask17);
+ temp_check[i] = _kor_mask16(check_mask18, check_mask14);
+ }
+
+ let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]);
+ let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]);
+ let check_mask2 = _kor_mask16(check_mask0, check_mask1);
+ let check_mask3 = _kor_mask16(check_mask2, temp_check[4]);
+
+ if check_mask3 > 0 { return 2560 }
+ }
+*/
+ // 4,3
+ let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4);
+ let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4);
+ let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4);
+ let c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4);
+
+ let c3f: i32 = check_patternfile3_live_n(&pos, check_live4);
+ let c3r: i32 = check_patternrank3_live_n(&pos, check_live4);
+ let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4);
+ let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4);
+
+ let n_c4: i32 = c4f + c4r + c4dl + c4dr;
+
+ if n_c4 > 1 { return -2048 }
+
+ if n_c4 == 1 && ( c3f+c3r+c3dl+c3dr > 0 ) { return -3048 }
+
+/*
+ #[target_feature(enable = "avx512f")]
+ unsafe {
+ let result = check_pattern4_dead_avx512(&pos, check_live4);
+
+ let mut count4: i32 = 0;
+
+ for i in 0..5 {
+ for j in 0..4 {
+ for k in 0..5 {
+ count4 += _popcnt32(result[i][j][k] as i32);
+ }
+ }
+ }
+
+ if count4 > 1 { return -2048 }
+ else if count4 == 1 {
+
+ let result = check_pattern3_live_avx512(&pos, check_live4);
+
+ let mut count3: i32 = 0;
+
+ for i in 0..3 {
+ for j in 0..4 {
+ for k in 0..5 {
+ count3 += _popcnt32(result[i][j][k] as i32);
+ }
+ }
+ }
+
+ if count3 > 0 { return -3048 }
+ }
+ }
+ */
+ //---------------------------------------------------------------------------
+
+ let c3f_opp = check_patternfile3_live_n(&pos, check_live4_opp);
+ let c3r_opp = check_patternrank3_live_n(&pos, check_live4_opp);
+ let c3dl_opp = check_patterndial3_live_n(&pos, check_live4_opp);
+ let c3dr_opp = check_patterndiar3_live_n(&pos, check_live4_opp);
+ if c3f_opp + c3r_opp + c3dl_opp + c3dr_opp > 1 { return 2560 }
+
+ if c3f + c3r + c3dl + c3dr > 1 { return -2048 }
+ /*
+ #[target_feature(enable = "avx512f")]
+ unsafe {
+ let result = check_pattern3_live_avx512(&pos, check_live4_opp);
+
+ let mut count: i32 = 0;
+
+ for i in 0..3 {
+ for j in 0..4 {
+ for k in 0..5 {
+ count += _popcnt32(result[i][j][k] as i32);
+ }
+ }
+ }
+
+ let c3f: i32 = check_patternfile3_live_n(&pos, check_live4_opp);
+ let c3r: i32 = check_patternrank3_live_n(&pos, check_live4_opp);
+ let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4_opp);
+ let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4_opp);
+
+ let mut count1: i32 = 0;
+
+ count1 = c3f+c3r+c3dl+c3dr;
+
+ if count1 > 1 { return -2048 }
+ }
+*/
+ 0
+}
+
+
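+// Scalar pattern scans, one function per direction (file, rank and the two
+// diagonals): *_once finds an open four (empty, four stones, empty), *_dead and
+// *_dead_n detect or count five-square windows holding four stones and one
+// empty, and *_live_n counts open threes.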
+fn check_patternfile4_once(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..(FILE_SIZE - 5) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNFILE4[0];
+ let idx1 = sq + PATTERNFILE4[1];
+ let idx2 = sq + PATTERNFILE4[2];
+ let idx3 = sq + PATTERNFILE4[3];
+ let idx4 = sq + PATTERNFILE4[4];
+ let idx5 = sq + PATTERNFILE4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patternrank4_once(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..(RANK_SIZE - 5) {
+ for fl in 0..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNRANK4[0];
+ let idx1 = sq + PATTERNRANK4[1];
+ let idx2 = sq + PATTERNRANK4[2];
+ let idx3 = sq + PATTERNRANK4[3];
+ let idx4 = sq + PATTERNRANK4[4];
+ let idx5 = sq + PATTERNRANK4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patterndial4_once(pos: &Pos, sd : Side) -> bool {
+
+ for rk in 0..(RANK_SIZE - 5) {
+ for fl in 0..(FILE_SIZE - 5) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAL4[0];
+ let idx1 = sq + PATTERNDIAL4[1];
+ let idx2 = sq + PATTERNDIAL4[2];
+ let idx3 = sq + PATTERNDIAL4[3];
+ let idx4 = sq + PATTERNDIAL4[4];
+ let idx5 = sq + PATTERNDIAL4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patterndiar4_once(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..(RANK_SIZE - 5) {
+ for fl in 5..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAR4[0];
+ let idx1 = sq + PATTERNDIAR4[1];
+ let idx2 = sq + PATTERNDIAR4[2];
+ let idx3 = sq + PATTERNDIAR4[3];
+ let idx4 = sq + PATTERNDIAR4[4];
+ let idx5 = sq + PATTERNDIAR4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patternfile4_dead(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..(FILE_SIZE - 4) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNFILE4[0];
+ let idx1 = sq + PATTERNFILE4[1];
+ let idx2 = sq + PATTERNFILE4[2];
+ let idx3 = sq + PATTERNFILE4[3];
+ let idx4 = sq + PATTERNFILE4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
+ if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true }
+ if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true }
+ if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true }
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patternrank4_dead(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 0..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNRANK4[0];
+ let idx1 = sq + PATTERNRANK4[1];
+ let idx2 = sq + PATTERNRANK4[2];
+ let idx3 = sq + PATTERNRANK4[3];
+ let idx4 = sq + PATTERNRANK4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
+ if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true }
+ if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true }
+ if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true }
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patterndial4_dead(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 0..(FILE_SIZE - 4) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAL4[0];
+ let idx1 = sq + PATTERNDIAL4[1];
+ let idx2 = sq + PATTERNDIAL4[2];
+ let idx3 = sq + PATTERNDIAL4[3];
+ let idx4 = sq + PATTERNDIAL4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
+ if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true }
+ if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true }
+ if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true }
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patterndiar4_dead(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 4..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAR4[0];
+ let idx1 = sq + PATTERNDIAR4[1];
+ let idx2 = sq + PATTERNDIAR4[2];
+ let idx3 = sq + PATTERNDIAR4[3];
+ let idx4 = sq + PATTERNDIAR4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
+ if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true }
+ if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true }
+ if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true }
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
+ }
+ }
+
+ false
+}
+
+
+fn check_patternfile4_dead_n(pos: &Pos, sd: Side) -> i32 {
+
+ let mut n: i32 = 0;
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..(FILE_SIZE - 4) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNFILE4[0];
+ let idx1 = sq + PATTERNFILE4[1];
+ let idx2 = sq + PATTERNFILE4[2];
+ let idx3 = sq + PATTERNFILE4[3];
+ let idx4 = sq + PATTERNFILE4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
+ if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; }
+ if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; }
+ if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ }
+ }
+
+ n
+}
+
+fn check_patternrank4_dead_n(pos: &Pos, sd: Side) -> i32 {
+
+ let mut n: i32 = 0;
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 0..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNRANK4[0];
+ let idx1 = sq + PATTERNRANK4[1];
+ let idx2 = sq + PATTERNRANK4[2];
+ let idx3 = sq + PATTERNRANK4[3];
+ let idx4 = sq + PATTERNRANK4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
+ if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; }
+ if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; }
+ if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ }
+ }
+
+ n
+}
+
+fn check_patterndial4_dead_n(pos: &Pos, sd: Side) -> i32 {
+
+ let mut n: i32 = 0;
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 0..(FILE_SIZE - 4) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAL4[0];
+ let idx1 = sq + PATTERNDIAL4[1];
+ let idx2 = sq + PATTERNDIAL4[2];
+ let idx3 = sq + PATTERNDIAL4[3];
+ let idx4 = sq + PATTERNDIAL4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
+ if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; }
+ if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; }
+ if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ }
+ }
+
+ n
+}
+
+fn check_patterndiar4_dead_n(pos: &Pos, sd: Side) -> i32 {
+
+ let mut n: i32 = 0;
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 4..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAR4[0];
+ let idx1 = sq + PATTERNDIAR4[1];
+ let idx2 = sq + PATTERNDIAR4[2];
+ let idx3 = sq + PATTERNDIAR4[3];
+ let idx4 = sq + PATTERNDIAR4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
+ if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; }
+ if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; }
+ if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; }
+ }
+ }
+
+ n
+}
+
+
+/*fn check_patternfile3_live(pos: &Pos, sd: Side) -> bool {
+
+ let last_move: Move = pos.p_last;
+
+ let mut n: i32 = 0;
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..(FILE_SIZE - 4) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNFILE4[0];
+ let idx1 = sq + PATTERNFILE4[1];
+ let idx2 = sq + PATTERNFILE4[2];
+ let idx3 = sq + PATTERNFILE4[3];
+ let idx4 = sq + PATTERNFILE4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
+ }
+ }
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..(FILE_SIZE - 5) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNFILE4[0];
+ let idx1 = sq + PATTERNFILE4[1];
+ let idx2 = sq + PATTERNFILE4[2];
+ let idx3 = sq + PATTERNFILE4[3];
+ let idx4 = sq + PATTERNFILE4[4];
+ let idx5 = sq + PATTERNFILE4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true }
+ if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patternrank3_live(pos: &Pos, sd: Side) -> bool {
+
+ let last_move: Move = pos.p_last;
+
+ // let mut n: i32 = 0;
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 0..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNRANK4[0];
+ let idx1 = sq + PATTERNRANK4[1];
+ let idx2 = sq + PATTERNRANK4[2];
+ let idx3 = sq + PATTERNRANK4[3];
+ let idx4 = sq + PATTERNRANK4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
+ }
+ }
+
+ for rk in 0..(RANK_SIZE - 5) {
+ for fl in 0..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNRANK4[0];
+ let idx1 = sq + PATTERNRANK4[1];
+ let idx2 = sq + PATTERNRANK4[2];
+ let idx3 = sq + PATTERNRANK4[3];
+ let idx4 = sq + PATTERNRANK4[4];
+ let idx5 = sq + PATTERNRANK4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true }
+ if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patterndial3_live(pos: &Pos, sd: Side) -> bool {
+
+ let last_move: Move = pos.p_last;
+
+ //let mut n: i32 = 0;
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 0..(FILE_SIZE - 4) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAL4[0];
+ let idx1 = sq + PATTERNDIAL4[1];
+ let idx2 = sq + PATTERNDIAL4[2];
+ let idx3 = sq + PATTERNDIAL4[3];
+ let idx4 = sq + PATTERNDIAL4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
+ }
+ }
+
+ for rk in 0..(RANK_SIZE - 5) {
+ for fl in 0..(FILE_SIZE - 5) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAL4[0];
+ let idx1 = sq + PATTERNDIAL4[1];
+ let idx2 = sq + PATTERNDIAL4[2];
+ let idx3 = sq + PATTERNDIAL4[3];
+ let idx4 = sq + PATTERNDIAL4[4];
+ let idx5 = sq + PATTERNDIAL4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true }
+ if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patterndiar3_live(pos: &Pos, sd: Side) -> bool {
+
+ let last_move: Move = pos.p_last;
+
+ //let mut n: i32 = 0;
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 4..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAR4[0];
+ let idx1 = sq + PATTERNDIAR4[1];
+ let idx2 = sq + PATTERNDIAR4[2];
+ let idx3 = sq + PATTERNDIAR4[3];
+ let idx4 = sq + PATTERNDIAR4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
+ }
+ }
+
+ for rk in 0..(RANK_SIZE - 5) {
+ for fl in 5..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAR4[0];
+ let idx1 = sq + PATTERNDIAR4[1];
+ let idx2 = sq + PATTERNDIAR4[2];
+ let idx3 = sq + PATTERNDIAR4[3];
+ let idx4 = sq + PATTERNDIAR4[4];
+ let idx5 = sq + PATTERNDIAR4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true }
+ if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
+ }
+ }
+
+ false
+}
+*/
+
+fn check_patternfile3_live_n(pos: &Pos, sd: Side) -> i32 {
+
+ let mut n: i32 = 0;
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..(FILE_SIZE - 4) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNFILE4[0];
+ let idx1 = sq + PATTERNFILE4[1];
+ let idx2 = sq + PATTERNFILE4[2];
+ let idx3 = sq + PATTERNFILE4[3];
+ let idx4 = sq + PATTERNFILE4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n +=1 ; }
+ }
+ }
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..(FILE_SIZE - 5) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNFILE4[0];
+ let idx1 = sq + PATTERNFILE4[1];
+ let idx2 = sq + PATTERNFILE4[2];
+ let idx3 = sq + PATTERNFILE4[3];
+ let idx4 = sq + PATTERNFILE4[4];
+ let idx5 = sq + PATTERNFILE4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; }
+ if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; }
+ }
+ }
+
+ n
+}
+
+fn check_patternrank3_live_n(pos: &Pos, sd: Side) -> i32 {
+
+ let mut n: i32 = 0;
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 0..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNRANK4[0];
+ let idx1 = sq + PATTERNRANK4[1];
+ let idx2 = sq + PATTERNRANK4[2];
+ let idx3 = sq + PATTERNRANK4[3];
+ let idx4 = sq + PATTERNRANK4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
+ }
+ }
+
+ for rk in 0..(RANK_SIZE - 5) {
+ for fl in 0..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNRANK4[0];
+ let idx1 = sq + PATTERNRANK4[1];
+ let idx2 = sq + PATTERNRANK4[2];
+ let idx3 = sq + PATTERNRANK4[3];
+ let idx4 = sq + PATTERNRANK4[4];
+ let idx5 = sq + PATTERNRANK4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; }
+ if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; }
+ }
+ }
+
+ n
+}
+
+fn check_patterndial3_live_n(pos: &Pos, sd: Side) -> i32 {
+
+ let mut n: i32 = 0;
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 0..(FILE_SIZE - 4) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAL4[0];
+ let idx1 = sq + PATTERNDIAL4[1];
+ let idx2 = sq + PATTERNDIAL4[2];
+ let idx3 = sq + PATTERNDIAL4[3];
+ let idx4 = sq + PATTERNDIAL4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
+ }
+ }
+
+ for rk in 0..(RANK_SIZE - 5) {
+ for fl in 0..(FILE_SIZE - 5) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAL4[0];
+ let idx1 = sq + PATTERNDIAL4[1];
+ let idx2 = sq + PATTERNDIAL4[2];
+ let idx3 = sq + PATTERNDIAL4[3];
+ let idx4 = sq + PATTERNDIAL4[4];
+ let idx5 = sq + PATTERNDIAL4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; }
+ if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; }
+ }
+ }
+
+ n
+}
+
+fn check_patterndiar3_live_n(pos: &Pos, sd: Side) -> i32 {
+
+ let mut n: i32 = 0;
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 4..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAR4[0];
+ let idx1 = sq + PATTERNDIAR4[1];
+ let idx2 = sq + PATTERNDIAR4[2];
+ let idx3 = sq + PATTERNDIAR4[3];
+ let idx4 = sq + PATTERNDIAR4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
+ }
+ }
+
+ for rk in 0..(RANK_SIZE - 5) {
+ for fl in 5..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAR4[0];
+ let idx1 = sq + PATTERNDIAR4[1];
+ let idx2 = sq + PATTERNDIAR4[2];
+ let idx3 = sq + PATTERNDIAR4[3];
+ let idx4 = sq + PATTERNDIAR4[4];
+ let idx5 = sq + PATTERNDIAR4[5];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+ let val5 = pos.state[idx5 as usize];
+
+ if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; }
+ if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; }
+ }
+ }
+
+ n
+}
+
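+// AVX-512 version of the win check: every 32-bit lane of the vector holds one
+// row of the direction-specific bitboard for the side that just moved. A row
+// contains five in a row when ANDing it (or one of four left-rotated copies)
+// with a mask of five consecutive high bits still equals that mask.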
+#[target_feature(enable = "avx512f")]
+unsafe fn pos_is_winner_avx512(pos : &Pos) -> bool {
+
+ let current_side = side_opp(pos.p_turn);
+
+ let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) );
+
+ let answer_mask: __mmask16 = 0b01111111_11111111;
+
+ let coloridx = current_side as usize;
+
+ let mut temp_mask: [[__mmask16; 5]; 4] = [[0; 5]; 4];
+
+ for dir in 0..4 {
+ let board0 = _mm512_set_epi32(0, pos.bitboard[coloridx][dir][14], pos.bitboard[coloridx][dir][13], pos.bitboard[coloridx][dir][12], pos.bitboard[coloridx][dir][11], pos.bitboard[coloridx][dir][10], pos.bitboard[coloridx][dir][9], pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]);
+
+ let boardf = _mm512_and_epi32(answer, board0);
+
+        temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0); // answer_mask is not strictly needed here, because of the AND above
+
+ for i in 1..5 {
+
+ let board1 = _mm512_rol_epi32(board0, i);
+
+ let boardf = _mm512_and_epi32(answer, board1);
+
+            temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0); // answer_mask is not strictly needed here, because of the AND above
+ }
+ }
+
+ let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]);
+ let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]);
+ let check_mask2: __mmask16 = _kor_mask16(temp_mask[0][4], temp_mask[1][0]);
+ let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][1], temp_mask[1][2]);
+ let check_mask4: __mmask16 = _kor_mask16(temp_mask[1][3], temp_mask[1][4]);
+ let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]);
+ let check_mask6: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]);
+ let check_mask7: __mmask16 = _kor_mask16(temp_mask[2][4], temp_mask[3][0]);
+ let check_mask8: __mmask16 = _kor_mask16(temp_mask[3][1], temp_mask[3][2]);
+ let check_mask9: __mmask16 = _kor_mask16(temp_mask[3][3], temp_mask[3][4]);
+
+ let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1);
+ let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3);
+ let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5);
+ let check_mask13: __mmask16 = _kor_mask16(check_mask6, check_mask7);
+ let check_mask14: __mmask16 = _kor_mask16(check_mask8, check_mask9);
+
+ let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11);
+ let check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13);
+ let check_mask18: __mmask16 = _kor_mask16(check_mask16, check_mask17);
+ let check_mask19: __mmask16 = _kor_mask16(check_mask18, check_mask14);
+
+    check_mask19 > 0
+}
+
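+// AVX-512 scan for the open-four pattern: four stones of `sd` flanked by an
+// empty square on both sides. The stone bits are masked with answer_color, the
+// empty bits with answer_empty, and the OR of the two must match the full
+// six-bit window; three left-rotated copies cover the neighbouring window
+// positions.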
+#[target_feature(enable = "avx512f")]
+unsafe fn check_pattern4_once_avx512(pos : &Pos, sd: Side) -> bool {
+
+ //let current_side = side_opp(sd);
+
+ let answer_color = _mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) );
+ let answer_empty = _mm512_set1_epi32( (1<<31)| (1<<26) );
+ let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26));
+
+ let answer_mask: __mmask16 = 0b00000001_11111111;
+
+ //let coloridx = current_side as usize;
+ let coloridx = sd as usize;
+ let emptyidx = Color::Empty as usize;
+
+ let mut temp_mask: [[__mmask16; 4]; 4] = [[0; 4]; 4];
+
+ for dir in 0..4 {
+
+ let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]);
+
+ let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]);
+
+ let boardf1 = _mm512_and_epi32(answer_color, board0);// check sd
+ let boardf2 = _mm512_and_epi32(answer_empty, board1);// check empty
+ let boardf = _mm512_or_epi32(boardf1, boardf2);
+
+        temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0); // answer_mask is not strictly needed here, because of the AND above
+
+        for i in 1..4 { // only 3 more rotations are needed
+
+ let board2 = _mm512_rol_epi32(board0, i);//rot sd
+ let board3 = _mm512_rol_epi32(board1, i);//rot empty
+
+ let boardf1 = _mm512_and_epi32(answer_color, board2);
+ let boardf2 = _mm512_and_epi32(answer_empty, board3);
+ let boardf = _mm512_or_epi32(boardf1, boardf2);
+
+            temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0); // answer_mask is not strictly needed here, because of the AND above
+ }
+ }
+
+ let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]);
+ let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]);
+ let check_mask2: __mmask16 = _kor_mask16(temp_mask[1][0], temp_mask[1][1]);
+ let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][2], temp_mask[1][3]);
+ let check_mask4: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]);
+ let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]);
+ let check_mask6: __mmask16 = _kor_mask16(temp_mask[3][0], temp_mask[3][1]);
+ let check_mask7: __mmask16 = _kor_mask16(temp_mask[3][2], temp_mask[3][3]);
+
+ let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1);
+ let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3);
+ let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5);
+ let check_mask13: __mmask16 = _kor_mask16(check_mask6, check_mask7);
+
+ let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11);
+ let check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13);
+ let check_mask19: __mmask16 = _kor_mask16(check_mask16, check_mask17);
+
+    check_mask19 > 0
+}
+
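+// AVX-512 scan for "dead four" windows: five consecutive squares holding four
+// stones of `sd` and exactly one empty square, with one answer_color/answer_empty
+// pair per placement of the empty square. The raw compare masks are returned so
+// the caller can count matches or OR them together.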
+#[target_feature(enable = "avx512f")]
+unsafe fn check_pattern4_dead_avx512(pos : &Pos, sd: Side) -> [[[__mmask16; 5]; 4]; 5] {
+
+ //let current_side = side_opp(sd);
+
+ let answer_color: [__m512i; 5] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) ),
+ _mm512_set1_epi32( (1<<31)| (1<<29)|(1<<28)|(1<<27) ),
+ _mm512_set1_epi32( (1<<31)|(1<<30) |(1<<28)|(1<<27) ),
+ _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29) |(1<<27) ),
+ _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28) )];
+
+ let answer_empty: [__m512i; 5]= [_mm512_set1_epi32( (1<<31) ),
+ _mm512_set1_epi32( (1<<30) ),
+ _mm512_set1_epi32( (1<<29) ),
+ _mm512_set1_epi32( (1<<28) ),
+ _mm512_set1_epi32( (1<<27) )];
+
+ let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27));
+
+ let answer_mask: __mmask16 = 0b00000001_11111111;
+
+ //let coloridx = current_side as usize;
+ let coloridx = sd as usize;
+ let emptyidx = Color::Empty as usize;
+
+ let mut temp_mask: [[[__mmask16; 5]; 4]; 5] = [[[0; 5]; 4]; 5];
+
+ for pattern in 0..5 {
+
+ for dir in 0..4 {
+
+ let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]);
+
+ let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]);
+
+ let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd
+ let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty
+ let boardf = _mm512_or_epi32(boardf1, boardf2);
+
+            temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0); // answer_mask is not strictly needed here, because of the AND above
+
+            for i in 1..5 { // only 4 more rotations are needed
+
+ let board2 = _mm512_rol_epi32(board0, i);//rot sd
+ let board3 = _mm512_rol_epi32(board1, i);//rot empty
+
+ let boardf1 = _mm512_and_epi32(answer_color[pattern], board2);
+ let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3);
+ let boardf = _mm512_or_epi32(boardf1, boardf2);
+
+                temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0); // answer_mask is not strictly needed here, because of the AND above
+ }
+ }
+ }
+
+ temp_mask
+}
+
+
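+// AVX-512 scan for open-three patterns: three stones of `sd` with surrounding
+// empties, including the two broken shapes. As above, one answer_color/
+// answer_empty pair per shape, and the raw compare masks are returned.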
+#[target_feature(enable = "avx512f")]
+unsafe fn check_pattern3_live_avx512(pos : &Pos, sd: Side) -> [[[__mmask16; 5]; 4]; 3] {
+
+ //let current_side = side_opp(sd);
+
+ let answer_color: [__m512i; 3] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28) ),
+ _mm512_set1_epi32( (1<<30)| (1<<28)|(1<<27) ),
+ _mm512_set1_epi32( (1<<30)|(1<<29) |(1<<27) )];
+
+ let answer_empty: [__m512i; 3]= [_mm512_set1_epi32( (1<<31)| (1<<27) ),
+ _mm512_set1_epi32( (1<<31)| (1<<29)| (1<<26) ),
+ _mm512_set1_epi32( (1<<31)| (1<<28)| (1<<26) )];
+
+ //let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27));
+ let answer: [__m512i; 3] = [_mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) ),
+ _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) ),
+ _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) )];
+
+ let answer_mask: __mmask16 = 0b00000001_11111111;
+
+ //let coloridx = current_side as usize;
+ let coloridx = sd as usize;
+ let emptyidx = Color::Empty as usize;
+
+ let mut temp_mask: [[[__mmask16; 5]; 4]; 3] = [[[0; 5]; 4]; 3];
+
+ for pattern in 0..3 {
+
+ for dir in 0..4 {
+
+ let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]);
+
+ let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]);
+
+ let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd
+ let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty
+ let boardf = _mm512_or_epi32(boardf1, boardf2);
+
+            temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0); // answer_mask is not strictly needed here, because of the AND above
+
+            for i in 1..5 { // only 4 more rotations are needed
+
+ let board2 = _mm512_rol_epi32(board0, i);//rot sd
+ let board3 = _mm512_rol_epi32(board1, i);//rot empty
+
+ let boardf1 = _mm512_and_epi32(answer_color[pattern], board2);
+ let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3);
+ let boardf = _mm512_or_epi32(boardf1, boardf2);
+
+                temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0); // answer_mask is not strictly needed here, because of the AND above
+ }
+ }
+ }
+
+ temp_mask
+}
+
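+// Scalar five-in-a-row checks, one per direction; pos_is_winner_scan() ORs them
+// together to detect a finished game.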
+fn check_patternfile5(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..RANK_SIZE {
+ for fl in 0..(FILE_SIZE - 4) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNFILE4[0];
+ let idx1 = sq + PATTERNFILE4[1];
+ let idx2 = sq + PATTERNFILE4[2];
+ let idx3 = sq + PATTERNFILE4[3];
+ let idx4 = sq + PATTERNFILE4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patternrank5(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 0..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNRANK4[0];
+ let idx1 = sq + PATTERNRANK4[1];
+ let idx2 = sq + PATTERNRANK4[2];
+ let idx3 = sq + PATTERNRANK4[3];
+ let idx4 = sq + PATTERNRANK4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patterndial5(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 0..(FILE_SIZE - 4) {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAL4[0];
+ let idx1 = sq + PATTERNDIAL4[1];
+ let idx2 = sq + PATTERNDIAL4[2];
+ let idx3 = sq + PATTERNDIAL4[3];
+ let idx4 = sq + PATTERNDIAL4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
+ }
+ }
+
+ false
+}
+
+fn check_patterndiar5(pos: &Pos, sd: Side) -> bool {
+
+ for rk in 0..(RANK_SIZE - 4) {
+ for fl in 4..FILE_SIZE {
+ let sq : Square = square_make(fl, rk);
+
+ let idx0 = sq + PATTERNDIAR4[0];
+ let idx1 = sq + PATTERNDIAR4[1];
+ let idx2 = sq + PATTERNDIAR4[2];
+ let idx3 = sq + PATTERNDIAR4[3];
+ let idx4 = sq + PATTERNDIAR4[4];
+
+ let val0 = pos.state[idx0 as usize];
+ let val1 = pos.state[idx1 as usize];
+ let val2 = pos.state[idx2 as usize];
+ let val3 = pos.state[idx3 as usize];
+ let val4 = pos.state[idx4 as usize];
+
+ if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
+ }
+ }
+
+ false
+}
+
+fn main() {
+
+ loop
+ {
+
+ let start = Instant::now();
+
+ println!("Hello, this is connect 6!");
+
+ //unsafe { test_avx512(); }
+
+ let test_state: [Color; SQUARE_SIZE as usize] = [Color::Empty; SQUARE_SIZE as usize];
+
+ let test_bitboard: [[[i32; FILE_SIZE as usize]; 4]; 3] = [[[0; FILE_SIZE as usize]; 4]; 3];
+
+ let mut test1 = Pos {
+ state: test_state,
+ p_turn: Color::Black,
+ p_last: square_make(5,5),
+
+ bitboard: test_bitboard,
+ };
+
+ test1.init();
+
+ //pos_disp(&test1);
+
+ for i in 0..(FILE_SIZE*RANK_SIZE) {
+
+ // println!("----------------------------------------\n\n\n\n");
+ // println!("MOVE {}!!!!\n\n\n\n", i);
+
+
+ let mut d = 2;
+ let mut e = 4;
+
+ //if i < 6 { d = 1; e = 2; }
+
+ let next_move: Move = search(&test1, d, e);
+ //println!("next move is {}", next_move);
+ //println!("file is {}", square_file(next_move));
+ //println!("rank is {}", square_rank(next_move));
+
+ test1.do_move(next_move);
+
+ //pos_disp(&test1);
+
+ if pos_is_end(&test1) {
+
+ println!("Game over!!!!!!");
+ println!("MOVE {}!!!!\n", i);
+ //pos_disp(&test1);
+
+                break;
+            }
+ }
+
+
+ let duration = start.elapsed();
+
+ println!("Time elapsed in expensive_function() is: {:?}", duration);
+ }
+
+
+}
From b4023d1bdb6fd4e0a3c1c5cf282724567953c14e Mon Sep 17 00:00:00 2001
From: jironglin
Date: Thu, 15 Oct 2020 13:32:07 +0000
Subject: [PATCH 15/25] mask_add: ss,sd
---
examples/connect5_14_avx512.rs | 2100 --------------------------------
1 file changed, 2100 deletions(-)
delete mode 100644 examples/connect5_14_avx512.rs
diff --git a/examples/connect5_14_avx512.rs b/examples/connect5_14_avx512.rs
deleted file mode 100644
index 696adfc70c..0000000000
--- a/examples/connect5_14_avx512.rs
+++ /dev/null
@@ -1,2100 +0,0 @@
-#![feature(stdsimd, avx512_target_feature)]
-
-#[cfg(target_arch = "x86")]
-use {core_arch::arch::x86::*};
-#[cfg(target_arch = "x86_64")]
-use {core_arch::arch::x86_64::*};
-
-
-use rand::seq::SliceRandom;
-use rand::thread_rng;
-use rand::Rng;
-
-use std::cmp;
-
-use std::time::{Duration, Instant};
-
-// types
-
-#[derive(Clone, Copy, PartialEq, Eq)]
-pub enum Color {
- Black,
- White,
- Empty,
-}
-
-type Square = i32;
-type Move = i32;
-type Side = Color;
-type Piece = Color;
-
-// constants
-
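-// The 15x15 board is embedded in a wider array with a 4-square border
-// (see square_make), so the direction offsets below never index out of bounds.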
-const FILE_SIZE: i32 = 15;
-const RANK_SIZE: i32 = 15;
-const SQUARE_SIZE: i32 = (FILE_SIZE + 4) * (FILE_SIZE + 4 * 2 ) + 4;
-
-const EVAL_INF: i32 = (FILE_SIZE * RANK_SIZE * 100);
-const MOVE_NONE: Move = -1;
-const SCORE_NONE: i32 = -EVAL_INF - 1;
-
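-// ENDCHECK: for each of the 20 ways a five-in-a-row can pass through the last
-// move (5 windows per direction: file, rank, and the two diagonals), the
-// offsets of the other four squares of that window; used by pos_is_winner.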
-const ENDCHECK: [[i32; 4]; 20] = [ [-4, -3, -2, -1],
- [-3, -2, -1, 1],
- [-2, -1, 1, 2],
- [-1, 1, 2, 3],
- [ 1, 2, 3, 4],
-
- [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 4 * (-FILE_SIZE - 4)],
- [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4)],
- [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4)],
- [1 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4)],
- [1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4), 4 * ( FILE_SIZE + 4)],
-
- [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 4 * (-FILE_SIZE - 5)],
- [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5)],
- [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5)],
- [1 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5)],
- [1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5), 4 * ( FILE_SIZE + 5)],
-
- [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 4 * (-FILE_SIZE - 3)],
- [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3)],
- [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3)],
- [1 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3)],
- [1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3), 4 * ( FILE_SIZE + 3)]
- ];
-
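-// Step offsets used by the pattern scanners: PATTERNFILE4 steps by 1,
-// PATTERNRANK4 by FILE_SIZE + 4 (one row), and PATTERNDIAL4 / PATTERNDIAR4 by
-// FILE_SIZE + 5 and FILE_SIZE + 3 (the two diagonals).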
-const PATTERNFILE4: [i32; 7] = [0, 1, 2, 3, 4, 5, 6];
-const PATTERNRANK4: [i32; 7] = [0, 1 * (FILE_SIZE + 4), 2 * (FILE_SIZE + 4), 3 * (FILE_SIZE + 4), 4 * (FILE_SIZE + 4), 5 * (FILE_SIZE + 4), 6 * (FILE_SIZE + 4)];
-const PATTERNDIAL4: [i32; 7] = [0, 1 * (FILE_SIZE + 5), 2 * (FILE_SIZE + 5), 3 * (FILE_SIZE + 5), 4 * (FILE_SIZE + 5), 5 * (FILE_SIZE + 5), 6 * (FILE_SIZE + 5)];
-const PATTERNDIAR4: [i32; 7] = [0, 1 * (FILE_SIZE + 3), 2 * (FILE_SIZE + 3), 3 * (FILE_SIZE + 3), 4 * (FILE_SIZE + 3), 5 * (FILE_SIZE + 3), 6 * (FILE_SIZE + 3)];
-
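-// MAPMOVEVALUE[d][sq] is the single bit a stone on square sq contributes to the
-// bitboard for scan direction d, and MAPMOVEIDX[d][sq] selects the row of that
-// bitboard; do_move uses both to keep the bitboards updated incrementally.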
-const MAPMOVEVALUE: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17,
-  0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17],
-
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31,
- 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30,
- 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29,
- 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28,
- 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27,
- 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26,
- 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25,
- 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24,
- 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23,
- 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22,
- 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21,
- 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20,
- 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19,
- 0, 0, 0, 0, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18,
- 0, 0, 0, 0, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17],
-
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 0, 0, 0, 0,
- 0, 0, 0, 0, 1<<31, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 0, 0, 0,
- 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 0, 0,
- 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 0,
- 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27,
- 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26,
- 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25,
- 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24,
- 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23,
- 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22,
- 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21,
- 0, 0, 0, 0, 0, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<20, 1<<20, 1<<20,
- 0, 0, 0, 0, 0, 0, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<19, 1<<19,
- 0, 0, 0, 0, 0, 0, 0, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<18,
- 0, 0, 0, 0, 0, 0, 0, 0, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17],
-
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31,
- 0, 0, 0, 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<31,
- 0, 0, 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<30, 1<<31,
- 0, 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<29, 1<<30, 1<<31,
- 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
- 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
- 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
- 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
- 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
- 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
- 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
- 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 0,
- 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 0, 0,
- 0, 0, 0, 0, 1<<18, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 0, 0, 0,
- 0, 0, 0, 0, 1<<17, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 0, 0, 0, 0]
- ];
-
-const MAPMOVEIDX: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
- 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
- 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
- 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
- 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
- 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
- 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
- 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
- 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14],
-
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
-
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0,
- 0, 0, 0, 0, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0,
- 0, 0, 0, 0, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0,
- 0, 0, 0, 0, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
-  0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
- 0, 0, 0, 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
- 0, 0, 0, 0, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3,
- 0, 0, 0, 0, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4,
- 0, 0, 0, 0, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
- 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,
- 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
- 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8,
-  0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9,
- 0, 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10],
-
- [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
- 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
- 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
- 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
- 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 0, 0, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 0, 0, 0, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
- 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
- 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
- 0, 0, 0, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
- 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0,
- 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0,
- 0, 0, 0, 0, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0,
- 0, 0, 0, 0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, 0]
- ];
-
-// variables
-
-static mut Endgame: bool = false;
-
-// structures
-
-pub struct Pos { // position
- state: [Color; SQUARE_SIZE as usize],
- p_turn: Side,
- p_last: Move,
-
- bitboard: [[[i32; 20]; 4]; 3],
-
-}
-
-impl Pos {
-
- pub fn init(&mut self) { // starting position
- for i in 0..SQUARE_SIZE as usize {
- self.state[i] = Color::Empty;
- }
-
- self.p_turn = Color::Black;
- self.p_last = square_make(0, 0);
-
- //--------------------------------------------
-
- for i in 0..4 {
- for j in 0..20 {
- self.bitboard[Color::Black as usize][i][j] = 0;
- }
- }
-
- for i in 0..4 {
- for j in 0..20 {
- self.bitboard[Color::White as usize][i][j] = 0;
- }
- }
-
- for i in 0..2 {
- for j in 0..20 {
- self.bitboard[Color::Empty as usize][i][j] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17);
- }
- }
-
- self.bitboard[Color::Empty as usize][2][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27);
- self.bitboard[Color::Empty as usize][2][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26);
- self.bitboard[Color::Empty as usize][2][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25);
- self.bitboard[Color::Empty as usize][2][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24);
- self.bitboard[Color::Empty as usize][2][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23);
- self.bitboard[Color::Empty as usize][2][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22);
- self.bitboard[Color::Empty as usize][2][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21);
- self.bitboard[Color::Empty as usize][2][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20);
- self.bitboard[Color::Empty as usize][2][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19);
- self.bitboard[Color::Empty as usize][2][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18);
- self.bitboard[Color::Empty as usize][2][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17);
- self.bitboard[Color::Empty as usize][2][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18);
- self.bitboard[Color::Empty as usize][2][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20);
- self.bitboard[Color::Empty as usize][2][13] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21);
- self.bitboard[Color::Empty as usize][2][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22);
- self.bitboard[Color::Empty as usize][2][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23);
- self.bitboard[Color::Empty as usize][2][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24);
- self.bitboard[Color::Empty as usize][2][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25);
- self.bitboard[Color::Empty as usize][2][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26);
- self.bitboard[Color::Empty as usize][2][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27);
-
- self.bitboard[Color::Empty as usize][3][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27);
- self.bitboard[Color::Empty as usize][3][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26);
- self.bitboard[Color::Empty as usize][3][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25);
- self.bitboard[Color::Empty as usize][3][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24);
- self.bitboard[Color::Empty as usize][3][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23);
- self.bitboard[Color::Empty as usize][3][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22);
- self.bitboard[Color::Empty as usize][3][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21);
- self.bitboard[Color::Empty as usize][3][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20);
- self.bitboard[Color::Empty as usize][3][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19);
- self.bitboard[Color::Empty as usize][3][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18);
- self.bitboard[Color::Empty as usize][3][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17);
- self.bitboard[Color::Empty as usize][3][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18);
- self.bitboard[Color::Empty as usize][3][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20);
- self.bitboard[Color::Empty as usize][3][13] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21);
- self.bitboard[Color::Empty as usize][3][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22);
- self.bitboard[Color::Empty as usize][3][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23);
- self.bitboard[Color::Empty as usize][3][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24);
- self.bitboard[Color::Empty as usize][3][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25);
- self.bitboard[Color::Empty as usize][3][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26);
- self.bitboard[Color::Empty as usize][3][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27);
- }
-
- pub fn do_move(&mut self, mv: Move) {
-
- let atk: Side = self.p_turn;
- let def: Side = side_opp(atk);
-
- match self.p_turn {
- Color::Black => { self.state[mv as usize] = Color::Black;
-
- for i in 0..4 {
- self.bitboard[Color::Black as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= MAPMOVEVALUE[i][mv as usize];
- self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize];
- }
- },
-
- Color::White => { self.state[mv as usize] = Color::White;
-
- for i in 0..4 {
- self.bitboard[Color::White as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= MAPMOVEVALUE[i][mv as usize];
- self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize];
- }
- },
-
- Color::Empty => {},
- }
-
- self.p_last = mv;
-
- self.p_turn = def;
- }
-
- fn turn(&self) -> Side {
- self.p_turn
- }
-
- pub fn can_play(&self, from: Square) -> bool {
-
- if self.state[from as usize] == Color::Empty { true } else { false }
- }
-
- pub fn count(&self, pc: Piece) -> i32 {
-
- let mut n: i32 = 0;
-
- for rk in 0..RANK_SIZE {
- for fl in 0..FILE_SIZE {
- let sq: Square = square_make(fl, rk);
- if self.state[sq as usize] == pc { n += 1; }
- }
- }
- n
- }
-}
-
-pub struct List { // legal move list
- p_move: [Move; (FILE_SIZE * RANK_SIZE) as usize],
- p_size: i32,
-}
-
-impl List {
-
- pub fn clear(&mut self) {
- self.p_size = 0;
- }
-
- pub fn add(&mut self, mv: Move) {
- self.p_move[self.p_size as usize] = mv;
- self.p_size += 1;
- }
-
- pub fn size(&self) -> i32 {
- self.p_size
- }
-
- pub fn shuffle(&mut self) {
-
- let mut rng = thread_rng();
-
- let num = self.p_size;
-
-        let mut new_move: Vec<Move> = vec![];
-
- for x in 0..(num as usize) {
- new_move.push(self.p_move[x]);
- }
-
- new_move.shuffle(&mut rng);
-
- for x in 0..(self.p_size as usize) {
- self.p_move[x] = new_move[x];
- }
- }
-
- //pub fn move(&self, i: i32) -> Move {
- // self.p_move[i as usize]
- //}
-}
-
-// functions
-//
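-// Coordinate helpers: square_make maps a 0-based (file, rank) pair to an index
-// in the padded board array; square_file and square_rank invert the mapping.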
-fn square_make(fl: i32, rk: i32) -> Square {
- (rk + 4) * (FILE_SIZE + 4) + (fl + 4)
-}
-
-fn square_file(sq: Square) -> i32 {
- sq % (FILE_SIZE + 4) - 4
-}
-
-fn square_rank(sq: Square) -> i32 {
- sq / (FILE_SIZE + 4) - 4
-}
-
-fn side_opp(sd: Side) -> Side {
-
- let mut out: Side;
-
- match sd {
- Side::White => out = Side::Black,
- Side::Black => out = Side::White,
- Side::Empty => panic!(""),
- }
-
- out
-}
-
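-// Returns true if the side that just moved completed five in a row through its
-// last move, by testing the 20 ENDCHECK windows around p_last.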
-fn pos_is_winner(pos : &Pos) -> bool {
-
- let current_side = side_opp(pos.p_turn);
-
- let mut found : bool = true;
-
- for x in 0..20 {
- for y in 0..4 {
-
- found = true;
-
- let adj = pos.p_last + ENDCHECK[x][y];
-
- if pos.state[adj as usize] != current_side { found = false; break }
- }
- if found == true { break; }
- }
-
- found
-}
-
-fn pos_is_winner_scan(pos : &Pos) -> bool {
-
- let current_side = side_opp(pos.p_turn);
-
- if check_patternfile5(&pos, current_side) ||
- check_patternrank5(&pos, current_side) ||
- check_patterndial5(&pos, current_side) ||
- check_patterndiar5(&pos, current_side) { return true }
-
- false
-}
-
-fn pos_is_draw(pos : &Pos) -> bool {
-
-
- let mut found : bool = true;
-
- for rk in 0..RANK_SIZE {
- for fl in 0..FILE_SIZE {
-
- let sq: Square = square_make(fl, rk);
- if pos.can_play(sq) {
- found = false;
- break;
- }
-
- if found == false { break;}
- }
- }
-/*
-
- let mut test: bool = false;
-
- if pos.bitboard[Color::Empty as usize][0][0] == 0 &&
- pos.bitboard[Color::Empty as usize][0][1] == 0 &&
- pos.bitboard[Color::Empty as usize][0][2] == 0 &&
- pos.bitboard[Color::Empty as usize][0][3] == 0 &&
- pos.bitboard[Color::Empty as usize][0][4] == 0 &&
- pos.bitboard[Color::Empty as usize][0][5] == 0 &&
- pos.bitboard[Color::Empty as usize][0][6] == 0 &&
- pos.bitboard[Color::Empty as usize][0][7] == 0 &&
- pos.bitboard[Color::Empty as usize][0][8] == 0 { test = true; } else { test = false; }
-*/
- //if test != found { println!("bitboard!!!!!!!!!!!!!!!!!!!! pos_is_draw wrong!!!!!!!!!!!!!!"); }
-
- let mut out: bool = false;
-
- //if test && unsafe {!pos_is_winner_avx512(pos)} { out = true; }
- if found == true && !pos_is_winner_scan(pos) { out = true; }
-
- out
-}
-
-fn pos_is_end(pos : &Pos) -> bool {
-
- if pos_is_winner_scan(pos) || pos_is_draw(pos) {
- true
- } else {
- false
- }
-}
-
-fn pos_disp(pos: &Pos) {
-
- for rk in 0..RANK_SIZE {
- for fl in 0..FILE_SIZE {
-
- let sq: Square = square_make(fl, rk);
-
- match pos.state[sq as usize] {
- Color::Black => print!("# "),
- Color::White => print!("O "),
- Color::Empty => print!("- "),
- }
- }
-
- println!("");
- }
-
- match pos.turn() {
- Color::Black => println!("black to play"),
- Color::White => println!("white to play"),
- _ => (),
- }
-}
-
-fn gen_moves(list : &mut List, pos: &Pos) {
-
- list.clear();
-
- for rk in 0..RANK_SIZE {
- for fl in 0..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
- if pos.can_play(sq) { list.add(sq); }
- }
- }
-
-}
-
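-// Top-level search: if only a few empty squares remain (or the requested depth
-// exceeds them), the depth is extended so the game is played out to the end.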
-fn search(pos : &Pos, depth: i32, endgame: i32) -> Move {
-
- //println!("call search");
-
- let mut new_depth = depth;
-
- let empties: i32 = pos.count(Color::Empty);
- if (empties <= endgame || new_depth > empties ) { new_depth = empties; }
-
- if(new_depth == empties) { unsafe { Endgame = true; } }
-
- search_real(pos, -EVAL_INF, EVAL_INF, new_depth, 0)
-
-}
-
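-// Negamax search: a win by the previous mover is scored as -EVAL_INF + ply so
-// that shallower wins are preferred; remaining moves are skipped once the best
-// score reaches beta.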
-fn search_real(pos: &Pos, alpha: i32, beta: i32, depth: i32, ply: i32) -> i32 {
-
-
- assert!(-EVAL_INF <= alpha && alpha < beta && beta <= EVAL_INF);
- //println!("call search_real");
- //println!("depth = {}", depth);
- //println!("ply = {}", ply);
- // leaf?
-
- //if unsafe { pos_is_winner_avx512(&pos) } != pos_is_winner(&pos) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!!"); }
- if pos_is_winner_scan(&pos) { return -EVAL_INF + ply }
- //if unsafe { pos_is_winner_avx512(&pos) } { return -EVAL_INF + ply }
-
-
- if pos_is_draw(&pos) { return 0 }
-
- if depth == 0 {
- return eval(&pos, ply)
- }
-
- let p_move_new : [Move; (FILE_SIZE * RANK_SIZE) as usize] = [0; (FILE_SIZE * RANK_SIZE) as usize];
-
- let mut list = List {
- p_move: p_move_new,
- p_size: 0,
- };
-
- let mut bm: Move = MOVE_NONE;
- let mut bs: i32 = SCORE_NONE;
-
- gen_moves(&mut list, &pos);
-
- // move loop
-
- if ply == 0 { list.shuffle(); }
-
- for i in 0..list.size() {
-
- if bs < beta {
-
- let mv: Move = list.p_move[i as usize];
-
- let mut new_pos = Pos {
- state: pos.state,
- p_turn: pos.p_turn,
- p_last: pos.p_last,
-
- bitboard: pos.bitboard,
- };
-
- //println!("p_last = {}", new_pos.p_last);
-
- new_pos.do_move(mv);
-
- //println!("After do _move p_last = {}", new_pos.p_last);
-
- let sc: i32 = -search_real(&new_pos, -beta, -cmp::max(alpha, bs), depth - 1, ply + 1);
-
-
- //if sc >= 410 || sc <= -410 {
- //println!("sc = {} depth = {}-------------------------------", sc, depth);
-
- //pos_disp(&new_pos);
- //}
-
-
- if sc > bs { bm = mv; bs = sc; }
-
- }
- }
-
- assert!(bm != MOVE_NONE);
- assert!(bs >= -EVAL_INF && bs <= EVAL_INF);
-
- if ply == 0 { bm } else { bs } //best move at the root node, best score elsewhere
- //bs
-}
-
-fn result(pos: &Pos) -> i32 {
-
- if(pos_is_winner_scan(pos)) {
- -(FILE_SIZE*RANK_SIZE*100)
- } else {
- 0
- }
-}
-
-
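-// Static evaluation at a leaf, from the point of view of the side to move:
-// forcing four patterns are scored first, then counts of dead fours and live
-// threes for both sides; 0 means no decisive threat was found.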
-fn eval(pos: &Pos, ply: i32) -> i32 {
-
- let atk: Side = pos.turn();
- let def: Side = side_opp(atk);
-
- //let mut sc: i32 = 0;
-
- let check_live4: Side = def;
- let check_live4_opp: Side = atk;
-
- //if ply % 2 == 1 { check_live4 = def; check_live4_opp = atk; } else { check_live4 = atk; check_live4_opp = def; }
- //if ply % 2 == 0 { check_live4 = def; check_live4_opp = atk; } else { check_live4 = atk; check_live4_opp = def; }
-/*
- if unsafe { check_pattern4_once_avx512(&pos, check_live4) } != (check_patternfile4_once(&pos, check_live4) || check_patternrank4_once(&pos, check_live4) || check_patterndial4_once(&pos, check_live4) || check_patterndiar4_once(&pos, check_live4) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! self "); pos_disp(&pos); }
- if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } != (check_patternfile4_once(&pos, check_live4_opp) || check_patternrank4_once(&pos, check_live4_opp) || check_patterndial4_once(&pos, check_live4_opp) || check_patterndiar4_once(&pos, check_live4_opp) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! opp "); pos_disp(&pos); }
-
- #[target_feature(enable = "avx512f")]
- unsafe {
- let result = check_pattern4_dead_avx512(&pos, check_live4_opp);
-
- let mut temp_check: [__mmask16; 5] = [0; 5];
-
- for i in 0..5 {
-
- let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]);
- let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]);
- let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]);
- let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]);
- let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]);
- let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]);
- let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]);
- let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]);
- let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]);
- let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]);
-
- let check_mask10 = _kor_mask16(check_mask0, check_mask1);
- let check_mask11 = _kor_mask16(check_mask2, check_mask3);
- let check_mask12 = _kor_mask16(check_mask4, check_mask5);
- let check_mask13 = _kor_mask16(check_mask6, check_mask7);
- let check_mask14 = _kor_mask16(check_mask8, check_mask9);
-
- let check_mask16 = _kor_mask16(check_mask10, check_mask11);
- let check_mask17 = _kor_mask16(check_mask12, check_mask13);
- let check_mask18 = _kor_mask16(check_mask16, check_mask17);
- temp_check[i] = _kor_mask16(check_mask18, check_mask14);
-
- }
-
- let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]);
- let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]);
- let check_mask2 = _kor_mask16(check_mask0, check_mask1);
- let check_mask3 = _kor_mask16(check_mask2, temp_check[4]);
-
- let test1: bool = check_patternfile4_dead(&pos, check_live4_opp) || check_patternrank4_dead(&pos, check_live4_opp) || check_patterndial4_dead(&pos, check_live4_opp) || check_patterndiar4_dead(&pos, check_live4_opp);
-
- let mut test2: bool = true;
-
- if check_mask3 > 0 { test2 = true; } else { test2 = false; }
-
- if test1 != test2 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead !!!!!! opp "); pos_disp(&pos); }
- }
-
- #[target_feature(enable = "avx512f")]
- unsafe {
- let result = check_pattern4_dead_avx512(&pos, check_live4);
-
- let mut count: i32 = 0;
-
- for i in 0..5 {
- for j in 0..4 {
- for k in 0..5 {
- count += _popcnt32(result[i][j][k] as i32);
- }
- }
- }
-
- let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4);
- let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4);
- let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4);
- let c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4);
-
-
- if (c4f+c4r+c4dl+c4dr) != count { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead_count !!!!!! opp org = {}, new = {}", c4f+c4r+c4dl+c4dr, count); pos_disp(&pos); }
- }
-
- #[target_feature(enable = "avx512f")]
- unsafe {
- let result = check_pattern3_live_avx512(&pos, check_live4);
-
- let mut count: i32 = 0;
-
- for i in 0..3 {
- for j in 0..4 {
- for k in 0..5 {
- count += _popcnt32(result[i][j][k] as i32);
- }
- }
- }
-
- let c3f: i32 = check_patternfile3_live_n(&pos, check_live4);
- let c3r: i32 = check_patternrank3_live_n(&pos, check_live4);
- let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4);
- let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4);
-
- let mut count1: i32 = 0;
-
- count1 = c3f+c3r+c3dl+c3dr;
-
- if count != count1 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! live3_dead !!!!!! self org = {}, new = {}", count1, count); pos_disp(&pos); }
- }
-*/
-
- if check_patternfile4_once(&pos, check_live4) ||
- check_patternrank4_once(&pos, check_live4) ||
- check_patterndial4_once(&pos, check_live4) ||
- check_patterndiar4_once(&pos, check_live4) { return -4096 }
-
- //if unsafe { check_pattern4_once_avx512(&pos, check_live4) } { return -4096 }
-
- if check_patternfile4_once(&pos, check_live4_opp) ||
- check_patternrank4_once(&pos, check_live4_opp) ||
- check_patterndial4_once(&pos, check_live4_opp) ||
- check_patterndiar4_once(&pos, check_live4_opp) { return 2560 }
-
- //if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } { return 2560 }
-
- if check_patternfile4_dead(&pos, check_live4_opp) ||
- check_patternrank4_dead(&pos, check_live4_opp) ||
- check_patterndial4_dead(&pos, check_live4_opp) ||
- check_patterndiar4_dead(&pos, check_live4_opp) { return 2560 }
-
- /*#[target_feature(enable = "avx512f")]
- unsafe {
- let result = check_pattern4_dead_avx512(&pos, check_live4_opp);
-
- let mut temp_check: [__mmask16; 5] = [0; 5];
-
- for i in 0..5 {
- let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]);
- let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]);
- let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]);
- let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]);
- let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]);
- let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]);
- let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]);
- let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]);
- let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]);
- let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]);
-
- let check_mask10 = _kor_mask16(check_mask0, check_mask1);
- let check_mask11 = _kor_mask16(check_mask2, check_mask3);
- let check_mask12 = _kor_mask16(check_mask4, check_mask5);
- let check_mask13 = _kor_mask16(check_mask6, check_mask7);
- let check_mask14 = _kor_mask16(check_mask8, check_mask9);
-
- let check_mask16 = _kor_mask16(check_mask10, check_mask11);
- let check_mask17 = _kor_mask16(check_mask12, check_mask13);
- let check_mask18 = _kor_mask16(check_mask16, check_mask17);
- temp_check[i] = _kor_mask16(check_mask18, check_mask14);
- }
-
- let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]);
- let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]);
- let check_mask2 = _kor_mask16(check_mask0, check_mask1);
- let check_mask3 = _kor_mask16(check_mask2, temp_check[4]);
-
- if check_mask3 > 0 { return 2560 }
- }
-*/
- // 4,3
- let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4);
- let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4);
- let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4);
- let c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4);
-
- let c3f: i32 = check_patternfile3_live_n(&pos, check_live4);
- let c3r: i32 = check_patternrank3_live_n(&pos, check_live4);
- let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4);
- let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4);
-
- let n_c4: i32 = c4f + c4r + c4dl + c4dr;
-
- if n_c4 > 1 { return -2048 }
-
- if n_c4 == 1 && ( c3f+c3r+c3dl+c3dr > 0 ) { return -3048 }
-
-/*
- #[target_feature(enable = "avx512f")]
- unsafe {
- let result = check_pattern4_dead_avx512(&pos, check_live4);
-
- let mut count4: i32 = 0;
-
- for i in 0..5 {
- for j in 0..4 {
- for k in 0..5 {
- count4 += _popcnt32(result[i][j][k] as i32);
- }
- }
- }
-
- if count4 > 1 { return -2048 }
- else if count4 == 1 {
-
- let result = check_pattern3_live_avx512(&pos, check_live4);
-
- let mut count3: i32 = 0;
-
- for i in 0..3 {
- for j in 0..4 {
- for k in 0..5 {
- count3 += _popcnt32(result[i][j][k] as i32);
- }
- }
- }
-
- if count3 > 0 { return -3048 }
- }
- }
- */
- //---------------------------------------------------------------------------
-
- let c3f_opp = check_patternfile3_live_n(&pos, check_live4_opp);
- let c3r_opp = check_patternrank3_live_n(&pos, check_live4_opp);
- let c3dl_opp = check_patterndial3_live_n(&pos, check_live4_opp);
- let c3dr_opp = check_patterndiar3_live_n(&pos, check_live4_opp);
- if c3f_opp + c3r_opp + c3dl_opp + c3dr_opp > 1 { return 2560 }
-
- if c3f + c3r + c3dl + c3dr > 1 { return -2048 }
- /*
- #[target_feature(enable = "avx512f")]
- unsafe {
- let result = check_pattern3_live_avx512(&pos, check_live4_opp);
-
- let mut count: i32 = 0;
-
- for i in 0..3 {
- for j in 0..4 {
- for k in 0..5 {
- count += _popcnt32(result[i][j][k] as i32);
- }
- }
- }
-
- let c3f: i32 = check_patternfile3_live_n(&pos, check_live4_opp);
- let c3r: i32 = check_patternrank3_live_n(&pos, check_live4_opp);
- let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4_opp);
- let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4_opp);
-
- let mut count1: i32 = 0;
-
- count1 = c3f+c3r+c3dl+c3dr;
-
- if count1 > 1 { return -2048 }
- }
-*/
- 0
-}
-
-
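-// Pattern scanners. The *_once functions look for an open four (empty, four
-// stones, empty) in one direction; the *_dead variants detect or count fours
-// with a single empty square in the window; the *3_live_n functions count
-// open threes, including broken ones.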
-fn check_patternfile4_once(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..RANK_SIZE {
- for fl in 0..(FILE_SIZE - 5) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNFILE4[0];
- let idx1 = sq + PATTERNFILE4[1];
- let idx2 = sq + PATTERNFILE4[2];
- let idx3 = sq + PATTERNFILE4[3];
- let idx4 = sq + PATTERNFILE4[4];
- let idx5 = sq + PATTERNFILE4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
- }
- }
-
- false
-}
-
-fn check_patternrank4_once(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..(RANK_SIZE - 5) {
- for fl in 0..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNRANK4[0];
- let idx1 = sq + PATTERNRANK4[1];
- let idx2 = sq + PATTERNRANK4[2];
- let idx3 = sq + PATTERNRANK4[3];
- let idx4 = sq + PATTERNRANK4[4];
- let idx5 = sq + PATTERNRANK4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
- }
- }
-
- false
-}
-
-fn check_patterndial4_once(pos: &Pos, sd : Side) -> bool {
-
- for rk in 0..(RANK_SIZE - 5) {
- for fl in 0..(FILE_SIZE - 5) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAL4[0];
- let idx1 = sq + PATTERNDIAL4[1];
- let idx2 = sq + PATTERNDIAL4[2];
- let idx3 = sq + PATTERNDIAL4[3];
- let idx4 = sq + PATTERNDIAL4[4];
- let idx5 = sq + PATTERNDIAL4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
- }
- }
-
- false
-}
-
-fn check_patterndiar4_once(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..(RANK_SIZE - 5) {
- for fl in 5..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAR4[0];
- let idx1 = sq + PATTERNDIAR4[1];
- let idx2 = sq + PATTERNDIAR4[2];
- let idx3 = sq + PATTERNDIAR4[3];
- let idx4 = sq + PATTERNDIAR4[4];
- let idx5 = sq + PATTERNDIAR4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
- }
- }
-
- false
-}
-
-fn check_patternfile4_dead(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..RANK_SIZE {
- for fl in 0..(FILE_SIZE - 4) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNFILE4[0];
- let idx1 = sq + PATTERNFILE4[1];
- let idx2 = sq + PATTERNFILE4[2];
- let idx3 = sq + PATTERNFILE4[3];
- let idx4 = sq + PATTERNFILE4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
- if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true }
- if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true }
- if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true }
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
- }
- }
-
- false
-}
-
-fn check_patternrank4_dead(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 0..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNRANK4[0];
- let idx1 = sq + PATTERNRANK4[1];
- let idx2 = sq + PATTERNRANK4[2];
- let idx3 = sq + PATTERNRANK4[3];
- let idx4 = sq + PATTERNRANK4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
- if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true }
- if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true }
- if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true }
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
- }
- }
-
- false
-}
-
-fn check_patterndial4_dead(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 0..(FILE_SIZE - 4) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAL4[0];
- let idx1 = sq + PATTERNDIAL4[1];
- let idx2 = sq + PATTERNDIAL4[2];
- let idx3 = sq + PATTERNDIAL4[3];
- let idx4 = sq + PATTERNDIAL4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
- if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true }
- if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true }
- if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true }
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
- }
- }
-
- false
-}
-
-fn check_patterndiar4_dead(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 4..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAR4[0];
- let idx1 = sq + PATTERNDIAR4[1];
- let idx2 = sq + PATTERNDIAR4[2];
- let idx3 = sq + PATTERNDIAR4[3];
- let idx4 = sq + PATTERNDIAR4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
- if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true }
- if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true }
- if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true }
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
- }
- }
-
- false
-}
-
-
-fn check_patternfile4_dead_n(pos: &Pos, sd: Side) -> i32 {
-
- let mut n: i32 = 0;
-
- for rk in 0..RANK_SIZE {
- for fl in 0..(FILE_SIZE - 4) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNFILE4[0];
- let idx1 = sq + PATTERNFILE4[1];
- let idx2 = sq + PATTERNFILE4[2];
- let idx3 = sq + PATTERNFILE4[3];
- let idx4 = sq + PATTERNFILE4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
- if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; }
- if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; }
- if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; }
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; }
- }
- }
-
- n
-}
-
-fn check_patternrank4_dead_n(pos: &Pos, sd: Side) -> i32 {
-
- let mut n: i32 = 0;
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 0..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNRANK4[0];
- let idx1 = sq + PATTERNRANK4[1];
- let idx2 = sq + PATTERNRANK4[2];
- let idx3 = sq + PATTERNRANK4[3];
- let idx4 = sq + PATTERNRANK4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
- if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; }
- if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; }
- if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; }
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; }
- }
- }
-
- n
-}
-
-fn check_patterndial4_dead_n(pos: &Pos, sd: Side) -> i32 {
-
- let mut n: i32 = 0;
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 0..(FILE_SIZE - 4) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAL4[0];
- let idx1 = sq + PATTERNDIAL4[1];
- let idx2 = sq + PATTERNDIAL4[2];
- let idx3 = sq + PATTERNDIAL4[3];
- let idx4 = sq + PATTERNDIAL4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
- if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; }
- if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; }
- if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; }
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; }
- }
- }
-
- n
-}
-
-fn check_patterndiar4_dead_n(pos: &Pos, sd: Side) -> i32 {
-
- let mut n: i32 = 0;
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 4..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAR4[0];
- let idx1 = sq + PATTERNDIAR4[1];
- let idx2 = sq + PATTERNDIAR4[2];
- let idx3 = sq + PATTERNDIAR4[3];
- let idx4 = sq + PATTERNDIAR4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
- if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; }
- if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; }
- if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; }
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; }
- }
- }
-
- n
-}
-
-
-/*fn check_patternfile3_live(pos: &Pos, sd: Side) -> bool {
-
- let last_move: Move = pos.p_last;
-
- let mut n: i32 = 0;
-
- for rk in 0..RANK_SIZE {
- for fl in 0..(FILE_SIZE - 4) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNFILE4[0];
- let idx1 = sq + PATTERNFILE4[1];
- let idx2 = sq + PATTERNFILE4[2];
- let idx3 = sq + PATTERNFILE4[3];
- let idx4 = sq + PATTERNFILE4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
- }
- }
-
- for rk in 0..RANK_SIZE {
- for fl in 0..(FILE_SIZE - 5) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNFILE4[0];
- let idx1 = sq + PATTERNFILE4[1];
- let idx2 = sq + PATTERNFILE4[2];
- let idx3 = sq + PATTERNFILE4[3];
- let idx4 = sq + PATTERNFILE4[4];
- let idx5 = sq + PATTERNFILE4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true }
- if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
- }
- }
-
- false
-}
-
-fn check_patternrank3_live(pos: &Pos, sd: Side) -> bool {
-
- let last_move: Move = pos.p_last;
-
- // let mut n: i32 = 0;
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 0..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNRANK4[0];
- let idx1 = sq + PATTERNRANK4[1];
- let idx2 = sq + PATTERNRANK4[2];
- let idx3 = sq + PATTERNRANK4[3];
- let idx4 = sq + PATTERNRANK4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
- }
- }
-
- for rk in 0..(RANK_SIZE - 5) {
- for fl in 0..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNRANK4[0];
- let idx1 = sq + PATTERNRANK4[1];
- let idx2 = sq + PATTERNRANK4[2];
- let idx3 = sq + PATTERNRANK4[3];
- let idx4 = sq + PATTERNRANK4[4];
- let idx5 = sq + PATTERNRANK4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true }
- if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
- }
- }
-
- false
-}
-
-fn check_patterndial3_live(pos: &Pos, sd: Side) -> bool {
-
- let last_move: Move = pos.p_last;
-
- //let mut n: i32 = 0;
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 0..(FILE_SIZE - 4) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAL4[0];
- let idx1 = sq + PATTERNDIAL4[1];
- let idx2 = sq + PATTERNDIAL4[2];
- let idx3 = sq + PATTERNDIAL4[3];
- let idx4 = sq + PATTERNDIAL4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
- }
- }
-
- for rk in 0..(RANK_SIZE - 5) {
- for fl in 0..(FILE_SIZE - 5) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAL4[0];
- let idx1 = sq + PATTERNDIAL4[1];
- let idx2 = sq + PATTERNDIAL4[2];
- let idx3 = sq + PATTERNDIAL4[3];
- let idx4 = sq + PATTERNDIAL4[4];
- let idx5 = sq + PATTERNDIAL4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true }
- if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
- }
- }
-
- false
-}
-
-fn check_patterndiar3_live(pos: &Pos, sd: Side) -> bool {
-
- let last_move: Move = pos.p_last;
-
- //let mut n: i32 = 0;
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 4..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAR4[0];
- let idx1 = sq + PATTERNDIAR4[1];
- let idx2 = sq + PATTERNDIAR4[2];
- let idx3 = sq + PATTERNDIAR4[3];
- let idx4 = sq + PATTERNDIAR4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true }
- }
- }
-
- for rk in 0..(RANK_SIZE - 5) {
- for fl in 5..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAR4[0];
- let idx1 = sq + PATTERNDIAR4[1];
- let idx2 = sq + PATTERNDIAR4[2];
- let idx3 = sq + PATTERNDIAR4[3];
- let idx4 = sq + PATTERNDIAR4[4];
- let idx5 = sq + PATTERNDIAR4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true }
- if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true }
- }
- }
-
- false
-}
-*/
-
-fn check_patternfile3_live_n(pos: &Pos, sd: Side) -> i32 {
-
- let last_move: Move = pos.p_last;
-
- let mut n: i32 = 0;
-
- for rk in 0..RANK_SIZE {
- for fl in 0..(FILE_SIZE - 4) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNFILE4[0];
- let idx1 = sq + PATTERNFILE4[1];
- let idx2 = sq + PATTERNFILE4[2];
- let idx3 = sq + PATTERNFILE4[3];
- let idx4 = sq + PATTERNFILE4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n +=1 ; }
- }
- }
-
- for rk in 0..RANK_SIZE {
- for fl in 0..(FILE_SIZE - 5) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNFILE4[0];
- let idx1 = sq + PATTERNFILE4[1];
- let idx2 = sq + PATTERNFILE4[2];
- let idx3 = sq + PATTERNFILE4[3];
- let idx4 = sq + PATTERNFILE4[4];
- let idx5 = sq + PATTERNFILE4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; }
- if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; }
- }
- }
-
- n
-}
-
-fn check_patternrank3_live_n(pos: &Pos, sd: Side) -> i32 {
-
- let last_move: Move = pos.p_last;
-
- let mut n: i32 = 0;
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 0..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNRANK4[0];
- let idx1 = sq + PATTERNRANK4[1];
- let idx2 = sq + PATTERNRANK4[2];
- let idx3 = sq + PATTERNRANK4[3];
- let idx4 = sq + PATTERNRANK4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
- }
- }
-
- for rk in 0..(RANK_SIZE - 5) {
- for fl in 0..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNRANK4[0];
- let idx1 = sq + PATTERNRANK4[1];
- let idx2 = sq + PATTERNRANK4[2];
- let idx3 = sq + PATTERNRANK4[3];
- let idx4 = sq + PATTERNRANK4[4];
- let idx5 = sq + PATTERNRANK4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; }
- if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; }
- }
- }
-
- n
-}
-
-fn check_patterndial3_live_n(pos: &Pos, sd: Side) -> i32 {
-
- let last_move: Move = pos.p_last;
-
- let mut n: i32 = 0;
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 0..(FILE_SIZE - 4) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAL4[0];
- let idx1 = sq + PATTERNDIAL4[1];
- let idx2 = sq + PATTERNDIAL4[2];
- let idx3 = sq + PATTERNDIAL4[3];
- let idx4 = sq + PATTERNDIAL4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
- }
- }
-
- for rk in 0..(RANK_SIZE - 5) {
- for fl in 0..(FILE_SIZE - 5) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAL4[0];
- let idx1 = sq + PATTERNDIAL4[1];
- let idx2 = sq + PATTERNDIAL4[2];
- let idx3 = sq + PATTERNDIAL4[3];
- let idx4 = sq + PATTERNDIAL4[4];
- let idx5 = sq + PATTERNDIAL4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; }
- if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; }
- }
- }
-
- n
-}
-
-fn check_patterndiar3_live_n(pos: &Pos, sd: Side) -> i32 {
-
- let last_move: Move = pos.p_last;
-
- let mut n: i32 = 0;
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 4..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAR4[0];
- let idx1 = sq + PATTERNDIAR4[1];
- let idx2 = sq + PATTERNDIAR4[2];
- let idx3 = sq + PATTERNDIAR4[3];
- let idx4 = sq + PATTERNDIAR4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; }
- }
- }
-
- for rk in 0..(RANK_SIZE - 5) {
- for fl in 5..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAR4[0];
- let idx1 = sq + PATTERNDIAR4[1];
- let idx2 = sq + PATTERNDIAR4[2];
- let idx3 = sq + PATTERNDIAR4[3];
- let idx4 = sq + PATTERNDIAR4[4];
- let idx5 = sq + PATTERNDIAR4[5];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
- let val5 = pos.state[idx5 as usize];
-
- if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; }
- if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; }
- }
- }
-
- n
-}
-
-#[target_feature(enable = "avx512f")]
-unsafe fn pos_is_winner_avx512(pos : &Pos) -> bool {
-
- let current_side = side_opp(pos.p_turn);
-
- let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) );
-
- let answer_mask: __mmask16 = 0b01111111_11111111;
-
- let coloridx = current_side as usize;
-
- let mut temp_mask: [[__mmask16; 5]; 4] = [[0; 5]; 4];
-
- for dir in 0..4 {
- let board0 = _mm512_set_epi32(0, pos.bitboard[coloridx][dir][14], pos.bitboard[coloridx][dir][13], pos.bitboard[coloridx][dir][12], pos.bitboard[coloridx][dir][11], pos.bitboard[coloridx][dir][10], pos.bitboard[coloridx][dir][9], pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]);
-
- let boardf = _mm512_and_epi32(answer, board0);
-
- temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above
-
- for i in 1..5 {
-
- let board1 = _mm512_rol_epi32(board0, i);
-
- let boardf = _mm512_and_epi32(answer, board1);
-
- temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above
- }
- }
-
- let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]);
- let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]);
- let check_mask2: __mmask16 = _kor_mask16(temp_mask[0][4], temp_mask[1][0]);
- let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][1], temp_mask[1][2]);
- let check_mask4: __mmask16 = _kor_mask16(temp_mask[1][3], temp_mask[1][4]);
- let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]);
- let check_mask6: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]);
- let check_mask7: __mmask16 = _kor_mask16(temp_mask[2][4], temp_mask[3][0]);
- let check_mask8: __mmask16 = _kor_mask16(temp_mask[3][1], temp_mask[3][2]);
- let check_mask9: __mmask16 = _kor_mask16(temp_mask[3][3], temp_mask[3][4]);
-
- let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1);
- let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3);
- let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5);
- let check_mask13: __mmask16 = _kor_mask16(check_mask6, check_mask7);
- let check_mask14: __mmask16 = _kor_mask16(check_mask8, check_mask9);
-
- let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11);
- let check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13);
- let check_mask18: __mmask16 = _kor_mask16(check_mask16, check_mask17);
- let check_mask19: __mmask16 = _kor_mask16(check_mask18, check_mask14);
-
- if check_mask19 > 0 { return true } else { return false }
-}
-
-#[target_feature(enable = "avx512f")]
-unsafe fn check_pattern4_once_avx512(pos : &Pos, sd: Side) -> bool {
-
- //let current_side = side_opp(sd);
-
- let answer_color = _mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) );
- let answer_empty = _mm512_set1_epi32( (1<<31)| (1<<26) );
- let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26));
-
- let answer_mask: __mmask16 = 0b00000001_11111111;
-
- //let coloridx = current_side as usize;
- let coloridx = sd as usize;
- let emptyidx = Color::Empty as usize;
-
- let mut temp_mask: [[__mmask16; 4]; 4] = [[0; 4]; 4];
-
- for dir in 0..4 {
-
- let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]);
-
- let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]);
-
- let boardf1 = _mm512_and_epi32(answer_color, board0);// check sd
- let boardf2 = _mm512_and_epi32(answer_empty, board1);// check empty
- let boardf = _mm512_or_epi32(boardf1, boardf2);
-
- temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above
-
- for i in 1..4 { //only move 3 times
-
- let board2 = _mm512_rol_epi32(board0, i);//rot sd
- let board3 = _mm512_rol_epi32(board1, i);//rot empty
-
- let boardf1 = _mm512_and_epi32(answer_color, board2);
- let boardf2 = _mm512_and_epi32(answer_empty, board3);
- let boardf = _mm512_or_epi32(boardf1, boardf2);
-
- temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above
- }
- }
-
- let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]);
- let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]);
- let check_mask2: __mmask16 = _kor_mask16(temp_mask[1][0], temp_mask[1][1]);
- let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][2], temp_mask[1][3]);
- let check_mask4: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]);
- let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]);
- let check_mask6: __mmask16 = _kor_mask16(temp_mask[3][0], temp_mask[3][1]);
- let check_mask7: __mmask16 = _kor_mask16(temp_mask[3][2], temp_mask[3][3]);
-
- let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1);
- let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3);
- let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5);
- let check_mask13: __mmask16 = _kor_mask16(check_mask6, check_mask7);
-
- let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11);
- let check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13);
- let check_mask19: __mmask16 = _kor_mask16(check_mask16, check_mask17);
-
- if check_mask19 > 0 { return true } else { return false }
-}
-
-#[target_feature(enable = "avx512f")]
-unsafe fn check_pattern4_dead_avx512(pos : &Pos, sd: Side) -> [[[__mmask16; 5]; 4]; 5] {
-
- //let current_side = side_opp(sd);
-
- let answer_color: [__m512i; 5] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) ),
- _mm512_set1_epi32( (1<<31)| (1<<29)|(1<<28)|(1<<27) ),
- _mm512_set1_epi32( (1<<31)|(1<<30) |(1<<28)|(1<<27) ),
- _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29) |(1<<27) ),
- _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28) )];
-
- let answer_empty: [__m512i; 5]= [_mm512_set1_epi32( (1<<31) ),
- _mm512_set1_epi32( (1<<30) ),
- _mm512_set1_epi32( (1<<29) ),
- _mm512_set1_epi32( (1<<28) ),
- _mm512_set1_epi32( (1<<27) )];
-
- let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27));
-
- let answer_mask: __mmask16 = 0b00000001_11111111;
-
- //let coloridx = current_side as usize;
- let coloridx = sd as usize;
- let emptyidx = Color::Empty as usize;
-
- let mut temp_mask: [[[__mmask16; 5]; 4]; 5] = [[[0; 5]; 4]; 5];
-
- for pattern in 0..5 {
-
- for dir in 0..4 {
-
- let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]);
-
- let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]);
-
- let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd
- let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty
- let boardf = _mm512_or_epi32(boardf1, boardf2);
-
- temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above
-
- for i in 1..5 { //only move 4 times
-
- let board2 = _mm512_rol_epi32(board0, i);//rot sd
- let board3 = _mm512_rol_epi32(board1, i);//rot empty
-
- let boardf1 = _mm512_and_epi32(answer_color[pattern], board2);
- let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3);
- let boardf = _mm512_or_epi32(boardf1, boardf2);
-
- temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above
- }
- }
- }
-
- temp_mask
-}
-
-
-#[target_feature(enable = "avx512f")]
-unsafe fn check_pattern3_live_avx512(pos : &Pos, sd: Side) -> [[[__mmask16; 5]; 4]; 3] {
-
- //let current_side = side_opp(sd);
-
- let answer_color: [__m512i; 3] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28) ),
- _mm512_set1_epi32( (1<<30)| (1<<28)|(1<<27) ),
- _mm512_set1_epi32( (1<<30)|(1<<29) |(1<<27) )];
-
- let answer_empty: [__m512i; 3]= [_mm512_set1_epi32( (1<<31)| (1<<27) ),
- _mm512_set1_epi32( (1<<31)| (1<<29)| (1<<26) ),
- _mm512_set1_epi32( (1<<31)| (1<<28)| (1<<26) )];
-
- //let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27));
- let answer: [__m512i; 3] = [_mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) ),
- _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) ),
- _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) )];
-
- let answer_mask: __mmask16 = 0b00000001_11111111;
-
- //let coloridx = current_side as usize;
- let coloridx = sd as usize;
- let emptyidx = Color::Empty as usize;
-
- let mut temp_mask: [[[__mmask16; 5]; 4]; 3] = [[[0; 5]; 4]; 3];
-
- for pattern in 0..3 {
-
- for dir in 0..4 {
-
- let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]);
-
- let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]);
-
- let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd
- let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty
- let boardf = _mm512_or_epi32(boardf1, boardf2);
-
- temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0);//no need answer_mask, because and above
-
- for i in 1..5 { //only move 4 times
-
- let board2 = _mm512_rol_epi32(board0, i);//rot sd
- let board3 = _mm512_rol_epi32(board1, i);//rot empty
-
- let boardf1 = _mm512_and_epi32(answer_color[pattern], board2);
- let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3);
- let boardf = _mm512_or_epi32(boardf1, boardf2);
-
- temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0);//no need answer_mask, because and above
- }
- }
- }
-
- temp_mask
-}
-
-fn check_patternfile5(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..RANK_SIZE {
- for fl in 0..(FILE_SIZE - 4) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNFILE4[0];
- let idx1 = sq + PATTERNFILE4[1];
- let idx2 = sq + PATTERNFILE4[2];
- let idx3 = sq + PATTERNFILE4[3];
- let idx4 = sq + PATTERNFILE4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
- }
- }
-
- false
-}
-
-fn check_patternrank5(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 0..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNRANK4[0];
- let idx1 = sq + PATTERNRANK4[1];
- let idx2 = sq + PATTERNRANK4[2];
- let idx3 = sq + PATTERNRANK4[3];
- let idx4 = sq + PATTERNRANK4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
- }
- }
-
- false
-}
-
-fn check_patterndial5(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 0..(FILE_SIZE - 4) {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAL4[0];
- let idx1 = sq + PATTERNDIAL4[1];
- let idx2 = sq + PATTERNDIAL4[2];
- let idx3 = sq + PATTERNDIAL4[3];
- let idx4 = sq + PATTERNDIAL4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
- }
- }
-
- false
-}
-
-fn check_patterndiar5(pos: &Pos, sd: Side) -> bool {
-
- for rk in 0..(RANK_SIZE - 4) {
- for fl in 4..FILE_SIZE {
- let sq : Square = square_make(fl, rk);
-
- let idx0 = sq + PATTERNDIAR4[0];
- let idx1 = sq + PATTERNDIAR4[1];
- let idx2 = sq + PATTERNDIAR4[2];
- let idx3 = sq + PATTERNDIAR4[3];
- let idx4 = sq + PATTERNDIAR4[4];
-
- let val0 = pos.state[idx0 as usize];
- let val1 = pos.state[idx1 as usize];
- let val2 = pos.state[idx2 as usize];
- let val3 = pos.state[idx3 as usize];
- let val4 = pos.state[idx4 as usize];
-
- if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true }
- }
- }
-
- false
-}
-
-fn main() {
-
- loop
- {
-
- let start = Instant::now();
-
- println!("Hello, this is connect 6!");
-
- //unsafe { test_avx512(); }
-
- let test_state: [Color; SQUARE_SIZE as usize] = [Color::Empty; SQUARE_SIZE as usize];
-
- let test_bitboard: [[[i32; FILE_SIZE as usize]; 4]; 3] = [[[0; FILE_SIZE as usize]; 4]; 3];
-
- let mut test1 = Pos {
- state: test_state,
- p_turn: Color::Black,
- p_last: square_make(5,5),
-
- bitboard: test_bitboard,
- };
-
- test1.init();
-
- //pos_disp(&test1);
-
- for i in 0..(FILE_SIZE*RANK_SIZE) {
-
- // println!("----------------------------------------\n\n\n\n");
- // println!("MOVE {}!!!!\n\n\n\n", i);
-
-
- let mut d = 2;
- let mut e = 4;
-
- //if i < 6 { d = 1; e = 2; }
-
- let next_move: Move = search(&test1, d, e);
- //println!("next move is {}", next_move);
- //println!("file is {}", square_file(next_move));
- //println!("rank is {}", square_rank(next_move));
-
- test1.do_move(next_move);
-
- //pos_disp(&test1);
-
- if pos_is_end(&test1) {
-
- println!("Game over!!!!!!");
- println!("MOVE {}!!!!\n", i);
- //pos_disp(&test1);
-
- break; }
- }
-
-
- let duration = start.elapsed();
-
- println!("Time elapsed in expensive_function() is: {:?}", duration);
- }
-
-
-}
From 38c4f1da99c63acfed6985623074331d1184da5c Mon Sep 17 00:00:00 2001
From: jironglin
Date: Thu, 15 Oct 2020 14:28:45 +0000
Subject: [PATCH 16/25] mm_mask_sub: ss,sd; mm_mask_mul: ss,sd; mm_mask_div:
ss,sd
---
crates/core_arch/avx512f.md | 24 +-
crates/core_arch/src/x86/avx512f.rs | 360 ++++++++++++++++++++++++++++
2 files changed, 372 insertions(+), 12 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 01328a43f8..1f0f9c1460 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -1252,8 +1252,8 @@
* [ ] [`_mm_mask_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sd&expand=5236)
* [ ] [`_mm_mask_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sd&expand=5236)
* [ ] [`_mm_mask_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_ss&expand=5236)
- * [ ] [`_mm_mask_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sd&expand=5236)
- * [ ] [`_mm_mask_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ss&expand=5236)
+ * [x] [`_mm_mask_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sd&expand=5236)
+ * [x] [`_mm_mask_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ss&expand=5236)
* [ ] [`_mm_mask_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_sd&expand=5236)
* [ ] [`_mm_mask_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_ss&expand=5236)
* [ ] [`_mm_mask_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_sd&expand=5236)
@@ -1296,8 +1296,8 @@
* [ ] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236)
* [ ] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236)
* [ ] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236)
- * [ ] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236)
- * [ ] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236)
+ * [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236)
+ * [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236)
* [ ] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236)
* [ ] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236)
* [ ] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236)
@@ -1318,8 +1318,8 @@
* [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss&expand=5236)
* [ ] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236)
* [ ] [`_mm_mask_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_ss&expand=5236)
- * [ ] [`_mm_mask_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sd&expand=5236)
- * [ ] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236)
+ * [x] [`_mm_mask_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sd&expand=5236)
+ * [x] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236)
* [ ] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236)
* [ ] [`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236)
* [x] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236)
@@ -1330,8 +1330,8 @@
* [ ] [`_mm_maskz_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sd&expand=5236)
* [ ] [`_mm_maskz_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sd&expand=5236)
* [ ] [`_mm_maskz_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_ss&expand=5236)
- * [ ] [`_mm_maskz_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sd&expand=5236)
- * [ ] [`_mm_maskz_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ss&expand=5236)
+ * [x] [`_mm_maskz_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sd&expand=5236)
+ * [x] [`_mm_maskz_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ss&expand=5236)
* [ ] [`_mm_maskz_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_sd&expand=5236)
* [ ] [`_mm_maskz_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_ss&expand=5236)
* [ ] [`_mm_maskz_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_sd&expand=5236)
@@ -1374,8 +1374,8 @@
* [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236)
* [ ] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236)
* [ ] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236)
- * [ ] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236)
- * [ ] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236)
+ * [x] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236)
+ * [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236)
* [ ] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236)
* [ ] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236)
* [ ] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236)
@@ -1394,8 +1394,8 @@
* [ ] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236)
* [ ] [`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236)
* [ ] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236)
- * [ ] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236)
- * [ ] [`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236)
+ * [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236)
+ * [x] [`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236)
* [ ] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236)
* [ ] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236)
* [ ] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 7804a8ce44..852abfe349 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -18487,6 +18487,216 @@ pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
transmute(r)
}
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5750)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss))]
+pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let extractsrc: f32 = simd_extract(src, 0);
+ let mut add: f32 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ add = extracta - extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5751)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss))]
+pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mut add: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ add = extracta - extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sd&expand=5746)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd))]
+pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let extractsrc: f64 = simd_extract(src, 0);
+ let mut add: f64 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ add = extracta - extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5747)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd))]
+pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mut add: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ add = extracta - extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
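+// Illustrative usage sketch for the masked scalar subtracts above, assuming the
+// plain SSE helpers `_mm_set_ss` and `_mm_cvtss_f32`: only bit 0 of `k` is
+// consulted, the write-masked form falls back to the lower lane of `src`, the
+// zero-masked form zeroes it, and the upper lanes are always copied from `a`.
+//
+//     let src = _mm_set_ss(7.0);
+//     let a = _mm_set_ss(5.0);
+//     let b = _mm_set_ss(3.0);
+//     assert_eq!(_mm_cvtss_f32(_mm_mask_sub_ss(src, 0, a, b)), 7.0); // copied from src
+//     assert_eq!(_mm_cvtss_f32(_mm_maskz_sub_ss(0, a, b)), 0.0);     // zeroed out
+//     assert_eq!(_mm_cvtss_f32(_mm_mask_sub_ss(src, 1, a, b)), 2.0); // 5.0 - 3.0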
+
+/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=3950)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss))]
+pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let extractsrc: f32 = simd_extract(src, 0);
+ let mut add: f32 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ add = extracta * extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=3951)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss))]
+pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mut add: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ add = extracta * extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=3947)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulsd))]
+pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let extractsrc: f64 = simd_extract(src, 0);
+ let mut add: f64 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ add = extracta * extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=3948)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulsd))]
+pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mut add: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ add = extracta * extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
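+// A similar sketch for the double-precision multiply forms, assuming the SSE2
+// helpers `_mm_set_sd` and `_mm_cvtsd_f64`; the upper lane always comes from `a`.
+//
+//     let a = _mm_set_sd(6.0);
+//     let b = _mm_set_sd(3.0);
+//     assert_eq!(_mm_cvtsd_f64(_mm_maskz_mul_sd(1, a, b)), 18.0); // 6.0 * 3.0
+//     assert_eq!(_mm_cvtsd_f64(_mm_maskz_mul_sd(0, a, b)), 0.0);  // masked out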
+
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ss&expand=2181)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivss))]
+pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let extractsrc: f32 = simd_extract(src, 0);
+ let mut add: f32 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ add = extracta / extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ss&expand=2182)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivss))]
+pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mut add: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ add = extracta / extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sd&expand=2178)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd))]
+pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let extractsrc: f64 = simd_extract(src, 0);
+ let mut add: f64 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ add = extracta / extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
+
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sd&expand=2179)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd))]
+pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mut add: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ add = extracta / extractb;
+ }
+ let r = simd_insert(a, 0, add);
+ transmute(r)
+}
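+// And for the write-masked divide, again assuming `_mm_set_sd`/`_mm_cvtsd_f64`:
+// with mask bit 0 clear the lower lane of `src` is kept, otherwise a / b is stored.
+//
+//     let src = _mm_set_sd(9.0);
+//     let a = _mm_set_sd(1.0);
+//     let b = _mm_set_sd(4.0);
+//     assert_eq!(_mm_cvtsd_f64(_mm_mask_div_sd(src, 0, a, b)), 9.0);
+//     assert_eq!(_mm_cvtsd_f64(_mm_mask_div_sd(src, 1, a, b)), 0.25);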
+
/// Equal
pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00;
/// Less-than
@@ -28122,4 +28332,154 @@ mod tests {
let e = _mm_set_pd(1., 6.);
assert_eq_m128d(r, e);
}
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sub_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_sub_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_sub_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., -20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sub_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_sub_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_sub_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., -20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sub_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_sub_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_sub_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sub_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_sub_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_sub_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_mul_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_mul_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_mul_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 800.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_mul_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_mul_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_mul_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 800.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_mul_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_mul_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_mul_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_mul_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_mul_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_mul_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_div_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_div_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_div_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_div_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_div_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_div_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_div_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_div_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_div_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_div_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_div_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_div_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
}
From 07a2ee9a1b35d05a6c6adab21c486c4453360a48 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Thu, 15 Oct 2020 15:40:40 +0000
Subject: [PATCH 17/25] add_round: ss,sd; sub_round: ss,sd; mul_round: ss,sd;
div_round: ss,sd
---
crates/core_arch/avx512f.md | 44 +-
crates/core_arch/src/x86/avx512f.rs | 1053 +++++++++++++++++++++++++++
crates/core_arch/src/x86/mod.rs | 18 +
3 files changed, 1093 insertions(+), 22 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 1f0f9c1460..cd93fa26b9 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -1133,8 +1133,8 @@
* [x] [`_mm512_zextps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextps256_ps512&expand=5236)
* [x] [`_mm512_zextsi128_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextsi128_si512&expand=5236)
* [x] [`_mm512_zextsi256_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextsi256_si512&expand=5236)
- * [ ] [`_mm_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=5236)
- * [ ] [`_mm_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=5236)
+ * [x] [`_mm_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=5236)
+ * [x] [`_mm_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=5236)
* [x] [`_mm_cmp_round_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sd_mask&expand=5236)
* [x] [`_mm_cmp_round_ss_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_ss_mask&expand=5236)
* [x] [`_mm_cmp_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd_mask&expand=5236)
@@ -1200,8 +1200,8 @@
* [ ] [`_mm_cvtu32_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_ss&expand=5236)
* [ ] [`_mm_cvtu64_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_sd&expand=5236)
* [ ] [`_mm_cvtu64_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_ss&expand=5236)
- * [ ] [`_mm_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sd&expand=5236)
- * [ ] [`_mm_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_ss&expand=5236)
+ * [x] [`_mm_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sd&expand=5236)
+ * [x] [`_mm_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_ss&expand=5236)
* [ ] [`_mm_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_round_sd&expand=5236)
* [ ] [`_mm_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_round_ss&expand=5236)
* [ ] [`_mm_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_sd&expand=5236)
@@ -1238,8 +1238,8 @@
* [ ] [`_mm_mask3_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_ss&expand=5236)
* [ ] [`_mm_mask3_fnmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sd&expand=5236)
* [ ] [`_mm_mask3_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ss&expand=5236)
- * [ ] [`_mm_mask_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sd&expand=5236)
- * [ ] [`_mm_mask_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_ss&expand=5236)
+ * [x] [`_mm_mask_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sd&expand=5236)
+ * [x] [`_mm_mask_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_ss&expand=5236)
* [x] [`_mm_mask_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sd&expand=5236)
* [x] [`_mm_mask_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ss&expand=5236)
* [x] [`_mm_mask_cmp_round_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=5236)
@@ -1250,8 +1250,8 @@
* [ ] [`_mm_mask_cvt_roundss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sd&expand=5236)
* [ ] [`_mm_mask_cvtsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_ss&expand=5236)
* [ ] [`_mm_mask_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sd&expand=5236)
- * [ ] [`_mm_mask_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sd&expand=5236)
- * [ ] [`_mm_mask_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_ss&expand=5236)
+ * [x] [`_mm_mask_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sd&expand=5236)
+ * [x] [`_mm_mask_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_ss&expand=5236)
* [x] [`_mm_mask_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sd&expand=5236)
* [x] [`_mm_mask_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ss&expand=5236)
* [ ] [`_mm_mask_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_sd&expand=5236)
@@ -1316,20 +1316,20 @@
* [ ] [`_mm_mask_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ss&expand=5236)
* [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd&expand=5236)
* [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss&expand=5236)
- * [ ] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236)
- * [ ] [`_mm_mask_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_ss&expand=5236)
+ * [x] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236)
+ * [x] [`_mm_mask_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_ss&expand=5236)
* [x] [`_mm_mask_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sd&expand=5236)
* [x] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236)
- * [ ] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236)
- * [ ] [`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236)
+ * [x] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236)
+ * [x] [`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236)
* [x] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236)
* [x] [`_mm_maskz_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ss&expand=5236)
* [ ] [`_mm_maskz_cvt_roundsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_ss&expand=5236)
* [ ] [`_mm_maskz_cvt_roundss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sd&expand=5236)
* [ ] [`_mm_maskz_cvtsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_ss&expand=5236)
* [ ] [`_mm_maskz_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sd&expand=5236)
- * [ ] [`_mm_maskz_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sd&expand=5236)
- * [ ] [`_mm_maskz_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_ss&expand=5236)
+ * [x] [`_mm_maskz_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sd&expand=5236)
+ * [x] [`_mm_maskz_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_ss&expand=5236)
* [x] [`_mm_maskz_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sd&expand=5236)
* [x] [`_mm_maskz_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ss&expand=5236)
* [ ] [`_mm_maskz_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_sd&expand=5236)
@@ -1372,8 +1372,8 @@
* [ ] [`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236)
* [ ] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236)
* [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236)
- * [ ] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236)
- * [ ] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236)
+ * [x] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236)
+ * [x] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236)
* [x] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236)
* [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236)
* [ ] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236)
@@ -1392,16 +1392,16 @@
* [ ] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236)
* [ ] [`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236)
* [ ] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236)
- * [ ] [`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236)
- * [ ] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236)
+ * [x] [`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236)
+ * [x] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236)
* [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236)
* [x] [`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236)
* [ ] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236)
* [ ] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236)
* [ ] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236)
* [ ] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236)
- * [ ] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236)
- * [ ] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236)
+ * [x] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236)
+ * [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236)
* [ ] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236)
* [ ] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236)
* [ ] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236)
@@ -1416,7 +1416,7 @@
* [ ] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236)
* [ ] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236)
* [ ] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236)
- * [ ] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236)
- * [ ] [`_mm_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_ss&expand=5236)
+ * [x] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236)
+ * [x] [`_mm_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_ss&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 852abfe349..4294ec92b5 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -18697,6 +18697,722 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
transmute(r)
}
+/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
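+// Usage note: `rounding` must be a compile-time constant (enforced by
+// `rustc_args_required_const(2)`), e.g.
+//     let r = _mm_add_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+// Passing `_MM_FROUND_CUR_DIRECTION` instead defers to the rounding mode in MXCSR.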
+
+/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_ss&expand=152)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_add_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ rounding: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
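+// Usage sketch (illustrative values): bit 0 of `k` selects between the rounded sum and
+// the lower lane of `src`; the upper lanes always come from `a`:
+//     let r = _mm_mask_add_round_ss(src, 0b0, a, b, _MM_FROUND_CUR_DIRECTION); // lower lane = src[0]
+//     let r = _mm_mask_add_round_ss(src, 0b1, a, b, _MM_FROUND_CUR_DIRECTION); // lower lane = a[0] + b[0]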
+
+/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
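+// Usage sketch (illustrative values): with bit 0 of `k` clear the lower lane of the
+// result is zeroed instead of merged from a source vector:
+//     let r = _mm_maskz_add_round_ss(0b0, a, b, _MM_FROUND_CUR_DIRECTION); // lower lane = 0.0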
+
+/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sd&expand=149)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_add_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_add_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_ss&expand=5745)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_ss&expand=5743)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_sub_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ rounding: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5744)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5742)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5740)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_sub_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5741)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_sub_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=3946)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmulss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=3944)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_mul_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ rounding: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=3945)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmulss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=3943)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmulsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=3941)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_mul_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=3942)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_mul_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmulsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_ss&expand=2174)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vdivss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
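+// Unlike the add/sub examples with small integral values, division is where the chosen
+// rounding mode can change the last bit of the result; the tests below use exact
+// quotients (e.g. 20./40.) so every mode yields 0.5.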
+
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_ss&expand=2175)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_div_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ rounding: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_ss&expand=2176)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vdivss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sd&expand=2171)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vdivsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sd&expand=2172)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_div_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sd&expand=2173)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_div_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vdivsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
/// Equal
pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00;
/// Less-than
@@ -19299,6 +20015,23 @@ extern "C" {
fn vexpandps(a: f32x16, src: f32x16, mask: u16) -> f32x16;
#[link_name = "llvm.x86.avx512.mask.expand.pd.512"]
fn vexpandpd(a: f64x8, src: f64x8, mask: u8) -> f64x8;
+
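+    // LLVM intrinsics backing the scalar `*_round` wrappers above: `a` and `b` are the
+    // operands, `src` supplies the value merged into a masked-off lower lane, bit 0 of
+    // `mask` enables the computation, and `rounding` carries the rounding/SAE control.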
+ #[link_name = "llvm.x86.avx512.mask.add.ss.round"]
+ fn vaddss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.add.sd.round"]
+ fn vaddsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.sub.ss.round"]
+ fn vsubss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.sub.sd.round"]
+ fn vsubsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.mul.ss.round"]
+ fn vmulss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.mul.sd.round"]
+ fn vmulsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.div.ss.round"]
+ fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.div.sd.round"]
+ fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
}
#[cfg(test)]
@@ -28482,4 +29215,324 @@ mod tests {
let e = _mm_set_pd(1., 0.5);
assert_eq_m128d(r, e);
}
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_add_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
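+        // `_mm_set_ps` places its last argument in the lowest lane, so the lane that is
+        // added is 20. + 40. = 60.; the upper three lanes are copied from `a`.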
+ let r = _mm_add_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 60.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_add_round_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_add_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_add_round_ss(
+ src,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_ps(1., 2., 10., 60.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_add_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_add_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_add_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 60.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_add_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_add_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 6.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_add_round_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_add_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_add_round_sd(
+ src,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_pd(1., 6.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_add_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_add_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_add_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 6.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_sub_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_sub_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., -20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sub_round_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_sub_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_sub_round_ss(
+ src,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_ps(1., 2., 10., -20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sub_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_sub_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_sub_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., -20.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_sub_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_sub_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sub_round_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_sub_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_sub_round_sd(
+ src,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_pd(1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sub_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_sub_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_sub_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., -2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mul_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mul_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 800.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_mul_round_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_mul_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_mul_round_ss(
+ src,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_ps(1., 2., 10., 800.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_mul_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_mul_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_mul_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 800.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mul_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mul_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_mul_round_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_mul_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_mul_round_sd(
+ src,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_mul_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_mul_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_mul_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_div_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_div_round_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_div_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_div_round_ss(
+ src,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_div_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_div_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_div_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_div_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_div_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_div_round_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_div_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_div_round_sd(
+ src,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_div_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_div_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_div_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
}
diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs
index 41cbd5029a..80dad4d64e 100644
--- a/crates/core_arch/src/x86/mod.rs
+++ b/crates/core_arch/src/x86/mod.rs
@@ -451,6 +451,24 @@ impl m128Ext for __m128 {
}
}
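+// `m128dExt` mirrors the `m128Ext` helper above: it lets the AVX-512 scalar wrappers
+// convert a `__m128d` into the `f64x2` vector expected by the LLVM intrinsics, e.g.
+// `a.as_f64x2()` inside `_mm_add_round_sd`.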
+#[allow(non_camel_case_types)]
+#[unstable(feature = "stdsimd_internal", issue = "none")]
+pub(crate) trait m128dExt: Sized {
+ fn as_m128d(self) -> __m128d;
+
+ #[inline]
+ fn as_f64x2(self) -> crate::core_arch::simd::f64x2 {
+ unsafe { transmute(self.as_m128d()) }
+ }
+}
+
+impl m128dExt for __m128d {
+ #[inline]
+ fn as_m128d(self) -> Self {
+ self
+ }
+}
+
#[allow(non_camel_case_types)]
#[unstable(feature = "stdsimd_internal", issue = "none")]
pub(crate) trait m256Ext: Sized {
From 11b2b65badfcba141800a9486ade247a6768bb23 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Thu, 15 Oct 2020 16:06:53 +0000
Subject: [PATCH 18/25] mask_sqrt: ss,sd; sqrt_round: ss,sd;
---
crates/core_arch/avx512f.md | 20 +-
crates/core_arch/src/x86/avx512f.rs | 377 ++++++++++++++++++++++++++++
2 files changed, 387 insertions(+), 10 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index cd93fa26b9..6224ef09c2 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -1310,10 +1310,10 @@
* [ ] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236)
* [ ] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236)
* [ ] [`_mm_mask_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ss&expand=5236)
- * [ ] [`_mm_mask_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sd&expand=5236)
- * [ ] [`_mm_mask_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_ss&expand=5236)
- * [ ] [`_mm_mask_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sd&expand=5236)
- * [ ] [`_mm_mask_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ss&expand=5236)
+ * [x] [`_mm_mask_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sd&expand=5236)
+ * [x] [`_mm_mask_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_ss&expand=5236)
+ * [x] [`_mm_mask_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sd&expand=5236)
+ * [x] [`_mm_mask_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ss&expand=5236)
* [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd&expand=5236)
* [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss&expand=5236)
* [x] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236)
@@ -1388,10 +1388,10 @@
* [ ] [`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236)
* [ ] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236)
* [ ] [`_mm_maskz_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ss&expand=5236)
- * [ ] [`_mm_maskz_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sd&expand=5236)
- * [ ] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236)
- * [ ] [`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236)
- * [ ] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236)
+ * [x] [`_mm_maskz_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sd&expand=5236)
+ * [x] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236)
+ * [x] [`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236)
+ * [x] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236)
* [x] [`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236)
* [x] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236)
* [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236)
@@ -1414,8 +1414,8 @@
* [ ] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236)
* [ ] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236)
* [ ] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236)
- * [ ] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236)
- * [ ] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236)
+ * [x] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236)
+ * [x] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236)
* [x] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236)
* [x] [`_mm_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_ss&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 4294ec92b5..6ec24d2a00 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -18697,6 +18697,70 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
transmute(r)
}
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ss&expand=5387)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtss))]
+pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vsqrtss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ src.as_f32x4(),
+ k,
+        _MM_FROUND_CUR_DIRECTION,
+ ))
+}
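+// Usage sketch (illustrative values): with bit 0 of `k` set the lower lane of the result
+// is the square root of `b`'s lower lane; with it clear the lane is copied from `src`:
+//     let r = _mm_mask_sqrt_ss(src, 0b1, a, b);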
+
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5388)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtss))]
+pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vsqrtss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+        _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sd&expand=5384)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtsd))]
+pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vsqrtsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+        _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5385)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtsd))]
+pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vsqrtsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+        _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
@@ -19413,6 +19477,185 @@ pub unsafe fn _mm_maskz_div_round_sd(
transmute(constify_imm4_round!(rounding, call))
}
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5383)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsqrtss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_sqrt_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ rounding: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsqrtss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsqrtsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_sqrt_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_sqrt_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsqrtsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
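
All of the *_round_* variants above take the rounding mode as their last argument; it must be a constant expression built from the _MM_FROUND_* flags, which constify_imm4_round! lowers to the 4-bit immediate encoded in the instruction. A small sketch (illustrative only, assuming an AVX-512F target) of selecting a per-instruction rounding mode instead of relying on MXCSR:

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn sqrt_low_lane_round_up(a: __m128, b: __m128) -> __m128 {
    // Round the low-lane square root toward +infinity and suppress exceptions,
    // regardless of the current MXCSR rounding mode.
    _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)
}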
+
/// Equal
pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00;
/// Less-than
@@ -20032,6 +20275,10 @@ extern "C" {
fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
#[link_name = "llvm.x86.avx512.mask.div.sd.round"]
fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.sqrt.ss"]
+ fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.sqrt.sd"]
+ fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
}
#[cfg(test)]
@@ -29216,6 +29463,56 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sqrt_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_mask_sqrt_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_sqrt_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sqrt_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_maskz_sqrt_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_sqrt_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sqrt_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_sqrt_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_sqrt_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sqrt_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_sqrt_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_sqrt_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm_add_round_ss() {
let a = _mm_set_ps(1., 2., 10., 20.);
@@ -29535,4 +29832,84 @@ mod tests {
let e = _mm_set_pd(1., 0.5);
assert_eq_m128d(r, e);
}
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_sqrt_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sqrt_round_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_mask_sqrt_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_sqrt_round_ss(
+ src,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_ps(1., 2., 10., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sqrt_round_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_maskz_sqrt_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_sqrt_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 2., 10., 2.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_sqrt_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_sqrt_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_sqrt_round_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_sqrt_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_sqrt_round_sd(
+ src,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_sqrt_round_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_sqrt_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_sqrt_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 2.);
+ assert_eq_m128d(r, e);
+ }
}
From 1afd1b92d9b2f588338dd9e1fdf8e66ab8841d01 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Thu, 15 Oct 2020 16:37:19 +0000
Subject: [PATCH 19/25] mask_max: ss,sd; mask_min: ss,sd; max_round: ss,sd;
min_round: ss,sd;
---
crates/core_arch/avx512f.md | 44 +-
crates/core_arch/src/x86/avx512f.rs | 640 ++++++++++++++++++++++++++++
2 files changed, 662 insertions(+), 22 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 6224ef09c2..701b65d560 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -1284,18 +1284,18 @@
* [ ] [`_mm_mask_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ss&expand=5236)
* [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd&expand=5236)
* [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss&expand=5236)
- * [ ] [`_mm_mask_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sd&expand=5236)
- * [ ] [`_mm_mask_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_ss&expand=5236)
- * [ ] [`_mm_mask_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sd&expand=5236)
- * [ ] [`_mm_mask_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ss&expand=5236)
- * [ ] [`_mm_mask_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sd&expand=5236)
- * [ ] [`_mm_mask_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_ss&expand=5236)
- * [ ] [`_mm_mask_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sd&expand=5236)
- * [ ] [`_mm_mask_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ss&expand=5236)
+ * [x] [`_mm_mask_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sd&expand=5236)
+ * [x] [`_mm_mask_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_ss&expand=5236)
+ * [x] [`_mm_mask_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sd&expand=5236)
+ * [x] [`_mm_mask_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ss&expand=5236)
+ * [x] [`_mm_mask_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sd&expand=5236)
+ * [x] [`_mm_mask_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_ss&expand=5236)
+ * [x] [`_mm_mask_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sd&expand=5236)
+ * [x] [`_mm_mask_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ss&expand=5236)
* [ ] [`_mm_mask_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sd&expand=5236)
* [ ] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236)
- * [ ] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236)
- * [ ] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236)
+ * [x] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236)
+ * [x] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236)
* [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236)
* [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236)
* [ ] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236)
@@ -1362,14 +1362,14 @@
* [ ] [`_mm_maskz_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ss&expand=5236)
* [ ] [`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd&expand=5236)
* [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss&expand=5236)
- * [ ] [`_mm_maskz_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sd&expand=5236)
- * [ ] [`_mm_maskz_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_ss&expand=5236)
- * [ ] [`_mm_maskz_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sd&expand=5236)
- * [ ] [`_mm_maskz_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ss&expand=5236)
- * [ ] [`_mm_maskz_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sd&expand=5236)
- * [ ] [`_mm_maskz_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_ss&expand=5236)
- * [ ] [`_mm_maskz_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sd&expand=5236)
- * [ ] [`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236)
+ * [x] [`_mm_maskz_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sd&expand=5236)
+ * [x] [`_mm_maskz_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_ss&expand=5236)
+ * [x] [`_mm_maskz_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sd&expand=5236)
+ * [x] [`_mm_maskz_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ss&expand=5236)
+ * [x] [`_mm_maskz_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sd&expand=5236)
+ * [x] [`_mm_maskz_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_ss&expand=5236)
+ * [x] [`_mm_maskz_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sd&expand=5236)
+ * [x] [`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236)
* [ ] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236)
* [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236)
* [x] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236)
@@ -1396,10 +1396,10 @@
* [x] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236)
* [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236)
* [x] [`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236)
- * [ ] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236)
- * [ ] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236)
- * [ ] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236)
- * [ ] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236)
+ * [x] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236)
+ * [x] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236)
+ * [x] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236)
+ * [x] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236)
* [x] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236)
* [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236)
* [ ] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 6ec24d2a00..185fba00e9 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -18697,6 +18697,134 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
transmute(r)
}
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss))]
+pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vmaxss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ src.as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_ss&expand=3673)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss))]
+pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vmaxss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_sd&expand=3669)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd))]
+pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vmaxsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd))]
+pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vmaxsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_ss&expand=3786)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss))]
+pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vminss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ src.as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_ss&expand=3787)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss))]
+pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vminss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_sd&expand=3783)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd))]
+pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vminsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_sd&expand=3784)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd))]
+pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vminsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
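
As with the masked sqrt intrinsics, the masked max/min pairs above differ only in where the low lane comes from when mask bit 0 is clear (src versus zero). One illustrative use, not part of the patch and assuming an AVX-512F target, is clamping just the low lane while leaving the upper lanes of x untouched:

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn clamp_low_lane(x: __m128, lo: __m128, hi: __m128) -> __m128 {
    // With mask bit 0 set both calls update the low lane; the upper three
    // lanes are always copied from the first data operand.
    let capped = _mm_mask_min_ss(x, 0b1, x, hi); // low lane = min(x0, hi0)
    _mm_mask_max_ss(capped, 0b1, capped, lo) // low lane = max(min(x0, hi0), lo0)
}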
+
/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387)
@@ -19477,6 +19605,282 @@ pub unsafe fn _mm_maskz_div_round_sd(
transmute(constify_imm4_round!(rounding, call))
}
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmaxss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_ss&expand=3672)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_max_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ sae: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmaxss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmaxsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_max_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ sae: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_sd&expand=3670)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vmaxsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vminss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_ss&expand=3780)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_min_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ sae: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vminss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vminsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_min_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ sae: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_sd&expand=3778)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vminsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
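
Unlike the rounding argument of the arithmetic *_round_* intrinsics, the sae argument of these max/min variants only controls exception suppression; it never changes the numeric result. A hedged sketch (assuming an AVX-512F target) of a quiet low-lane maximum:

#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn quiet_max_low_lane(a: __m128, b: __m128) -> __m128 {
    // _MM_FROUND_NO_EXC suppresses floating-point exceptions for this one
    // instruction; _MM_FROUND_CUR_DIRECTION (as used in the tests) leaves
    // exception reporting as configured in MXCSR.
    _mm_max_round_ss(a, b, _MM_FROUND_NO_EXC)
}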
+
/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
@@ -20275,6 +20679,14 @@ extern "C" {
fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
#[link_name = "llvm.x86.avx512.mask.div.sd.round"]
fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.max.ss.round"]
+ fn vmaxss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.max.sd.round"]
+ fn vmaxsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.min.ss.round"]
+ fn vminss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.min.sd.round"]
+ fn vminsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
#[link_name = "llvm.x86.avx512.mask.sqrt.ss"]
fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
#[link_name = "llvm.x86.avx512.mask.sqrt.sd"]
@@ -29463,6 +29875,102 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_max_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_mask_max_ss(a, 0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_max_ss(a, 0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_max_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_maskz_max_ss(0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_max_ss(0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_max_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_mask_max_sd(a, 0, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_max_sd(a, 0b11111111, a, b);
+ let e = _mm_set_pd(0., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_max_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_maskz_max_sd(0, a, b);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_max_sd(0b11111111, a, b);
+ let e = _mm_set_pd(0., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_min_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_mask_min_ss(a, 0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_min_ss(a, 0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_min_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_maskz_min_ss(0, a, b);
+ let e = _mm_set_ps(0., 1., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_min_ss(0b11111111, a, b);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_min_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_mask_min_sd(a, 0, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_min_sd(a, 0b11111111, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_min_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_maskz_min_sd(0, a, b);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_min_sd(0b11111111, a, b);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm_mask_sqrt_ss() {
let src = _mm_set_ps(10., 11., 100., 110.);
@@ -29833,6 +30341,138 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_max_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_max_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(0., 1., 2., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_max_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_mask_max_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_max_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(0., 1., 2., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_max_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_maskz_max_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(0., 1., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_max_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(0., 1., 2., 7.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_max_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_max_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(0., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_max_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_mask_max_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_max_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(0., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_max_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_maskz_max_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_max_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(0., 3.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_min_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_min_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_min_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_mask_min_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_min_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_min_round_ss() {
+ let a = _mm_set_ps(0., 1., 2., 3.);
+ let b = _mm_set_ps(4., 5., 6., 7.);
+ let r = _mm_maskz_min_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(0., 1., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_min_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(0., 1., 2., 3.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_min_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_min_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_min_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_mask_min_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_min_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_min_round_sd() {
+ let a = _mm_set_pd(0., 1.);
+ let b = _mm_set_pd(2., 3.);
+ let r = _mm_maskz_min_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(0., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_min_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(0., 1.);
+ assert_eq_m128d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm_sqrt_round_ss() {
let a = _mm_set_ps(1., 2., 10., 20.);
From eb58b50a418d547b0a1c518641568e0fa6aaf1fa Mon Sep 17 00:00:00 2001
From: jironglin
Date: Thu, 15 Oct 2020 17:20:07 +0000
Subject: [PATCH 20/25] rsqrt14: ss,sd; rcp14: ss,sd
---
crates/core_arch/avx512f.md | 24 +--
crates/core_arch/src/x86/avx512f.rs | 305 ++++++++++++++++++++++++++++
2 files changed, 317 insertions(+), 12 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 701b65d560..6483ea7460 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -1298,14 +1298,14 @@
* [x] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236)
* [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236)
* [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236)
- * [ ] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236)
- * [ ] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236)
+ * [x] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236)
+ * [x] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236)
* [ ] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236)
* [ ] [`_mm_mask_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_ss&expand=5236)
* [ ] [`_mm_mask_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sd&expand=5236)
* [ ] [`_mm_mask_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ss&expand=5236)
- * [ ] [`_mm_mask_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_sd&expand=5236)
- * [ ] [`_mm_mask_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ss&expand=5236)
+ * [x] [`_mm_mask_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_sd&expand=5236)
+ * [x] [`_mm_mask_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ss&expand=5236)
* [ ] [`_mm_mask_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sd&expand=5236)
* [ ] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236)
* [ ] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236)
@@ -1376,14 +1376,14 @@
* [x] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236)
* [x] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236)
* [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236)
- * [ ] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236)
- * [ ] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236)
+ * [x] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236)
+ * [x] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236)
* [ ] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236)
* [ ] [`_mm_maskz_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_ss&expand=5236)
* [ ] [`_mm_maskz_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sd&expand=5236)
* [ ] [`_mm_maskz_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ss&expand=5236)
- * [ ] [`_mm_maskz_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_sd&expand=5236)
- * [ ] [`_mm_maskz_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ss&expand=5236)
+ * [x] [`_mm_maskz_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_sd&expand=5236)
+ * [x] [`_mm_maskz_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ss&expand=5236)
* [ ] [`_mm_maskz_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sd&expand=5236)
* [ ] [`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236)
* [ ] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236)
@@ -1402,14 +1402,14 @@
* [x] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236)
* [x] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236)
* [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236)
- * [ ] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236)
- * [ ] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236)
+ * [x] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236)
+ * [x] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236)
* [ ] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236)
* [ ] [`_mm_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_ss&expand=5236)
* [ ] [`_mm_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sd&expand=5236)
* [ ] [`_mm_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ss&expand=5236)
- * [ ] [`_mm_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_sd&expand=5236)
- * [ ] [`_mm_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ss&expand=5236)
+ * [x] [`_mm_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_sd&expand=5236)
+ * [x] [`_mm_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ss&expand=5236)
* [ ] [`_mm_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sd&expand=5236)
* [ ] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236)
* [ ] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 185fba00e9..8c93b318dc 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -18889,6 +18889,166 @@ pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d
))
}
+/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_ss&expand=4825)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14ss))]
+pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 {
+ transmute(vrsqrt14ss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ ))
+}
+
+/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_ss&expand=4823)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14ss))]
+pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k))
+}
+
+/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_ss&expand=4824)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14ss))]
+pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vrsqrt14ss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ ))
+}
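As a usage illustration (not part of the patch), the sketch below calls the unmasked form above; it assumes a toolchain that exposes these AVX512F intrinsics (nightly-only at the time of this patch) and that the caller has already checked is_x86_feature_detected!("avx512f"):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn rsqrt14_ss_demo() -> f32 {
    let a = _mm_set_ps(1., 2., 10., 20.); // upper three lanes of the result come from `a`
    let b = _mm_set_ps(0., 0., 0., 4.);   // lower lane operand: 4.0
    let r = _mm_rsqrt14_ss(a, b);         // lower lane ~ 1/sqrt(4.0) = 0.5, error < 2^-14
    _mm_cvtss_f32(r)                      // extract the lower lane
}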
+
+/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_sd&expand=4822)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14sd))]
+pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrsqrt14sd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ ))
+}
+
+/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_sd&expand=4820)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14sd))]
+pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k))
+}
+
+/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_sd&expand=4821)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrsqrt14sd))]
+pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrsqrt14sd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ ))
+}
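A companion sketch (illustrative only, same assumptions as above) for the masked double-precision forms; mask bit 0 selects between the computed value, the src lane, and zero:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn rsqrt14_sd_mask_demo() -> (f64, f64) {
    let src = _mm_set_pd(10., 11.);
    let a = _mm_set_pd(1., 2.);
    let b = _mm_set_pd(3., 4.);
    let kept = _mm_mask_rsqrt14_sd(src, 0b0, a, b); // bit 0 clear: lower lane copied from src (11.0)
    let comp = _mm_maskz_rsqrt14_sd(0b1, a, b);     // bit 0 set: lower lane ~ 1/sqrt(4.0) = 0.5
    (_mm_cvtsd_f64(kept), _mm_cvtsd_f64(comp))
}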
+
+/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_ss&expand=4508)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14ss))]
+pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 {
+ transmute(vrcp14ss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ ))
+}
+
+/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_ss&expand=4506)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14ss))]
+pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k))
+}
+
+/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_ss&expand=4507)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14ss))]
+pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vrcp14ss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ ))
+}
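The reciprocal approximation follows the same pattern; a zero-mask sketch (illustrative, mirroring the values used in the tests further down):

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn rcp14_ss_demo() -> (f32, f32) {
    let a = _mm_set_ps(1., 2., 10., 20.);
    let b = _mm_set_ps(3., 4., 30., 4.);        // lower lane operand: 4.0
    let zeroed = _mm_maskz_rcp14_ss(0b0, a, b); // bit 0 clear: lower lane zeroed
    let approx = _mm_maskz_rcp14_ss(0b1, a, b); // bit 0 set: lower lane ~ 1/4.0 = 0.25
    (_mm_cvtss_f32(zeroed), _mm_cvtss_f32(approx))
}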
+
+/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_sd&expand=4505)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14sd))]
+pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrcp14sd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ ))
+}
+
+/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_sd&expand=4503)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14sd))]
+pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k))
+}
+
+/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_sd&expand=4504)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrcp14sd))]
+pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vrcp14sd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ ))
+}
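And the unmasked double-precision form, as a minimal sketch under the same assumptions:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn rcp14_sd_demo() -> f64 {
    let a = _mm_set_pd(1., 2.);
    let b = _mm_set_pd(3., 4.);  // lower lane operand: 4.0
    let r = _mm_rcp14_sd(a, b);  // lower lane ~ 1/4.0 = 0.25, upper lane = 1.0 (copied from `a`)
    _mm_cvtsd_f64(r)
}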
+
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
@@ -20691,6 +20851,15 @@ extern "C" {
fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
#[link_name = "llvm.x86.avx512.mask.sqrt.sd"]
fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.rsqrt14.ss"]
+ fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
+ #[link_name = "llvm.x86.avx512.rsqrt14.sd"]
+ fn vrsqrt14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;
+ #[link_name = "llvm.x86.avx512.rcp14.ss"]
+ fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
+ #[link_name = "llvm.x86.avx512.rcp14.sd"]
+ fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;
}
#[cfg(test)]
@@ -30021,6 +30190,142 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_rsqrt14_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_rsqrt14_ss(a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_rsqrt14_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_mask_rsqrt14_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_rsqrt14_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_rsqrt14_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_maskz_rsqrt14_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_rsqrt14_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.5);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_rsqrt14_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_rsqrt14_sd(a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_rsqrt14_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_rsqrt14_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_rsqrt14_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_rsqrt14_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_rsqrt14_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_rsqrt14_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.5);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_rcp14_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_rcp14_ss(a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_rcp14_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_mask_rcp14_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_rcp14_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_rcp14_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 4.);
+ let r = _mm_maskz_rcp14_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_rcp14_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_rcp14_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_rcp14_sd(a, b);
+ let e = _mm_set_pd(1., 0.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_rcp14_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_rcp14_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_rcp14_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_rcp14_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_rcp14_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_rcp14_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 0.25);
+ assert_eq_m128d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm_add_round_ss() {
let a = _mm_set_ps(1., 2., 10., 20.);
From b8a68500fcd74c239820a35023ccfab809ed9f99 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Thu, 15 Oct 2020 18:08:10 +0000
Subject: [PATCH 21/25] getexp: ss,sd; getexp_round: ss,sd; getmant: ss,sd;
getmant_round: ss,sd;
---
crates/core_arch/avx512f.md | 48 +-
crates/core_arch/src/x86/avx512f.rs | 1896 +++++++++++++++++++++------
2 files changed, 1507 insertions(+), 437 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 6483ea7460..5b2aaa1c21 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -1214,14 +1214,14 @@
* [ ] [`_mm_fnmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_ss&expand=5236)
* [ ] [`_mm_fnmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sd&expand=5236)
* [ ] [`_mm_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_ss&expand=5236)
- * [ ] [`_mm_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sd&expand=5236)
- * [ ] [`_mm_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_ss&expand=5236)
- * [ ] [`_mm_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sd&expand=5236)
- * [ ] [`_mm_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ss&expand=5236)
- * [ ] [`_mm_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sd&expand=5236)
- * [ ] [`_mm_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_ss&expand=5236)
- * [ ] [`_mm_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sd&expand=5236)
- * [ ] [`_mm_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ss&expand=5236)
+ * [x] [`_mm_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sd&expand=5236)
+ * [x] [`_mm_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_ss&expand=5236)
+ * [x] [`_mm_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sd&expand=5236)
+ * [x] [`_mm_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ss&expand=5236)
+ * [x] [`_mm_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sd&expand=5236)
+ * [x] [`_mm_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_ss&expand=5236)
+ * [x] [`_mm_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sd&expand=5236)
+ * [x] [`_mm_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ss&expand=5236)
* [ ] [`_mm_mask3_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sd&expand=5236)
* [ ] [`_mm_mask3_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_ss&expand=5236)
* [ ] [`_mm_mask3_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sd&expand=5236)
@@ -1274,14 +1274,14 @@
* [ ] [`_mm_mask_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_ss&expand=5236)
* [ ] [`_mm_mask_fnmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sd&expand=5236)
* [ ] [`_mm_mask_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ss&expand=5236)
- * [ ] [`_mm_mask_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sd&expand=5236)
- * [ ] [`_mm_mask_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_ss&expand=5236)
- * [ ] [`_mm_mask_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sd&expand=5236)
- * [ ] [`_mm_mask_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ss&expand=5236)
- * [ ] [`_mm_mask_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sd&expand=5236)
- * [ ] [`_mm_mask_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_ss&expand=5236)
- * [ ] [`_mm_mask_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sd&expand=5236)
- * [ ] [`_mm_mask_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ss&expand=5236)
+ * [x] [`_mm_mask_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sd&expand=5236)
+ * [x] [`_mm_mask_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_ss&expand=5236)
+ * [x] [`_mm_mask_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sd&expand=5236)
+ * [x] [`_mm_mask_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ss&expand=5236)
+ * [x] [`_mm_mask_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sd&expand=5236)
+ * [x] [`_mm_mask_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_ss&expand=5236)
+ * [x] [`_mm_mask_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sd&expand=5236)
+ * [x] [`_mm_mask_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ss&expand=5236)
* [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd&expand=5236)
* [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss&expand=5236)
* [x] [`_mm_mask_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sd&expand=5236)
@@ -1352,14 +1352,14 @@
* [ ] [`_mm_maskz_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_ss&expand=5236)
* [ ] [`_mm_maskz_fnmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sd&expand=5236)
* [ ] [`_mm_maskz_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ss&expand=5236)
- * [ ] [`_mm_maskz_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sd&expand=5236)
- * [ ] [`_mm_maskz_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_ss&expand=5236)
- * [ ] [`_mm_maskz_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sd&expand=5236)
- * [ ] [`_mm_maskz_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ss&expand=5236)
- * [ ] [`_mm_maskz_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sd&expand=5236)
- * [ ] [`_mm_maskz_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_ss&expand=5236)
- * [ ] [`_mm_maskz_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sd&expand=5236)
- * [ ] [`_mm_maskz_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ss&expand=5236)
+ * [x] [`_mm_maskz_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sd&expand=5236)
+ * [x] [`_mm_maskz_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_ss&expand=5236)
+ * [x] [`_mm_maskz_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sd&expand=5236)
+ * [x] [`_mm_maskz_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ss&expand=5236)
+ * [x] [`_mm_maskz_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sd&expand=5236)
+ * [x] [`_mm_maskz_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_ss&expand=5236)
+ * [x] [`_mm_maskz_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sd&expand=5236)
+ * [x] [`_mm_maskz_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ss&expand=5236)
* [ ] [`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd&expand=5236)
* [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss&expand=5236)
* [x] [`_mm_maskz_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 8c93b318dc..24b6245e17 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -19049,6 +19049,342 @@ pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d
))
}
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_ss&expand=2862)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpss))]
+pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 {
+ transmute(vgetexpss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_ss&expand=2863)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpss))]
+pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vgetexpss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ src.as_f32x4(),
+ k,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_ss&expand=2864)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpss))]
+pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vgetexpss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_sd&expand=2859)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpsd))]
+pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d {
+ transmute(vgetexpsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_sd&expand=2860)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpsd))]
+pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vgetexpsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_NO_EXC,
+ ))
+}
+
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_sd&expand=2861)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetexpsd))]
+pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vgetexpsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_NO_EXC,
+ ))
+}
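An illustrative sketch of the scalar getexp forms just added (not part of the patch): the lower lane of the result is floor(log2(|b0|)) expressed as a float, and the remaining lane(s) are copied from `a`:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn getexp_demo() -> (f32, f64) {
    let a = _mm_set_ps(2., 2., 2., 2.);
    let b = _mm_set_ps(0., 0., 0., 8.);  // lower lane: 8.0
    let e_ss = _mm_getexp_ss(a, b);      // lower lane = floor(log2(8.0)) = 3.0
    let ad = _mm_set_pd(2., 2.);
    let bd = _mm_set_pd(0., 0.25);       // lower lane: 0.25
    let e_sd = _mm_getexp_sd(ad, bd);    // lower lane = floor(log2(0.25)) = -2.0
    (_mm_cvtss_f32(e_ss), _mm_cvtsd_f64(e_sd))
}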
+
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_ss&expand=2898)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))]
+#[rustc_args_required_const(2, 3)]
+pub unsafe fn _mm_getmant_ss(
+ a: __m128,
+ b: __m128,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4_1:expr, $imm2:expr) => {
+ vgetmantss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ $imm2 << 2 | $imm4_1,
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm4_mantissas!(norm, sign, call);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_ss&expand=2899)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))]
+#[rustc_args_required_const(4, 5)]
+pub unsafe fn _mm_mask_getmant_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4_1:expr, $imm2:expr) => {
+ vgetmantss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ $imm2 << 2 | $imm4_1,
+ src.as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm4_mantissas!(norm, sign, call);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_ss&expand=2900)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))]
+#[rustc_args_required_const(3, 4)]
+pub unsafe fn _mm_maskz_getmant_ss(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4_1:expr, $imm2:expr) => {
+ vgetmantss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ $imm2 << 2 | $imm4_1,
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm4_mantissas!(norm, sign, call);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_sd&expand=2895)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))]
+#[rustc_args_required_const(2, 3)]
+pub unsafe fn _mm_getmant_sd(
+ a: __m128d,
+ b: __m128d,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4_1:expr, $imm2:expr) => {
+ vgetmantsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ $imm2 << 2 | $imm4_1,
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm4_mantissas!(norm, sign, call);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_sd&expand=2896)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))]
+#[rustc_args_required_const(4, 5)]
+pub unsafe fn _mm_mask_getmant_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4_1:expr, $imm2:expr) => {
+ vgetmantsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ $imm2 << 2 | $imm4_1,
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm4_mantissas!(norm, sign, call);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_sd&expand=2897)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))]
+#[rustc_args_required_const(3, 4)]
+pub unsafe fn _mm_maskz_getmant_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4_1:expr, $imm2:expr) => {
+ vgetmantsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ $imm2 << 2 | $imm4_1,
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm4_mantissas!(norm, sign, call);
+ transmute(r)
+}
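A sketch of the getmant forms (illustrative): with the interval set to [1, 2) the lower lane of b is rescaled so 10.0 becomes 1.25, and the sign control keeps the source sign. The norm/sign arguments must be literal constants (rustc_args_required_const); the uppercase spellings below are assumed to be the crate's constant names for Intel's _MM_MANT_NORM_1_2 / _MM_MANT_SIGN_src values:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn getmant_demo() -> (f32, f64) {
    let a = _mm_set_ps(20., 20., 20., 20.);
    let b = _mm_set_ps(0., 0., 0., 10.);  // lower lane: 10.0 = 1.25 * 2^3
    let m_ss = _mm_getmant_ss(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); // lower lane = 1.25
    let ad = _mm_set_pd(20., 20.);
    let bd = _mm_set_pd(0., -10.);        // lower lane: -10.0
    let m_sd = _mm_getmant_sd(ad, bd, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); // lower lane = -1.25
    (_mm_cvtss_f32(m_ss), _mm_cvtsd_f64(m_sd))
}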
+
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
@@ -19058,15 +19394,373 @@ pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_add_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ rounding: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
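A minimal sketch of the rounded scalar add (illustrative, not part of the patch); the rounding argument must be a constant expression, here round-to-nearest with exceptions suppressed:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn add_round_ss_demo() -> f32 {
    let a = _mm_set_ps(1., 2., 10., 1.5);
    let b = _mm_set_ps(0., 0., 0., 2.25);
    let r = _mm_add_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    _mm_cvtss_f32(r) // lower lane = 1.5 + 2.25 = 3.75; upper lanes from `a`
}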
+
+/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_sd&expand=149)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_add_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_add_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vaddsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
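The masked rounded variants combine the mask semantics with the constant rounding mode; an illustrative zero-mask sketch under the same assumptions:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn add_round_sd_mask_demo() -> (f64, f64) {
    let a = _mm_set_pd(1., 1.5);
    let b = _mm_set_pd(9., 2.25);
    let zeroed = _mm_maskz_add_round_sd(0b0, a, b, _MM_FROUND_CUR_DIRECTION); // lower lane zeroed
    let summed = _mm_maskz_add_round_sd(0b1, a, b, _MM_FROUND_CUR_DIRECTION); // lower lane = 3.75
    (_mm_cvtsd_f64(zeroed), _mm_cvtsd_f64(summed))
}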
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_sub_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ rounding: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_sub_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
+
+/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_sub_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vsubsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ transmute(constify_imm4_round!(rounding, call))
+}
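A short sketch of the rounded scalar subtract forms (illustrative); the rounding mode only becomes observable when the exact result is not representable, so both calls below yield 3.25:

use std::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn sub_round_demo() -> (f32, f64) {
    let a = _mm_set_ps(0., 0., 0., 5.5);
    let b = _mm_set_ps(0., 0., 0., 2.25);
    let d_ss = _mm_sub_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); // 5.5 - 2.25
    let ad = _mm_set_pd(0., 5.5);
    let bd = _mm_set_pd(0., 2.25);
    let d_sd = _mm_sub_round_sd(ad, bd, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);      // 5.5 - 2.25
    (_mm_cvtss_f32(d_ss), _mm_cvtsd_f64(d_sd))
}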
+
+/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vaddss(
+ vmulss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19078,7 +19772,7 @@ pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
transmute(constify_imm4_round!(rounding, call))
}
-/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19087,12 +19781,12 @@ pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_add_round_ss(
+pub unsafe fn _mm_mask_mul_round_ss(
src: __m128,
k: __mmask8,
a: __m128,
@@ -19101,13 +19795,13 @@ pub unsafe fn _mm_mask_add_round_ss(
) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
};
}
transmute(constify_imm4_round!(rounding, call))
}
-/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19116,15 +19810,15 @@ pub unsafe fn _mm_mask_add_round_ss(
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vaddss(
+ vmulss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19136,7 +19830,7 @@ pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding
transmute(constify_imm4_round!(rounding, call))
}
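
The three _mm_mul_round_ss forms above only differ in how lane 0 is merged. A minimal usage sketch, not part of the patch (the demo function name is invented), assuming the intrinsics are reachable through core::arch::x86_64 on a nightly toolchain and that the CPU supports AVX512F:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn mul_round_ss_demo() -> (__m128, __m128) {
    let a = _mm_set_ps(4.0, 3.0, 2.0, 10.0); // lane 0 = 10.0
    let b = _mm_set_ps(0.0, 0.0, 0.0, 3.0);  // lane 0 = 3.0
    // Round to nearest and suppress exceptions: lane 0 of r is 30.0,
    // lanes 1..3 are copied from a.
    let r = _mm_mul_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    // The zeromask variant with mask bit 0 clear zeroes lane 0 instead.
    let z = _mm_maskz_mul_round_ss(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    (r, z)
}

The rounding argument has to be a constant expression because of #[rustc_args_required_const]; an OR of the _MM_FROUND_* constants satisfies that.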
-/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19145,15 +19839,15 @@ pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vaddsd(
+ vmulsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19165,7 +19859,7 @@ pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d
transmute(constify_imm4_round!(rounding, call))
}
-/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19174,12 +19868,12 @@ pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_add_round_sd(
+pub unsafe fn _mm_mask_mul_round_sd(
src: __m128d,
k: __mmask8,
a: __m128d,
@@ -19188,13 +19882,13 @@ pub unsafe fn _mm_mask_add_round_sd(
) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
};
}
transmute(constify_imm4_round!(rounding, call))
}
-/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19203,12 +19897,12 @@ pub unsafe fn _mm_mask_add_round_sd(
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_add_round_sd(
+pub unsafe fn _mm_maskz_mul_round_sd(
k: __mmask8,
a: __m128d,
b: __m128d,
@@ -19216,7 +19910,7 @@ pub unsafe fn _mm_maskz_add_round_sd(
) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vaddsd(
+ vmulsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19228,7 +19922,7 @@ pub unsafe fn _mm_maskz_add_round_sd(
transmute(constify_imm4_round!(rounding, call))
}
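
For the writemask variants, src supplies lane 0 whenever mask bit 0 is clear. A hedged sketch (illustration only, not patch content):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn mask_mul_round_sd_demo() -> __m128d {
    let src = _mm_set_pd(7.0, -1.0); // lane 0 = -1.0 is the fallback value
    let a = _mm_set_pd(2.0, 6.0);    // lane 0 = 6.0
    let b = _mm_set_pd(0.0, 0.5);    // lane 0 = 0.5
    // Mask bit 0 is set, so lane 0 of the result is 6.0 * 0.5 = 3.0;
    // with a mask of 0 it would be copied from src. Lane 1 comes from a.
    _mm_mask_mul_round_sd(src, 0b1, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
}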
-/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19237,15 +19931,15 @@ pub unsafe fn _mm_maskz_add_round_sd(
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vsubss(
+ vdivss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19257,7 +19951,7 @@ pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
transmute(constify_imm4_round!(rounding, call))
}
-/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19266,12 +19960,12 @@ pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_sub_round_ss(
+pub unsafe fn _mm_mask_div_round_ss(
src: __m128,
k: __mmask8,
a: __m128,
@@ -19280,13 +19974,13 @@ pub unsafe fn _mm_mask_sub_round_ss(
) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
};
}
transmute(constify_imm4_round!(rounding, call))
}
-/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19295,15 +19989,15 @@ pub unsafe fn _mm_mask_sub_round_ss(
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vsubss(
+ vdivss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19315,7 +20009,7 @@ pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding
transmute(constify_imm4_round!(rounding, call))
}
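
One point worth illustrating (sketch only, not part of the patch): the embedded rounding control rounds the floating-point quotient in its last bit of precision; it does not truncate the result to an integer.

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn div_round_ss_demo() -> __m128 {
    let a = _mm_set_ps(0.0, 0.0, 0.0, 1.0); // lane 0 = 1.0
    let b = _mm_set_ps(1.0, 1.0, 1.0, 3.0); // lane 0 = 3.0
    // Round toward zero and suppress exceptions: lane 0 holds 1.0 / 3.0
    // truncated at the last bit of the f32 result, roughly 0.3333333.
    _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
}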
-/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19324,15 +20018,15 @@ pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vsubsd(
+ vdivsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19344,7 +20038,7 @@ pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d
transmute(constify_imm4_round!(rounding, call))
}
-/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19353,12 +20047,12 @@ pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_sub_round_sd(
+pub unsafe fn _mm_mask_div_round_sd(
src: __m128d,
k: __mmask8,
a: __m128d,
@@ -19367,13 +20061,13 @@ pub unsafe fn _mm_mask_sub_round_sd(
) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
};
}
transmute(constify_imm4_round!(rounding, call))
}
-/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
@@ -19382,12 +20076,12 @@ pub unsafe fn _mm_mask_sub_round_sd(
/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_sub_round_sd(
+pub unsafe fn _mm_maskz_div_round_sd(
k: __mmask8,
a: __m128d,
b: __m128d,
@@ -19395,7 +20089,7 @@ pub unsafe fn _mm_maskz_sub_round_sd(
) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vsubsd(
+ vdivsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19407,24 +20101,18 @@ pub unsafe fn _mm_maskz_sub_round_sd(
transmute(constify_imm4_round!(rounding, call))
}
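
When no directed rounding is wanted, _MM_FROUND_CUR_DIRECTION defers to MXCSR.RC. A hypothetical zeromask example (not patch content):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn maskz_div_round_sd_demo() -> __m128d {
    let a = _mm_set_pd(9.0, 8.0); // lane 0 = 8.0
    let b = _mm_set_pd(1.0, 2.0); // lane 0 = 2.0
    // Mask bit 0 is clear, so lane 0 of the result is zeroed rather than 4.0;
    // lane 1 is taken from a. Rounding follows the current MXCSR setting.
    _mm_maskz_div_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION)
}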
-/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vmulss(
+ vmaxss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19433,56 +20121,44 @@ pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
-/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_ss&expand=3672)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_mul_round_ss(
+pub unsafe fn _mm_mask_max_round_ss(
src: __m128,
k: __mmask8,
a: __m128,
b: __m128,
- rounding: i32,
+ sae: i32,
) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
-/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vmulss(
+ vmaxss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19491,27 +20167,21 @@ pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding
)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
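
Unlike the arithmetic intrinsics above, the max/min family takes a sae argument: there is nothing to round, the immediate only controls whether floating-point exceptions are suppressed. A sketch under the same assumptions as the earlier examples:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn max_round_ss_demo() -> __m128 {
    let a = _mm_set_ps(3.0, 2.0, 1.0, -5.0); // lane 0 = -5.0
    let b = _mm_set_ps(0.0, 0.0, 0.0, 4.0);  // lane 0 = 4.0
    // Lane 0 of the result is max(-5.0, 4.0) = 4.0; lanes 1..3 come from a.
    _mm_max_round_ss(a, b, _MM_FROUND_NO_EXC)
}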
-/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vmulsd(
+ vmaxsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19519,62 +20189,45 @@ pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d
$imm4,
)
};
- }
- transmute(constify_imm4_round!(rounding, call))
-}
-
-/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+ }
+ transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_mul_round_sd(
+pub unsafe fn _mm_mask_max_round_sd(
src: __m128d,
k: __mmask8,
a: __m128d,
b: __m128d,
- rounding: i32,
+ sae: i32,
) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
-/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_sd&expand=3670)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_mul_round_sd(
- k: __mmask8,
- a: __m128d,
- b: __m128d,
- rounding: i32,
-) -> __m128d {
+pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vmulsd(
+ vmaxsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19583,27 +20236,21 @@ pub unsafe fn _mm_maskz_mul_round_sd(
)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
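
The double-precision zeromask form behaves the same way, operating on lane 0 only (illustrative sketch, not patch content):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn maskz_max_round_sd_demo() -> __m128d {
    let a = _mm_set_pd(1.5, -2.0); // lane 0 = -2.0
    let b = _mm_set_pd(0.0, 7.0);  // lane 0 = 7.0
    // Mask bit 0 is set, so lane 0 is max(-2.0, 7.0) = 7.0; a mask of 0 would zero it.
    _mm_maskz_max_round_sd(0b1, a, b, _MM_FROUND_NO_EXC)
}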
-/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vdivss(
+ vminss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19612,56 +20259,44 @@ pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
-/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_ss&expand=3780)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_div_round_ss(
+pub unsafe fn _mm_mask_min_round_ss(
src: __m128,
k: __mmask8,
a: __m128,
b: __m128,
- rounding: i32,
+ sae: i32,
) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
-/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vdivss(
+ vminss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19670,27 +20305,21 @@ pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding
)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
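
A writemask sketch for the minimum (hypothetical demo, not part of the patch); _MM_FROUND_CUR_DIRECTION leaves exception reporting enabled:

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn mask_min_round_ss_demo() -> __m128 {
    let src = _mm_set_ps(0.0, 0.0, 0.0, 99.0); // lane-0 fallback
    let a = _mm_set_ps(8.0, 7.0, 6.0, 2.5);    // lane 0 = 2.5
    let b = _mm_set_ps(0.0, 0.0, 0.0, -1.0);   // lane 0 = -1.0
    // Mask bit 0 is clear, so lane 0 is copied from src (99.0)
    // instead of min(2.5, -1.0); lanes 1..3 come from a.
    _mm_mask_min_round_ss(src, 0, a, b, _MM_FROUND_CUR_DIRECTION)
}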
-/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vdivsd(
+ vminsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19699,61 +20328,44 @@ pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d
)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
-/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_div_round_sd(
+pub unsafe fn _mm_mask_min_round_sd(
src: __m128d,
k: __mmask8,
a: __m128d,
b: __m128d,
- rounding: i32,
+ sae: i32,
) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
-/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_sd&expand=3778)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_div_round_sd(
- k: __mmask8,
- a: __m128d,
- b: __m128d,
- rounding: i32,
-) -> __m128d {
+pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vdivsd(
+ vminsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19762,21 +20374,27 @@ pub unsafe fn _mm_maskz_div_round_sd(
)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ transmute(constify_imm4_sae!(sae, call))
}
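
And the plain double-precision minimum (sketch only):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn min_round_sd_demo() -> __m128d {
    let a = _mm_set_pd(10.0, 3.0); // lane 0 = 3.0
    let b = _mm_set_pd(0.0, -4.5); // lane 0 = -4.5
    // Lane 0 of the result is min(3.0, -4.5) = -4.5; lane 1 is copied from a.
    _mm_min_round_sd(a, b, _MM_FROUND_NO_EXC)
}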
-/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668)
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vmaxss(
+ vsqrtss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19785,44 +20403,56 @@ pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ transmute(constify_imm4_round!(rounding, call))
}
-/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672)
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_max_round_ss(
+pub unsafe fn _mm_mask_sqrt_round_ss(
src: __m128,
k: __mmask8,
a: __m128,
b: __m128,
- sae: i32,
+ rounding: i32,
) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ transmute(constify_imm4_round!(rounding, call))
}
-/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667)
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vmaxss(
+ vsqrtss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19831,21 +20461,27 @@ pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32
)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ transmute(constify_imm4_round!(rounding, call))
}
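
Note that the square-root intrinsics take the operand from b, while a only supplies the upper lanes. A hedged sketch (not patch content):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn sqrt_round_ss_demo() -> __m128 {
    let a = _mm_set_ps(3.0, 2.0, 1.0, 0.0); // lanes 1..3 of the result come from a
    let b = _mm_set_ps(0.0, 0.0, 0.0, 2.0); // the square root is taken from lane 0 of b
    // Lane 0 becomes sqrt(2.0), rounded to nearest with exceptions suppressed.
    _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
}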
-/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665)
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vmaxsd(
+ vsqrtsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19854,44 +20490,61 @@ pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ transmute(constify_imm4_round!(rounding, call))
}
-/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663)
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_max_round_sd(
+pub unsafe fn _mm_mask_sqrt_round_sd(
src: __m128d,
k: __mmask8,
a: __m128d,
b: __m128d,
- sae: i32,
+ rounding: i32,
) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ transmute(constify_imm4_round!(rounding, call))
}
-/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670)
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+pub unsafe fn _mm_maskz_sqrt_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ rounding: i32,
+) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vmaxsd(
+ vsqrtsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19900,21 +20553,21 @@ pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i
)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ transmute(constify_imm4_round!(rounding, call))
}
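
The masked double-precision square root follows the same lane rules (illustration only):

use core::arch::x86_64::*;

#[target_feature(enable = "avx512f")]
unsafe fn maskz_sqrt_round_sd_demo() -> __m128d {
    let a = _mm_set_pd(5.0, 0.0); // lane 1 of the result comes from a
    let b = _mm_set_pd(0.0, 9.0); // sqrt is taken from lane 0 of b
    // Mask bit 0 is set: lane 0 is sqrt(9.0) = 3.0; with a mask of 0 it would be zeroed.
    _mm_maskz_sqrt_round_sd(0b1, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
}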
-/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vminss(
+ vgetexpss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19923,18 +20576,19 @@ pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ let r = constify_imm4_sae!(sae, call);
+ transmute(r)
}
-/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_min_round_ss(
+pub unsafe fn _mm_mask_getexp_round_ss(
src: __m128,
k: __mmask8,
a: __m128,
@@ -19943,24 +20597,25 @@ pub unsafe fn _mm_mask_min_round_ss(
) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ vgetexpss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ let r = constify_imm4_sae!(sae, call);
+ transmute(r)
}
-/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+pub unsafe fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
macro_rules! call {
($imm4:expr) => {
- vminss(
+ vgetexpss(
a.as_f32x4(),
b.as_f32x4(),
_mm_setzero_ps().as_f32x4(),
@@ -19969,21 +20624,22 @@ pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32
)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ let r = constify_imm4_sae!(sae, call);
+ transmute(r)
}
-/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst.
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))]
#[rustc_args_required_const(2)]
-pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vminsd(
+ vgetexpsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -19992,18 +20648,19 @@ pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ let r = constify_imm4_sae!(sae, call);
+ transmute(r)
}
-/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))]
#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_min_round_sd(
+pub unsafe fn _mm_mask_getexp_round_sd(
src: __m128d,
k: __mmask8,
a: __m128d,
@@ -20012,24 +20669,25 @@ pub unsafe fn _mm_mask_min_round_sd(
) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ vgetexpsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ let r = constify_imm4_sae!(sae, call);
+ transmute(r)
}
-/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))]
#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+pub unsafe fn _mm_maskz_getexp_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
macro_rules! call {
($imm4:expr) => {
- vminsd(
+ vgetexpsd(
a.as_f64x2(),
b.as_f64x2(),
_mm_setzero_pd().as_f64x2(),
@@ -20038,186 +20696,254 @@ pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i
)
};
}
- transmute(constify_imm4_sae!(sae, call))
+ let r = constify_imm4_sae!(sae, call);
+ transmute(r)
}
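// Illustrative sketch (helper name is ours): the getexp family returns
// floor(log2(|b[0]|)) as a float in the lower lane, with the remaining lanes
// taken from `a`; `sae` only controls exception suppression, not the value.
unsafe fn getexp_round_sketch() {
    let a = _mm_set1_pd(2.);
    let b = _mm_set1_pd(3.);
    let r = _mm_getexp_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
    assert_eq!(_mm_cvtsd_f64(r), 1.0); // floor(log2(3.0)) = 1.0
}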
-/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_ss&expand=2892)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
-#[rustc_args_required_const(2)]
-pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))]
+#[rustc_args_required_const(2, 3, 4)]
+pub unsafe fn _mm_getmant_round_ss(
+ a: __m128,
+ b: __m128,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+ sae: i32,
+) -> __m128 {
macro_rules! call {
- ($imm4:expr) => {
- vsqrtss(
+ ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
+ vgetmantss(
a.as_f32x4(),
b.as_f32x4(),
+ $imm2 << 2 | $imm4_1,
_mm_setzero_ps().as_f32x4(),
0b1,
- $imm4,
+ $imm4_2,
)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ transmute(r)
}
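// The `call!` macro above packs the two getmant controls into the single
// immediate expected by the vgetmant* builtins: bits 1:0 carry the
// normalization interval and bits 3:2 carry the sign control, i.e.
// `sign << 2 | norm`. A plain-Rust sketch of that packing (function name is
// ours; the _MM_MANT_* constants themselves are defined elsewhere in this crate):
fn getmant_imm(norm: i32, sign: i32) -> i32 {
    debug_assert!(norm <= 0b11 && sign <= 0b11);
    (sign << 2) | norm
}
// With the Intel encodings, interval [1, 2) and sign-of-source are both 0, so
// the default combination packs to 0.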
-/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_ss&expand=2893)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
-#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_sqrt_round_ss(
+#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))]
+#[rustc_args_required_const(4, 5, 6)]
+pub unsafe fn _mm_mask_getmant_round_ss(
src: __m128,
k: __mmask8,
a: __m128,
b: __m128,
- rounding: i32,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+ sae: i32,
) -> __m128 {
macro_rules! call {
- ($imm4:expr) => {
- vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
+ vgetmantss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ $imm2 << 2 | $imm4_1,
+ src.as_f32x4(),
+ k,
+ $imm4_2,
+ )
};
}
- transmute(constify_imm4_round!(rounding, call))
+ let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ transmute(r)
}
-/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_ss&expand=2894)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
-#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
+#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))]
+#[rustc_args_required_const(3, 4, 5)]
+pub unsafe fn _mm_maskz_getmant_round_ss(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+ sae: i32,
+) -> __m128 {
macro_rules! call {
- ($imm4:expr) => {
- vsqrtss(
+ ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
+ vgetmantss(
a.as_f32x4(),
b.as_f32x4(),
+ $imm2 << 2 | $imm4_1,
_mm_setzero_ps().as_f32x4(),
k,
- $imm4,
+ $imm4_2,
)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ transmute(r)
}
-/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
-#[rustc_args_required_const(2)]
-pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
+#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
+#[rustc_args_required_const(2, 3, 4)]
+pub unsafe fn _mm_getmant_round_sd(
+ a: __m128d,
+ b: __m128d,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+ sae: i32,
+) -> __m128d {
macro_rules! call {
- ($imm4:expr) => {
- vsqrtsd(
+ ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
+ vgetmantsd(
a.as_f64x2(),
b.as_f64x2(),
+ $imm2 << 2 | $imm4_1,
_mm_setzero_pd().as_f64x2(),
0b1,
- $imm4,
+ $imm4_2,
)
- };
- }
- transmute(constify_imm4_round!(rounding, call))
-}
-
-/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+ };
+ }
+ let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
-#[rustc_args_required_const(4)]
-pub unsafe fn _mm_mask_sqrt_round_sd(
+#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
+#[rustc_args_required_const(4, 5, 6)]
+pub unsafe fn _mm_mask_getmant_round_sd(
src: __m128d,
k: __mmask8,
a: __m128d,
b: __m128d,
- rounding: i32,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+ sae: i32,
) -> __m128d {
macro_rules! call {
- ($imm4:expr) => {
- vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+ ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
+ vgetmantsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ $imm2 << 2 | $imm4_1,
+ src.as_f64x2(),
+ k,
+ $imm4_2,
+ )
};
}
- transmute(constify_imm4_round!(rounding, call))
+ let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ transmute(r)
}
-/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
-///
-/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
-/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
-/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
-/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
-/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
-/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
-#[rustc_args_required_const(3)]
-pub unsafe fn _mm_maskz_sqrt_round_sd(
+#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
+#[rustc_args_required_const(3, 4, 5)]
+pub unsafe fn _mm_maskz_getmant_round_sd(
k: __mmask8,
a: __m128d,
b: __m128d,
- rounding: i32,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+ sae: i32,
) -> __m128d {
macro_rules! call {
- ($imm4:expr) => {
- vsqrtsd(
+ ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
+ vgetmantsd(
a.as_f64x2(),
b.as_f64x2(),
+ $imm2 << 2 | $imm4_1,
_mm_setzero_pd().as_f64x2(),
k,
- $imm4,
+ $imm4_2,
)
};
}
- transmute(constify_imm4_round!(rounding, call))
+ let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ transmute(r)
}
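// Worked example of the normalization performed above (helper name is ours):
// 10.0 = 1.25 * 2^3, so with the [1, 2) interval and sign-of-source control the
// lower lane comes back as 1.25; the upper lane is copied from `a`.
unsafe fn getmant_round_sketch() {
    let a = _mm_set1_pd(20.);
    let b = _mm_set1_pd(10.);
    let r = _mm_getmant_round_sd(
        a,
        b,
        _MM_MANT_NORM_1_2,
        _MM_MANT_SIGN_SRC,
        _MM_FROUND_CUR_DIRECTION,
    );
    assert_eq!(_mm_cvtsd_f64(r), 1.25);
}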
/// Equal
@@ -20851,6 +21577,14 @@ extern "C" {
fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
#[link_name = "llvm.x86.avx512.mask.sqrt.sd"]
fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.getexp.ss"]
+ fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.getexp.sd"]
+ fn vgetexpsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.getmant.ss"]
+ fn vgetmantss(a: f32x4, b: f32x4, mantissas: i32, src: f32x4, m: u8, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.getmant.sd"]
+ fn vgetmantsd(a: f64x2, b: f64x2, mantissas: i32, src: f64x2, m: u8, sae: i32) -> f64x2;
#[link_name = "llvm.x86.avx512.rsqrt14.ss"]
fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
@@ -30326,6 +31060,138 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getexp_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_getexp_ss(a, b);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getexp_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_mask_getexp_ss(a, 0, a, b);
+ let e = _mm_set_ps(2., 2., 2., 2.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_getexp_ss(a, 0b11111111, a, b);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getexp_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_maskz_getexp_ss(0, a, b);
+ let e = _mm_set_ps(2., 2., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_getexp_ss(0b11111111, a, b);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getexp_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_getexp_sd(a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getexp_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_mask_getexp_sd(a, 0, a, b);
+ let e = _mm_set_pd(2., 2.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_getexp_sd(a, 0b11111111, a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getexp_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_maskz_getexp_sd(0, a, b);
+ let e = _mm_set_pd(2., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_getexp_sd(0b11111111, a, b);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getmant_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_getmant_ss(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getmant_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_mask_getmant_ss(a, 0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
+ let e = _mm_set_ps(20., 20., 20., 20.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_getmant_ss(a, 0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getmant_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_maskz_getmant_ss(0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
+ let e = _mm_set_ps(20., 20., 20., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_getmant_ss(0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getmant_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_getmant_sd(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getmant_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_mask_getmant_sd(a, 0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
+ let e = _mm_set_pd(20., 20.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_getmant_sd(a, 0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getmant_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_maskz_getmant_sd(0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
+ let e = _mm_set_pd(20., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_getmant_sd(0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm_add_round_ss() {
let a = _mm_set_ps(1., 2., 10., 20.);
@@ -30857,4 +31723,208 @@ mod tests {
let e = _mm_set_pd(1., 2.);
assert_eq_m128d(r, e);
}
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getexp_round_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_getexp_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getexp_round_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_mask_getexp_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(2., 2., 2., 2.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_getexp_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getexp_round_ss() {
+ let a = _mm_set1_ps(2.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_maskz_getexp_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(2., 2., 2., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_getexp_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(2., 2., 2., 1.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getexp_round_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_getexp_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getexp_round_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_mask_getexp_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(2., 2.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_getexp_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getexp_round_sd() {
+ let a = _mm_set1_pd(2.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_maskz_getexp_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(2., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_getexp_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(2., 1.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getmant_round_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_getmant_round_ss(
+ a,
+ b,
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getmant_round_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_mask_getmant_round_ss(
+ a,
+ 0,
+ a,
+ b,
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm_set_ps(20., 20., 20., 20.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_getmant_round_ss(
+ a,
+ 0b11111111,
+ a,
+ b,
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getmant_round_ss() {
+ let a = _mm_set1_ps(20.);
+ let b = _mm_set1_ps(10.);
+ let r = _mm_maskz_getmant_round_ss(
+ 0,
+ a,
+ b,
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm_set_ps(20., 20., 20., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_getmant_round_ss(
+ 0b11111111,
+ a,
+ b,
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm_set_ps(20., 20., 20., 1.25);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_getmant_round_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_getmant_round_sd(
+ a,
+ b,
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_getmant_round_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_mask_getmant_round_sd(
+ a,
+ 0,
+ a,
+ b,
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm_set_pd(20., 20.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_getmant_round_sd(
+ a,
+ 0b11111111,
+ a,
+ b,
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_getmant_round_sd() {
+ let a = _mm_set1_pd(20.);
+ let b = _mm_set1_pd(10.);
+ let r = _mm_maskz_getmant_round_sd(
+ 0,
+ a,
+ b,
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm_set_pd(20., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_getmant_round_sd(
+ 0b11111111,
+ a,
+ b,
+ _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_SRC,
+ _MM_FROUND_CUR_DIRECTION,
+ );
+ let e = _mm_set_pd(20., 1.25);
+ assert_eq_m128d(r, e);
+ }
}
From 16d6dd60abb7f542b5f5d00a6541b47ee2268d10 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Thu, 15 Oct 2020 20:24:22 +0000
Subject: [PATCH 22/25] roundscale: ss,sd; roundscale_round: ss,sd; scalef:
ss,sd; scalef_round: ss,sd;
---
crates/core_arch/avx512f.md | 48 +-
crates/core_arch/src/x86/avx512f.rs | 1107 +++++++++++++++++++++++++--
2 files changed, 1064 insertions(+), 91 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 5b2aaa1c21..052395b275 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -1300,16 +1300,16 @@
* [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236)
* [x] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236)
* [x] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236)
- * [ ] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236)
- * [ ] [`_mm_mask_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_ss&expand=5236)
- * [ ] [`_mm_mask_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sd&expand=5236)
- * [ ] [`_mm_mask_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ss&expand=5236)
+ * [x] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236)
+ * [x] [`_mm_mask_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_ss&expand=5236)
+ * [x] [`_mm_mask_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sd&expand=5236)
+ * [x] [`_mm_mask_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ss&expand=5236)
* [x] [`_mm_mask_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_sd&expand=5236)
* [x] [`_mm_mask_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ss&expand=5236)
- * [ ] [`_mm_mask_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sd&expand=5236)
- * [ ] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236)
- * [ ] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236)
- * [ ] [`_mm_mask_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ss&expand=5236)
+ * [x] [`_mm_mask_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sd&expand=5236)
+ * [x] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236)
+ * [x] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236)
+ * [x] [`_mm_mask_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ss&expand=5236)
* [x] [`_mm_mask_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sd&expand=5236)
* [x] [`_mm_mask_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_ss&expand=5236)
* [x] [`_mm_mask_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sd&expand=5236)
@@ -1378,16 +1378,16 @@
* [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236)
* [x] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236)
* [x] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236)
- * [ ] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236)
- * [ ] [`_mm_maskz_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_ss&expand=5236)
- * [ ] [`_mm_maskz_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sd&expand=5236)
- * [ ] [`_mm_maskz_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ss&expand=5236)
+ * [x] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236)
+ * [x] [`_mm_maskz_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_ss&expand=5236)
+ * [x] [`_mm_maskz_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sd&expand=5236)
+ * [x] [`_mm_maskz_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ss&expand=5236)
* [x] [`_mm_maskz_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_sd&expand=5236)
* [x] [`_mm_maskz_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ss&expand=5236)
- * [ ] [`_mm_maskz_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sd&expand=5236)
- * [ ] [`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236)
- * [ ] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236)
- * [ ] [`_mm_maskz_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ss&expand=5236)
+ * [x] [`_mm_maskz_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sd&expand=5236)
+ * [x] [`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236)
+ * [x] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236)
+ * [x] [`_mm_maskz_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ss&expand=5236)
* [x] [`_mm_maskz_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sd&expand=5236)
* [x] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236)
* [x] [`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236)
@@ -1404,16 +1404,16 @@
* [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236)
* [x] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236)
* [x] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236)
- * [ ] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236)
- * [ ] [`_mm_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_ss&expand=5236)
- * [ ] [`_mm_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sd&expand=5236)
- * [ ] [`_mm_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ss&expand=5236)
+ * [x] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236)
+ * [x] [`_mm_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_ss&expand=5236)
+ * [x] [`_mm_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sd&expand=5236)
+ * [x] [`_mm_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ss&expand=5236)
* [x] [`_mm_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_sd&expand=5236)
* [x] [`_mm_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ss&expand=5236)
- * [ ] [`_mm_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sd&expand=5236)
- * [ ] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236)
- * [ ] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236)
- * [ ] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236)
+ * [x] [`_mm_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sd&expand=5236)
+ * [x] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236)
+ * [x] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236)
+ * [x] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236)
* [x] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236)
* [x] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236)
* [x] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 24b6245e17..a7e298c550 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -19385,6 +19385,294 @@ pub unsafe fn _mm_maskz_getmant_sd(
transmute(r)
}
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_ss&expand=4802)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 255))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_roundscale_ss(a: __m128, b: __m128, imm8: i32) -> __m128 {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscaless(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b11111111,
+ $imm8,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
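// `constify_imm8_sae!` (one of the constify_* helpers in this crate's
// macros.rs) bridges the runtime `imm8: i32` and the compile-time immediate the
// builtin needs: it matches on the value and expands `call!` with a literal in
// every arm. A reduced sketch of the idea (the macro name here is ours, and the
// real helper spells out every arm):
macro_rules! constify_imm8_sketch {
    ($imm8:expr, $expand:ident) => {
        match $imm8 & 0b1111_1111 {
            0 => $expand!(0),
            1 => $expand!(1),
            // ... one arm per possible immediate ...
            _ => $expand!(255),
        }
    };
}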
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_ss&expand=4800)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_roundscale_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ imm8: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscaless(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ src.as_f32x4(),
+ k,
+ $imm8,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_ss&expand=4801)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_roundscale_ss(k: __mmask8, a: __m128, b: __m128, imm8: i32) -> __m128 {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscaless(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm8,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_sd&expand=4799)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 255))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_roundscale_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscalesd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b11111111,
+ $imm8,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_sd&expand=4797)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_roundscale_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ imm8: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscalesd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ $imm8,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_sd&expand=4798)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_roundscale_sd(k: __mmask8, a: __m128d, b: __m128d, imm8: i32) -> __m128d {
+ macro_rules! call {
+ ($imm8:expr) => {
+ vrndscalesd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm8,
+ _MM_FROUND_CUR_DIRECTION,
+ )
+ };
+ }
+ let r = constify_imm8_sae!(imm8, call);
+ transmute(r)
+}
+
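// The roundscale imm8 packs two controls: bits 7:4 give M, the number of binary
// fraction bits to keep, and the low bits pick the rounding mode listed in the
// doc comments above, so dst[0] = 2^-M * round(b[0] * 2^M). Worked sketch
// (helper name is ours):
unsafe fn roundscale_sketch() {
    let a = _mm_set1_pd(2.2);
    // M = 0, round to nearest: 2.2 -> 2.0
    assert_eq!(_mm_cvtsd_f64(_mm_roundscale_sd(a, a, 0)), 2.0);
    // M = 2 (imm8 = 2 << 4): 2.2 * 4 = 8.8 rounds to 9, so the result is 9 / 4 = 2.25
    assert_eq!(_mm_cvtsd_f64(_mm_roundscale_sd(a, a, 2 << 4)), 2.25);
}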
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_ss&expand=4901)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss))]
+pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 {
+ transmute(vscalefss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_ss&expand=4899)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss))]
+pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vscalefss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ src.as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_ss&expand=4900)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss))]
+pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ transmute(vscalefss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_sd&expand=4898)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefsd))]
+pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d {
+ transmute(vscalefsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b11111111,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_sd&expand=4896)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefsd))]
+pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vscalefsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ src.as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_sd&expand=4897)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefsd))]
+pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ transmute(vscalefsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ _MM_FROUND_CUR_DIRECTION,
+ ))
+}
+
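As a concrete reading of the scalef doc comments above (editorial sketch, helper name illustrative): the lower lane becomes a * 2^floor(b).

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn scalef_sd_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set1_pd(1.5);
    let b = _mm_set1_pd(3.0);
    let r = _mm_scalef_sd(a, b);
    assert_eq!(_mm_cvtsd_f64(r), 12.0); // 1.5 * 2^3
}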
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
@@ -20835,114 +21123,504 @@ pub unsafe fn _mm_maskz_getmant_round_ss(
/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889)
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
+#[rustc_args_required_const(2, 3, 4)]
+pub unsafe fn _mm_getmant_round_sd(
+ a: __m128d,
+ b: __m128d,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+ sae: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
+ vgetmantsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ $imm2 << 2 | $imm4_1,
+ _mm_setzero_pd().as_f64x2(),
+ 0b1,
+ $imm4_2,
+ )
+ };
+ }
+ let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
+#[rustc_args_required_const(4, 5, 6)]
+pub unsafe fn _mm_mask_getmant_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+ sae: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
+ vgetmantsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ $imm2 << 2 | $imm4_1,
+ src.as_f64x2(),
+ k,
+ $imm4_2,
+ )
+ };
+ }
+ let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ transmute(r)
+}
+
+/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
+/// The mantissa is normalized to the interval specified by interv, which can take the following values:
+/// _MM_MANT_NORM_1_2 // interval [1, 2)
+/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
+/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
+/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
+/// The sign is determined by sc which can take the following values:
+/// _MM_MANT_SIGN_src // sign = sign(src)
+/// _MM_MANT_SIGN_zero // sign = 0
+/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
+#[rustc_args_required_const(3, 4, 5)]
+pub unsafe fn _mm_maskz_getmant_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ norm: _MM_MANTISSA_NORM_ENUM,
+ sign: _MM_MANTISSA_SIGN_ENUM,
+ sae: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
+ vgetmantsd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ $imm2 << 2 | $imm4_1,
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm4_2,
+ )
+ };
+ }
+ let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ transmute(r)
+}
+
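A worked example of the ±(2^k)*|x.significand| description above (editorial sketch; assumes the _MM_MANT_NORM_1_2 / _MM_MANT_SIGN_SRC constants exported by this module): 10.0 = 1.25 * 2^3, so normalizing to the interval [1, 2) yields 1.25.

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn getmant_round_sd_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set1_pd(20.0);
    let b = _mm_set1_pd(10.0);
    let r = _mm_getmant_round_sd(
        a,
        b,
        _MM_MANT_NORM_1_2,
        _MM_MANT_SIGN_SRC,
        _MM_FROUND_CUR_DIRECTION,
    );
    assert_eq!(_mm_cvtsd_f64(r), 1.25); // upper lane is copied from `a`
}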
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_ss&expand=4796)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(2, 3)]
+pub unsafe fn _mm_roundscale_round_ss(a: __m128, b: __m128, imm8: i32, sae: i32) -> __m128 {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscaless(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b11111111,
+ $imm8,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_ss&expand=4794)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(4, 5)]
+pub unsafe fn _mm_mask_roundscale_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ imm8: i32,
+ sae: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscaless(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm8, $imm4)
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_ss&expand=4795)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(3, 4)]
+pub unsafe fn _mm_maskz_roundscale_round_ss(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ imm8: i32,
+ sae: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscaless(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm8,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_sd&expand=4793)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(2, 3)]
+pub unsafe fn _mm_roundscale_round_sd(a: __m128d, b: __m128d, imm8: i32, sae: i32) -> __m128d {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscalesd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ 0b11111111,
+ $imm8,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_sd&expand=4791)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(4, 5)]
+pub unsafe fn _mm_mask_roundscale_round_sd(
+ src: __m128d,
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ imm8: i32,
+ sae: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscalesd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm8, $imm4)
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
+/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_sd&expand=4792)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(3, 4)]
+pub unsafe fn _mm_maskz_roundscale_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ imm8: i32,
+ sae: i32,
+) -> __m128d {
+ macro_rules! call {
+ ($imm8:expr, $imm4:expr) => {
+ vrndscalesd(
+ a.as_f64x2(),
+ b.as_f64x2(),
+ _mm_setzero_pd().as_f64x2(),
+ k,
+ $imm8,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm8_roundscale!(imm8, sae, call);
+ transmute(r)
+}
+
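The *_roundscale_round_* forms only add the sae argument to the intrinsics above; a minimal editorial sketch (helper name illustrative), assuming _MM_FROUND_NO_EXC is accepted for sae as documented:

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn roundscale_round_sd_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set1_pd(2.2);
    let b = _mm_set1_pd(1.1);
    // imm8 = 0: round to the nearest integer, so 1.1 -> 1.0
    let r = _mm_roundscale_round_sd(a, b, 0, _MM_FROUND_NO_EXC);
    assert_eq!(_mm_cvtsd_f64(r), 1.0);
}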
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_ss&expand=4895)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vscalefss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ 0b11111111,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm4_round!(rounding, call);
+ transmute(r)
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_ss&expand=4893)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_scalef_round_ss(
+ src: __m128,
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ rounding: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vscalefss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+ };
+ }
+ let r = constify_imm4_round!(rounding, call);
+ transmute(r)
+}
+
+/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_ss&expand=4894)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_scalef_round_ss(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ rounding: i32,
+) -> __m128 {
+ macro_rules! call {
+ ($imm4:expr) => {
+ vscalefss(
+ a.as_f32x4(),
+ b.as_f32x4(),
+ _mm_setzero_ps().as_f32x4(),
+ k,
+ $imm4,
+ )
+ };
+ }
+ let r = constify_imm4_round!(rounding, call);
+ transmute(r)
+}
+
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_sd&expand=4892)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
-#[rustc_args_required_const(2, 3, 4)]
-pub unsafe fn _mm_getmant_round_sd(
- a: __m128d,
- b: __m128d,
- norm: _MM_MANTISSA_NORM_ENUM,
- sign: _MM_MANTISSA_SIGN_ENUM,
- sae: i32,
-) -> __m128d {
+#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
macro_rules! call {
- ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
- vgetmantsd(
+ ($imm4:expr) => {
+ vscalefsd(
a.as_f64x2(),
b.as_f64x2(),
- $imm2 << 2 | $imm4_1,
_mm_setzero_pd().as_f64x2(),
- 0b1,
- $imm4_2,
+ 0b11111111,
+ $imm4,
)
};
}
- let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ let r = constify_imm4_round!(rounding, call);
transmute(r)
}
-/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
-/// The mantissa is normalized to the interval specified by interv, which can take the following values:
-/// _MM_MANT_NORM_1_2 // interval [1, 2)
-/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
-/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
-/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
-/// The sign is determined by sc which can take the following values:
-/// _MM_MANT_SIGN_src // sign = sign(src)
-/// _MM_MANT_SIGN_zero // sign = 0
-/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890)
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_sd&expand=4890)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
-#[rustc_args_required_const(4, 5, 6)]
-pub unsafe fn _mm_mask_getmant_round_sd(
+#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_scalef_round_sd(
src: __m128d,
k: __mmask8,
a: __m128d,
b: __m128d,
- norm: _MM_MANTISSA_NORM_ENUM,
- sign: _MM_MANTISSA_SIGN_ENUM,
- sae: i32,
+ rounding: i32,
) -> __m128d {
macro_rules! call {
- ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
- vgetmantsd(
- a.as_f64x2(),
- b.as_f64x2(),
- $imm2 << 2 | $imm4_1,
- src.as_f64x2(),
- k,
- $imm4_2,
- )
+ ($imm4:expr) => {
+ vscalefsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
};
}
- let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ let r = constify_imm4_round!(rounding, call);
transmute(r)
}
-/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
-/// The mantissa is normalized to the interval specified by interv, which can take the following values:
-/// _MM_MANT_NORM_1_2 // interval [1, 2)
-/// _MM_MANT_NORM_p5_2 // interval [0.5, 2)
-/// _MM_MANT_NORM_p5_1 // interval [0.5, 1)
-/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
-/// The sign is determined by sc which can take the following values:
-/// _MM_MANT_SIGN_src // sign = sign(src)
-/// _MM_MANT_SIGN_zero // sign = 0
-/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
-/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
-/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891)
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_sd&expand=4891)
#[inline]
#[target_feature(enable = "avx512f")]
-#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
-#[rustc_args_required_const(3, 4, 5)]
-pub unsafe fn _mm_maskz_getmant_round_sd(
+#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_scalef_round_sd(
k: __mmask8,
a: __m128d,
b: __m128d,
- norm: _MM_MANTISSA_NORM_ENUM,
- sign: _MM_MANTISSA_SIGN_ENUM,
- sae: i32,
+ rounding: i32,
) -> __m128d {
macro_rules! call {
- ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
- vgetmantsd(
+ ($imm4:expr) => {
+ vscalefsd(
a.as_f64x2(),
b.as_f64x2(),
- $imm2 << 2 | $imm4_1,
_mm_setzero_pd().as_f64x2(),
k,
- $imm4_2,
+ $imm4,
)
};
}
- let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
+ let r = constify_imm4_round!(rounding, call);
transmute(r)
}
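For the scalef rounding variants, the rounding argument is either _MM_FROUND_CUR_DIRECTION or a direction OR'ed with _MM_FROUND_NO_EXC, as the doc comments above state; a short editorial sketch (helper name illustrative):

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn scalef_round_sd_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set1_pd(1.0);
    let b = _mm_set1_pd(3.0);
    let r = _mm_scalef_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    assert_eq!(_mm_cvtsd_f64(r), 8.0); // 1.0 * 2^3
}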
@@ -21594,6 +22272,15 @@ extern "C" {
fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
#[link_name = "llvm.x86.avx512.rcp14.sd"]
fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.mask.rndscale.ss"]
+ fn vrndscaless(a: f32x4, b: f32x4, src: f32x4, mask: u8, imm8: i32, sae: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.rndscale.sd"]
+ fn vrndscalesd(a: f64x2, b: f64x2, src: f64x2, mask: u8, imm8: i32, sae: i32) -> f64x2;
+ #[link_name = "llvm.x86.avx512.mask.scalef.ss"]
+ fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
+ #[link_name = "llvm.x86.avx512.mask.scalef.sd"]
+ fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
}
#[cfg(test)]
@@ -31192,6 +31879,138 @@ mod tests {
assert_eq_m128d(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_roundscale_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_roundscale_ss(a, b, 0);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_roundscale_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_mask_roundscale_ss(a, 0, a, b, 0);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_roundscale_ss(a, 0b11111111, a, b, 0);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_roundscale_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_maskz_roundscale_ss(0, a, b, 0);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_roundscale_ss(0b11111111, a, b, 0);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_roundscale_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_roundscale_sd(a, b, 0);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_roundscale_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_mask_roundscale_sd(a, 0, a, b, 0);
+ let e = _mm_set_pd(2.2, 2.2);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_roundscale_sd(a, 0b11111111, a, b, 0);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_roundscale_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_maskz_roundscale_sd(0, a, b, 0);
+ let e = _mm_set_pd(2.2, 0.0);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_roundscale_sd(0b11111111, a, b, 0);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_scalef_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_scalef_ss(a, b);
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_scalef_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_mask_scalef_ss(a, 0, a, b);
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_scalef_ss(a, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_scalef_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_maskz_scalef_ss(0, a, b);
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_scalef_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_scalef_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_scalef_sd(a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_scalef_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_mask_scalef_sd(a, 0, a, b);
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_scalef_sd(a, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_scalef_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_maskz_scalef_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_scalef_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm_add_round_ss() {
let a = _mm_set_ps(1., 2., 10., 20.);
@@ -31927,4 +32746,158 @@ mod tests {
let e = _mm_set_pd(20., 1.25);
assert_eq_m128d(r, e);
}
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_roundscale_round_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_roundscale_round_ss(a, b, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_roundscale_round_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_mask_roundscale_round_ss(a, 0, a, b, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_roundscale_round_ss(a, 0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_roundscale_round_ss() {
+ let a = _mm_set1_ps(2.2);
+ let b = _mm_set1_ps(1.1);
+ let r = _mm_maskz_roundscale_round_ss(0, a, b, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_roundscale_round_ss(0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_roundscale_round_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_roundscale_round_sd(a, b, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_roundscale_round_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_mask_roundscale_round_sd(a, 0, a, b, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(2.2, 2.2);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_roundscale_round_sd(a, 0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_roundscale_round_sd() {
+ let a = _mm_set1_pd(2.2);
+ let b = _mm_set1_pd(1.1);
+ let r = _mm_maskz_roundscale_round_sd(0, a, b, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(2.2, 0.0);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_roundscale_round_sd(0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION);
+ let e = _mm_set_pd(2.2, 1.0);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_scalef_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_scalef_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_scalef_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_mask_scalef_round_ss(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 1., 1., 1.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_scalef_round_ss(
+ a,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_scalef_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(3.);
+ let r = _mm_maskz_scalef_round_ss(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_scalef_round_ss(
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_ps(1., 1., 1., 8.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_scalef_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_scalef_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_scalef_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_mask_scalef_round_sd(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 1.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_scalef_round_sd(
+ a,
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_scalef_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(3.);
+ let r = _mm_maskz_scalef_round_sd(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_scalef_round_sd(
+ 0b11111111,
+ a,
+ b,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_pd(1., 8.);
+ assert_eq_m128d(r, e);
+ }
}
From 8052b6430474af564f9ee40e25f58ca9384e8614 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Thu, 15 Oct 2020 23:58:17 +0000
Subject: [PATCH 23/25] mask_move: ss,sd
---
crates/core_arch/avx512f.md | 8 +-
crates/core_arch/src/x86/avx512f.rs | 112 ++++++++++++++++++++++++++++
2 files changed, 116 insertions(+), 4 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index 052395b275..ae452723b0 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -1292,8 +1292,8 @@
* [x] [`_mm_mask_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_ss&expand=5236)
* [x] [`_mm_mask_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sd&expand=5236)
* [x] [`_mm_mask_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ss&expand=5236)
- * [ ] [`_mm_mask_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sd&expand=5236)
- * [ ] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236)
+ * [x] [`_mm_mask_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sd&expand=5236)
+ * [x] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236)
* [x] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236)
* [x] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236)
* [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236)
@@ -1370,8 +1370,8 @@
* [x] [`_mm_maskz_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_ss&expand=5236)
* [x] [`_mm_maskz_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sd&expand=5236)
* [x] [`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236)
- * [ ] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236)
- * [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236)
+ * [x] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236)
+ * [x] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236)
* [x] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236)
* [x] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236)
* [x] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index a7e298c550..270d8c5859 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -18417,6 +18417,68 @@ pub unsafe fn _mm512_set_pd(
_mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0)
}
+/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_ss&expand=3832)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovss))]
+pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let extractsrc: f32 = simd_extract(src, 0);
+ let mut mov: f32 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ mov = simd_extract(b, 0);
+ }
+ let r = simd_insert(a, 0, mov);
+ transmute(r)
+}
+
+/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_ss&expand=3833)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovss))]
+pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
+ let mut mov: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ mov = simd_extract(b, 0);
+ }
+ let r = simd_insert(a, 0, mov);
+ transmute(r)
+}
+
+/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_sd&expand=3829)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovsd))]
+pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let extractsrc: f64 = simd_extract(src, 0);
+ let mut mov: f64 = extractsrc;
+ if (k & 0b00000001) != 0 {
+ mov = simd_extract(b, 0);
+ }
+ let r = simd_insert(a, 0, mov);
+ transmute(r)
+}
+
+/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_sd&expand=3830)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmovsd))]
+pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
+ let mut mov: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ mov = simd_extract(b, 0);
+ }
+ let r = simd_insert(a, 0, mov);
+ transmute(r)
+}
+
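To make the writemask/zeromask distinction concrete (editorial sketch, not part of the patch; values mirror the tests added below):

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn mask_move_sd_sketch() {
    use core::arch::x86_64::*;
    let src = _mm_set_pd(10., 11.);
    let a = _mm_set_pd(1., 2.);
    let b = _mm_set_pd(3., 4.);
    // mask bit 0 clear: the writemask form keeps src's lower lane, the zeromask form zeroes it
    assert_eq!(_mm_cvtsd_f64(_mm_mask_move_sd(src, 0, a, b)), 11.);
    assert_eq!(_mm_cvtsd_f64(_mm_maskz_move_sd(0, a, b)), 0.);
}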
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159)
@@ -31265,6 +31327,56 @@ mod tests {
assert_eq_m512i(r, e);
}
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_move_ss() {
+ let src = _mm_set_ps(10., 11., 100., 110.);
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_mask_move_ss(src, 0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 110.);
+ assert_eq_m128(r, e);
+ let r = _mm_mask_move_ss(src, 0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 40.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_move_ss() {
+ let a = _mm_set_ps(1., 2., 10., 20.);
+ let b = _mm_set_ps(3., 4., 30., 40.);
+ let r = _mm_maskz_move_ss(0, a, b);
+ let e = _mm_set_ps(1., 2., 10., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_move_ss(0b11111111, a, b);
+ let e = _mm_set_ps(1., 2., 10., 40.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_move_sd() {
+ let src = _mm_set_pd(10., 11.);
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_mask_move_sd(src, 0, a, b);
+ let e = _mm_set_pd(1., 11.);
+ assert_eq_m128d(r, e);
+ let r = _mm_mask_move_sd(src, 0b11111111, a, b);
+ let e = _mm_set_pd(1., 4.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_move_sd() {
+ let a = _mm_set_pd(1., 2.);
+ let b = _mm_set_pd(3., 4.);
+ let r = _mm_maskz_move_sd(0, a, b);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_move_sd(0b11111111, a, b);
+ let e = _mm_set_pd(1., 4.);
+ assert_eq_m128d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm_mask_add_ss() {
let src = _mm_set_ps(10., 11., 100., 110.);
From 577cbc4169fd5d136a05095f7efed0022d2a2d1e Mon Sep 17 00:00:00 2001
From: jironglin
Date: Fri, 16 Oct 2020 17:13:37 +0000
Subject: [PATCH 24/25] mask_fmadd: ss,sd; fmadd_round: ss,sd;
---
crates/core_arch/avx512f.md | 28 +-
crates/core_arch/src/x86/avx512f.rs | 587 ++++++++++++++++++++++++++++
2 files changed, 601 insertions(+), 14 deletions(-)
diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md
index ae452723b0..75b8c725df 100644
--- a/crates/core_arch/avx512f.md
+++ b/crates/core_arch/avx512f.md
@@ -1206,8 +1206,8 @@
* [ ] [`_mm_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_round_ss&expand=5236)
* [ ] [`_mm_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_sd&expand=5236)
* [ ] [`_mm_fixupimm_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_ss&expand=5236)
- * [ ] [`_mm_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sd&expand=5236)
- * [ ] [`_mm_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_ss&expand=5236)
+ * [x] [`_mm_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sd&expand=5236)
+ * [x] [`_mm_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_ss&expand=5236)
* [ ] [`_mm_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sd&expand=5236)
* [ ] [`_mm_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_ss&expand=5236)
* [ ] [`_mm_fnmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sd&expand=5236)
@@ -1222,10 +1222,10 @@
* [x] [`_mm_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_ss&expand=5236)
* [x] [`_mm_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sd&expand=5236)
* [x] [`_mm_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ss&expand=5236)
- * [ ] [`_mm_mask3_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sd&expand=5236)
- * [ ] [`_mm_mask3_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_ss&expand=5236)
- * [ ] [`_mm_mask3_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sd&expand=5236)
- * [ ] [`_mm_mask3_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ss&expand=5236)
+ * [x] [`_mm_mask3_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sd&expand=5236)
+ * [x] [`_mm_mask3_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_ss&expand=5236)
+ * [x] [`_mm_mask3_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sd&expand=5236)
+ * [x] [`_mm_mask3_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ss&expand=5236)
* [ ] [`_mm_mask3_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sd&expand=5236)
* [ ] [`_mm_mask3_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_ss&expand=5236)
* [ ] [`_mm_mask3_fmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sd&expand=5236)
@@ -1258,10 +1258,10 @@
* [ ] [`_mm_mask_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_ss&expand=5236)
* [ ] [`_mm_mask_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_sd&expand=5236)
* [ ] [`_mm_mask_fixupimm_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_ss&expand=5236)
- * [ ] [`_mm_mask_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sd&expand=5236)
- * [ ] [`_mm_mask_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_ss&expand=5236)
- * [ ] [`_mm_mask_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sd&expand=5236)
- * [ ] [`_mm_mask_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ss&expand=5236)
+ * [x] [`_mm_mask_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sd&expand=5236)
+ * [x] [`_mm_mask_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_ss&expand=5236)
+ * [x] [`_mm_mask_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sd&expand=5236)
+ * [x] [`_mm_mask_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ss&expand=5236)
* [ ] [`_mm_mask_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sd&expand=5236)
* [ ] [`_mm_mask_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_ss&expand=5236)
* [ ] [`_mm_mask_fmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sd&expand=5236)
@@ -1336,10 +1336,10 @@
* [ ] [`_mm_maskz_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_ss&expand=5236)
* [ ] [`_mm_maskz_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_sd&expand=5236)
* [ ] [`_mm_maskz_fixupimm_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_ss&expand=5236)
- * [ ] [`_mm_maskz_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sd&expand=5236)
- * [ ] [`_mm_maskz_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_ss&expand=5236)
- * [ ] [`_mm_maskz_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sd&expand=5236)
- * [ ] [`_mm_maskz_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ss&expand=5236)
+ * [x] [`_mm_maskz_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sd&expand=5236)
+ * [x] [`_mm_maskz_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_ss&expand=5236)
+ * [x] [`_mm_maskz_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sd&expand=5236)
+ * [x] [`_mm_maskz_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ss&expand=5236)
* [ ] [`_mm_maskz_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sd&expand=5236)
* [ ] [`_mm_maskz_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_ss&expand=5236)
* [ ] [`_mm_maskz_fmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sd&expand=5236)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index 270d8c5859..d231c42b19 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -19735,6 +19735,110 @@ pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128
))
}
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_ss&expand=2582)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss))]
+pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
+ let mut fmadd: f32 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_ss&expand=2584)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss))]
+pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
+ let mut fmadd: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_ss&expand=2583)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss))]
+pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
+ let mut fmadd: f32 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(c, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_sd&expand=2578)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd))]
+pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
+ let mut fmadd: f64 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_sd&expand=2580)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd))]
+pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
+ let mut fmadd: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_sd&expand=2579)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd))]
+pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
+ let mut fmadd: f64 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
+ }
+ let r = simd_insert(c, 0, fmadd);
+ transmute(r)
+}
+
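The three masked forms above differ only in where lane 0 and the upper lanes come from when mask bit 0 is clear: _mm_mask_fmadd_ss merges lane 0 from a, _mm_maskz_fmadd_ss zeroes it, and _mm_mask3_fmadd_ss merges both lane 0 and the upper lanes from c. A minimal usage sketch, assuming the intrinsics are exported from core::arch::x86_64 once this patch lands (nightly, stdsimd feature) and that the host supports AVX-512F; the operand values mirror the unit tests added later in this patch:

    #[cfg(target_arch = "x86_64")]
    unsafe fn masked_fmadd_ss_sketch() {
        use core::arch::x86_64::*;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);
        // bit 0 set: lane 0 = 1.0 * 2.0 + 3.0 = 5.0, upper lanes copied from a
        let merged = _mm_mask_fmadd_ss(a, 0b1, b, c);
        // bit 0 clear: lane 0 zeroed, upper lanes still copied from a
        let zeroed = _mm_maskz_fmadd_ss(0b0, a, b, c);
        // mask3 form: bit 0 clear keeps lane 0 from c, and the upper lanes also come from c
        let from_c = _mm_mask3_fmadd_ss(a, b, c, 0b0);
        let _ = (merged, zeroed, from_c);
    }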
/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
@@ -21686,6 +21790,280 @@ pub unsafe fn _mm_maskz_scalef_round_sd(
transmute(r)
}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_ss&expand=2573)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ macro_rules! call {
+ ($imm4:expr) => {
+ vfmadd132ss(extracta, extractb, extractc, $imm4)
+ };
+ }
+ let fmadd = constify_imm4_round!(rounding, call);
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_ss&expand=2574)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_fmadd_round_ss(
+ a: __m128,
+ k: __mmask8,
+ b: __m128,
+ c: __m128,
+ rounding: i32,
+) -> __m128 {
+ let mut fmadd: f32 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ macro_rules! call {
+ ($imm4:expr) => {
+ vfmadd132ss(fmadd, extractb, extractc, $imm4)
+ };
+ }
+ fmadd = constify_imm4_round!(rounding, call);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_ss&expand=2576)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_maskz_fmadd_round_ss(
+ k: __mmask8,
+ a: __m128,
+ b: __m128,
+ c: __m128,
+ rounding: i32,
+) -> __m128 {
+ let mut fmadd: f32 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ let extractc: f32 = simd_extract(c, 0);
+ macro_rules! call {
+ ($imm4:expr) => {
+ vfmadd132ss(extracta, extractb, extractc, $imm4)
+ };
+ }
+ fmadd = constify_imm4_round!(rounding, call);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_ss&expand=2575)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask3_fmadd_round_ss(
+ a: __m128,
+ b: __m128,
+ c: __m128,
+ k: __mmask8,
+ rounding: i32,
+) -> __m128 {
+ let mut fmadd: f32 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f32 = simd_extract(a, 0);
+ let extractb: f32 = simd_extract(b, 0);
+ macro_rules! call {
+ ($imm4:expr) => {
+ vfmadd132ss(extracta, extractb, fmadd, $imm4)
+ };
+ }
+ fmadd = constify_imm4_round!(rounding, call);
+ }
+ let r = simd_insert(c, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_sd&expand=2569)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ macro_rules! call {
+ ($imm4:expr) => {
+ vfmadd132sd(extracta, extractb, extractc, $imm4)
+ };
+ }
+ let fmadd = constify_imm4_round!(rounding, call);
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_sd&expand=2570)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_fmadd_round_sd(
+ a: __m128d,
+ k: __mmask8,
+ b: __m128d,
+ c: __m128d,
+ rounding: i32,
+) -> __m128d {
+ let mut fmadd: f64 = simd_extract(a, 0);
+ if (k & 0b00000001) != 0 {
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ macro_rules! call {
+ ($imm4:expr) => {
+ vfmadd132sd(fmadd, extractb, extractc, $imm4)
+ };
+ }
+ fmadd = constify_imm4_round!(rounding, call);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_sd&expand=2572)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_maskz_fmadd_round_sd(
+ k: __mmask8,
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+ rounding: i32,
+) -> __m128d {
+ let mut fmadd: f64 = 0.;
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ let extractc: f64 = simd_extract(c, 0);
+ macro_rules! call {
+ ($imm4:expr) => {
+ vfmadd132sd(extracta, extractb, extractc, $imm4)
+ };
+ }
+ fmadd = constify_imm4_round!(rounding, call);
+ }
+ let r = simd_insert(a, 0, fmadd);
+ transmute(r)
+}
+
+/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
+///
+/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:
+/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
+/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
+/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
+/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_sd&expand=2571)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask3_fmadd_round_sd(
+ a: __m128d,
+ b: __m128d,
+ c: __m128d,
+ k: __mmask8,
+ rounding: i32,
+) -> __m128d {
+ let mut fmadd: f64 = simd_extract(c, 0);
+ if (k & 0b00000001) != 0 {
+ let extracta: f64 = simd_extract(a, 0);
+ let extractb: f64 = simd_extract(b, 0);
+ macro_rules! call {
+ ($imm4:expr) => {
+ vfmadd132sd(extracta, extractb, fmadd, $imm4)
+ };
+ }
+ fmadd = constify_imm4_round!(rounding, call);
+ }
+ let r = simd_insert(c, 0, fmadd);
+ transmute(r)
+}
+
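The _round variants take the rounding mode as an explicit argument instead of reading MXCSR, and because of #[rustc_args_required_const] the argument must be a compile-time constant built from the _MM_FROUND_* flags listed in the doc comments. A hedged usage sketch, with the same assumptions and operand values as the sketch earlier in this patch:

    #[cfg(target_arch = "x86_64")]
    unsafe fn fmadd_round_sd_sketch() {
        use core::arch::x86_64::*;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);
        // explicit round-to-nearest with exceptions suppressed (encodes as 8)
        let r = _mm_fmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        // lane 0 holds 1.0 * 2.0 + 3.0 = 5.0; lane 1 is copied from a
        let _ = r;
    }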
/// Equal
pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00;
/// Less-than
@@ -22343,6 +22721,11 @@ extern "C" {
fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
#[link_name = "llvm.x86.avx512.mask.scalef.sd"]
fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
+
+ #[link_name = "llvm.x86.avx512.vfmadd.f32"]
+ fn vfmadd132ss(a: f32, b: f32, c: f32, rounding: i32) -> f32;
+ #[link_name = "llvm.x86.avx512.vfmadd.f64"]
+ fn vfmadd132sd(a: f64, b: f64, c: f64, rounding: i32) -> f64;
}
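The wrappers above forward their runtime rounding value to these LLVM intrinsics, which need an immediate, via constify_imm4_round! from macros.rs (added elsewhere in this patch series but not shown here). As an assumption about its shape, it is expected to expand roughly like the sketch below, which covers only the flag combinations documented above; the real macro presumably handles every 4-bit value:

    // Sketch only: dispatch a runtime rounding value to a literal immediate.
    macro_rules! constify_imm4_round_sketch {
        ($imm:expr, $expand:ident) => {
            match $imm {
                8 => $expand!(8),   // _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC
                9 => $expand!(9),   // _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC
                10 => $expand!(10), // _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC
                11 => $expand!(11), // _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC
                _ => $expand!(4),   // _MM_FROUND_CUR_DIRECTION (use MXCSR.RC)
            }
        };
    }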
#[cfg(test)]
@@ -32123,6 +32506,80 @@ mod tests {
assert_eq_m128d(r, e);
}
+
+    #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmadd_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask_fmadd_ss(a, 0, b, c);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fmadd_ss(a, 0b11111111, b, c);
+ let e = _mm_set_ps(1., 1., 1., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmadd_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_maskz_fmadd_ss(0, a, b, c);
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fmadd_ss(0b11111111, a, b, c);
+ let e = _mm_set_ps(1., 1., 1., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmadd_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask3_fmadd_ss(a, b, c, 0);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fmadd_ss(a, b, c, 0b11111111);
+ let e = _mm_set_ps(3., 3., 3., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmadd_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask_fmadd_sd(a, 0, b, c);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fmadd_sd(a, 0b11111111, b, c);
+ let e = _mm_set_pd(1., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmadd_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_maskz_fmadd_sd(0, a, b, c);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fmadd_sd(0b11111111, a, b, c);
+ let e = _mm_set_pd(1., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmadd_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask3_fmadd_sd(a, b, c, 0);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fmadd_sd(a, b, c, 0b11111111);
+ let e = _mm_set_pd(3., 5.);
+ assert_eq_m128d(r, e);
+ }
+
#[simd_test(enable = "avx512f")]
unsafe fn test_mm_add_round_ss() {
let a = _mm_set_ps(1., 2., 10., 20.);
@@ -33012,4 +33469,134 @@ mod tests {
let e = _mm_set_pd(1., 8.);
assert_eq_m128d(r, e);
}
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_fmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 1., 1., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask_fmadd_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ assert_eq_m128(r, a);
+ let r = _mm_mask_fmadd_round_ss(
+ a,
+ 0b11111111,
+ b,
+ c,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_ps(1., 1., 1., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_maskz_fmadd_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm_set_ps(1., 1., 1., 0.);
+ assert_eq_m128(r, e);
+ let r = _mm_maskz_fmadd_round_ss(
+ 0b11111111,
+ a,
+ b,
+ c,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_ps(1., 1., 1., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmadd_round_ss() {
+ let a = _mm_set1_ps(1.);
+ let b = _mm_set1_ps(2.);
+ let c = _mm_set1_ps(3.);
+ let r = _mm_mask3_fmadd_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ assert_eq_m128(r, c);
+ let r = _mm_mask3_fmadd_round_ss(
+ a,
+ b,
+ c,
+ 0b11111111,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_ps(3., 3., 3., 5.);
+ assert_eq_m128(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_fmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_fmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask_fmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask_fmadd_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ assert_eq_m128d(r, a);
+ let r = _mm_mask_fmadd_round_sd(
+ a,
+ 0b11111111,
+ b,
+ c,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_pd(1., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_maskz_fmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_maskz_fmadd_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ let e = _mm_set_pd(1., 0.);
+ assert_eq_m128d(r, e);
+ let r = _mm_maskz_fmadd_round_sd(
+ 0b11111111,
+ a,
+ b,
+ c,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_pd(1., 5.);
+ assert_eq_m128d(r, e);
+ }
+
+ #[simd_test(enable = "avx512f")]
+ unsafe fn test_mm_mask3_fmadd_round_sd() {
+ let a = _mm_set1_pd(1.);
+ let b = _mm_set1_pd(2.);
+ let c = _mm_set1_pd(3.);
+ let r = _mm_mask3_fmadd_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ assert_eq_m128d(r, c);
+ let r = _mm_mask3_fmadd_round_sd(
+ a,
+ b,
+ c,
+ 0b11111111,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
+ );
+ let e = _mm_set_pd(3., 5.);
+ assert_eq_m128d(r, e);
+ }
}
From 17659b26a6ea8debfec02536b42d05f4440d52e4 Mon Sep 17 00:00:00 2001
From: jironglin
Date: Fri, 16 Oct 2020 17:42:48 +0000
Subject: [PATCH 25/25] fix duplicated comment, remove assert int2mask
---
crates/core_arch/src/x86/avx512f.rs | 3 ---
1 file changed, 3 deletions(-)
diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs
index d231c42b19..7cea13c48c 100644
--- a/crates/core_arch/src/x86/avx512f.rs
+++ b/crates/core_arch/src/x86/avx512f.rs
@@ -2632,8 +2632,6 @@ pub unsafe fn _mm512_maskz_ternarylogic_epi64(
transmute(simd_select_bitmask(k, ternarylogic, zero))
}
-/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
-
/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
/// _MM_MANT_NORM_1_2 // interval [1, 2)
@@ -15910,7 +15908,6 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
#[inline]
 #[target_feature(enable = "avx512f")] // generate normal code instead of kmovw
pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
- assert!(mask >= 0);
let r: u16 = mask as u16;
transmute(r)
}