From 972452ab43aed83e9812a3c5b15355c837ac1aa3 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 13:21:20 +0000 Subject: [PATCH 01/25] roundscale_round: ps,pd --- crates/core_arch/avx512f.md | 12 +- crates/core_arch/src/x86/avx512f.rs | 226 +++++++++++ crates/core_arch/src/x86/macros.rs | 524 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 29 ++ 4 files changed, 785 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 110c31eeea..58be7ab632 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -609,8 +609,8 @@ * [x] [`_mm512_mask_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi64&expand=5236) * [ ] [`_mm512_mask_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=5236) * [ ] [`_mm512_mask_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=5236) - * [ ] [`_mm512_mask_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=5236) - * [ ] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236) + * [x] [`_mm512_mask_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=5236) + * [x] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236) * [x] [`_mm512_mask_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=5236) * [x] [`_mm512_mask_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_ps&expand=5236) * [ ] [`_mm512_mask_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=5236) @@ -881,8 +881,8 @@ * [x] [`_mm512_maskz_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi64&expand=5236) * [ ] [`_mm512_maskz_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=5236) * [ ] [`_mm512_maskz_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=5236) - * [ ] [`_mm512_maskz_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=5236) - * [ ] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236) + * [x] [`_mm512_maskz_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=5236) + * [x] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236) * [x] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236) * [x] [`_mm512_maskz_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_ps&expand=5236) * [ ] 
[`_mm512_maskz_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=5236) @@ -1019,8 +1019,8 @@ * [x] [`_mm512_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi64&expand=5236) * [ ] [`_mm512_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=5236) * [ ] [`_mm512_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=5236) - * [ ] [`_mm512_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=5236) - * [ ] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236) + * [x] [`_mm512_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=5236) + * [x] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236) * [x] [`_mm512_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=5236) * [x] [`_mm512_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_ps&expand=5236) * [ ] [`_mm512_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 25b91f7d9e..90b707d93c 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -4845,6 +4845,196 @@ pub unsafe fn _mm512_maskz_getexp_round_pd(k: __mmask8, a: __m512d, sae: i32) -> transmute(r) } +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_round_ps&expand=4790) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(1, 2)] +pub unsafe fn _mm512_roundscale_round_ps(a: __m512, imm8: i32, sae: i32) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscaleps( + a.as_f32x16(), + $imm8, + _mm512_setzero_ps().as_f32x16(), + 0b11111111_11111111, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
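+///
+/// A minimal usage sketch (assuming the `avx512f` target feature is enabled; the values mirror the tests added in this patch):
+///     let a = _mm512_set1_ps(1.1);
+///     // imm8 = 0 keeps no fraction bits; a zero writemask leaves every lane of src untouched
+///     let r = _mm512_mask_roundscale_round_ps(a, 0, a, 0, _MM_FROUND_CUR_DIRECTION); // == _mm512_set1_ps(1.1)
+///     // an all-ones writemask rounds every lane to 1.0
+///     let r = _mm512_mask_roundscale_round_ps(a, 0b11111111_11111111, a, 0, _MM_FROUND_CUR_DIRECTION);
+///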
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_round_ps&expand=4788) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm512_mask_roundscale_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + imm8: i32, + sae: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscaleps(a.as_f32x16(), $imm8, src.as_f32x16(), k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_round_ps&expand=4789) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(2, 3)] +pub unsafe fn _mm512_maskz_roundscale_round_ps( + k: __mmask16, + a: __m512, + imm8: i32, + sae: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscaleps( + a.as_f32x16(), + $imm8, + _mm512_setzero_ps().as_f32x16(), + k, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_round_pd&expand=4787) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(1, 2)] +pub unsafe fn _mm512_roundscale_round_pd(a: __m512d, imm8: i32, sae: i32) -> __m512d { + macro_rules! 
call { + ($imm8:expr, $imm4:expr) => { + vrndscalepd( + a.as_f64x8(), + $imm8, + _mm512_setzero_pd().as_f64x8(), + 0b11111111, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_round_pd&expand=4785) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm512_mask_roundscale_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + imm8: i32, + sae: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscalepd(a.as_f64x8(), $imm8, src.as_f64x8(), k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_round_pd&expand=4786) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(2, 3)] +pub unsafe fn _mm512_maskz_roundscale_round_pd( + k: __mmask8, + a: __m512d, + imm8: i32, + sae: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscalepd( + a.as_f64x8(), + $imm8, + _mm512_setzero_pd().as_f64x8(), + k, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -17206,6 +17396,11 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.getexp.pd.512"] fn vgetexppd(a: f64x8, src: f64x8, m: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.rndscale.ps.512"] + fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"] + fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"] @@ -20630,6 +20825,37 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_roundscale_round_ps(a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_mask_roundscale_round_ps(a, 0, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_ps(1.1); + assert_eq_m512(r, e); + let r = + _mm512_mask_roundscale_round_ps(a, 0b11111111_11111111, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_round_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_maskz_roundscale_round_ps(0, a, 0, _MM_FROUND_CUR_DIRECTION); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = + _mm512_maskz_roundscale_round_ps(0b11111111_11111111, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_ps() { let a = _mm512_set1_ps(10.); diff --git a/crates/core_arch/src/x86/macros.rs b/crates/core_arch/src/x86/macros.rs index 891286df4e..0f9d938eaa 100644 --- a/crates/core_arch/src/x86/macros.rs +++ b/crates/core_arch/src/x86/macros.rs @@ -733,6 +733,530 @@ macro_rules! constify_imm8_sae { }; } +// Two sae parameters. +// This macro enforces that. +#[allow(unused)] +macro_rules! 
constify_imm8_roundscale { + ($imm8:expr, $imm4:expr, $expand:ident) => { + #[allow(overflowing_literals)] + match ($imm8 & 0b11111111, $imm4) { + (0, 4) => $expand!(0, 4), + (0, 8) => $expand!(0, 8), + (1, 4) => $expand!(1, 4), + (1, 8) => $expand!(1, 8), + (2, 4) => $expand!(2, 4), + (2, 8) => $expand!(2, 8), + (3, 4) => $expand!(3, 4), + (3, 8) => $expand!(3, 8), + (4, 4) => $expand!(4, 4), + (4, 8) => $expand!(4, 8), + (5, 4) => $expand!(5, 4), + (5, 8) => $expand!(5, 8), + (6, 4) => $expand!(6, 4), + (6, 8) => $expand!(6, 8), + (7, 4) => $expand!(7, 4), + (7, 8) => $expand!(7, 8), + (8, 4) => $expand!(8, 4), + (8, 8) => $expand!(8, 8), + (9, 4) => $expand!(9, 4), + (9, 8) => $expand!(9, 8), + (10, 4) => $expand!(10, 4), + (10, 8) => $expand!(10, 8), + (11, 4) => $expand!(11, 4), + (11, 8) => $expand!(11, 8), + (12, 4) => $expand!(12, 4), + (12, 8) => $expand!(12, 8), + (13, 4) => $expand!(13, 4), + (13, 8) => $expand!(13, 8), + (14, 4) => $expand!(14, 4), + (14, 8) => $expand!(14, 8), + (15, 4) => $expand!(15, 4), + (15, 8) => $expand!(15, 8), + (16, 4) => $expand!(16, 4), + (16, 8) => $expand!(16, 8), + (17, 4) => $expand!(17, 4), + (17, 8) => $expand!(17, 8), + (18, 4) => $expand!(18, 4), + (18, 8) => $expand!(18, 8), + (19, 4) => $expand!(19, 4), + (19, 8) => $expand!(19, 8), + (20, 4) => $expand!(20, 4), + (20, 8) => $expand!(20, 8), + (21, 4) => $expand!(21, 4), + (21, 8) => $expand!(21, 8), + (22, 4) => $expand!(22, 4), + (22, 8) => $expand!(22, 8), + (23, 4) => $expand!(23, 4), + (23, 8) => $expand!(23, 8), + (24, 4) => $expand!(24, 4), + (24, 8) => $expand!(24, 8), + (25, 4) => $expand!(25, 4), + (25, 8) => $expand!(25, 8), + (26, 4) => $expand!(26, 4), + (26, 8) => $expand!(26, 8), + (27, 4) => $expand!(27, 4), + (27, 8) => $expand!(27, 8), + (28, 4) => $expand!(28, 4), + (28, 8) => $expand!(28, 8), + (29, 4) => $expand!(29, 4), + (29, 8) => $expand!(29, 8), + (30, 4) => $expand!(30, 4), + (30, 8) => $expand!(30, 8), + (31, 4) => $expand!(31, 4), + (31, 8) => $expand!(31, 8), + (32, 4) => $expand!(32, 4), + (32, 8) => $expand!(32, 8), + (33, 4) => $expand!(33, 4), + (33, 8) => $expand!(33, 8), + (34, 4) => $expand!(34, 4), + (34, 8) => $expand!(34, 8), + (35, 4) => $expand!(35, 4), + (35, 8) => $expand!(35, 8), + (36, 4) => $expand!(36, 4), + (36, 8) => $expand!(36, 8), + (37, 4) => $expand!(37, 4), + (37, 8) => $expand!(37, 8), + (38, 4) => $expand!(38, 4), + (38, 8) => $expand!(38, 8), + (39, 4) => $expand!(39, 4), + (39, 8) => $expand!(39, 8), + (40, 4) => $expand!(40, 4), + (40, 8) => $expand!(40, 8), + (41, 4) => $expand!(41, 4), + (41, 8) => $expand!(41, 8), + (42, 4) => $expand!(42, 4), + (42, 8) => $expand!(42, 8), + (43, 4) => $expand!(43, 4), + (43, 8) => $expand!(43, 8), + (44, 4) => $expand!(44, 4), + (44, 8) => $expand!(44, 8), + (45, 4) => $expand!(45, 4), + (45, 8) => $expand!(45, 8), + (46, 4) => $expand!(46, 4), + (46, 8) => $expand!(46, 8), + (47, 4) => $expand!(47, 4), + (47, 8) => $expand!(47, 8), + (48, 4) => $expand!(48, 4), + (48, 8) => $expand!(48, 8), + (49, 4) => $expand!(49, 4), + (49, 8) => $expand!(49, 8), + (50, 4) => $expand!(50, 4), + (50, 8) => $expand!(50, 8), + (51, 4) => $expand!(51, 4), + (51, 8) => $expand!(51, 8), + (52, 4) => $expand!(52, 4), + (52, 8) => $expand!(52, 8), + (53, 4) => $expand!(53, 4), + (53, 8) => $expand!(53, 8), + (54, 4) => $expand!(54, 4), + (54, 8) => $expand!(54, 8), + (55, 4) => $expand!(55, 4), + (55, 8) => $expand!(55, 8), + (56, 4) => $expand!(56, 4), + (56, 8) => $expand!(56, 8), + (57, 4) => $expand!(57, 4), 
+ (57, 8) => $expand!(57, 8), + (58, 4) => $expand!(58, 4), + (58, 8) => $expand!(58, 8), + (59, 4) => $expand!(59, 4), + (59, 8) => $expand!(59, 8), + (60, 4) => $expand!(60, 4), + (60, 8) => $expand!(60, 8), + (61, 4) => $expand!(61, 4), + (61, 8) => $expand!(61, 8), + (62, 4) => $expand!(62, 4), + (62, 8) => $expand!(62, 8), + (63, 4) => $expand!(63, 4), + (63, 8) => $expand!(63, 8), + (64, 4) => $expand!(64, 4), + (64, 8) => $expand!(64, 8), + (65, 4) => $expand!(65, 4), + (65, 8) => $expand!(65, 8), + (66, 4) => $expand!(66, 4), + (66, 8) => $expand!(66, 8), + (67, 4) => $expand!(67, 4), + (67, 8) => $expand!(67, 8), + (68, 4) => $expand!(68, 4), + (68, 8) => $expand!(68, 8), + (69, 4) => $expand!(69, 4), + (69, 8) => $expand!(69, 8), + (70, 4) => $expand!(70, 4), + (70, 8) => $expand!(70, 8), + (71, 4) => $expand!(71, 4), + (71, 8) => $expand!(71, 8), + (72, 4) => $expand!(72, 4), + (72, 8) => $expand!(72, 8), + (73, 4) => $expand!(73, 4), + (73, 8) => $expand!(73, 8), + (74, 4) => $expand!(74, 4), + (74, 8) => $expand!(74, 8), + (75, 4) => $expand!(75, 4), + (75, 8) => $expand!(75, 8), + (76, 4) => $expand!(76, 4), + (76, 8) => $expand!(76, 8), + (77, 4) => $expand!(77, 4), + (77, 8) => $expand!(77, 8), + (78, 4) => $expand!(78, 4), + (78, 8) => $expand!(78, 8), + (79, 4) => $expand!(79, 4), + (79, 8) => $expand!(79, 8), + (80, 4) => $expand!(80, 4), + (80, 8) => $expand!(80, 8), + (81, 4) => $expand!(81, 4), + (81, 8) => $expand!(81, 8), + (82, 4) => $expand!(82, 4), + (82, 8) => $expand!(82, 8), + (83, 4) => $expand!(83, 4), + (83, 8) => $expand!(83, 8), + (84, 4) => $expand!(84, 4), + (84, 8) => $expand!(84, 8), + (85, 4) => $expand!(85, 4), + (85, 8) => $expand!(85, 8), + (86, 4) => $expand!(86, 4), + (86, 8) => $expand!(86, 8), + (87, 4) => $expand!(87, 4), + (87, 8) => $expand!(87, 8), + (88, 4) => $expand!(88, 4), + (88, 8) => $expand!(88, 8), + (89, 4) => $expand!(89, 4), + (89, 8) => $expand!(89, 8), + (90, 4) => $expand!(90, 4), + (90, 8) => $expand!(90, 8), + (91, 4) => $expand!(91, 4), + (91, 8) => $expand!(91, 8), + (92, 4) => $expand!(92, 4), + (92, 8) => $expand!(92, 8), + (93, 4) => $expand!(93, 4), + (93, 8) => $expand!(93, 8), + (94, 4) => $expand!(94, 4), + (94, 8) => $expand!(94, 8), + (95, 4) => $expand!(95, 4), + (95, 8) => $expand!(95, 8), + (96, 4) => $expand!(96, 4), + (96, 8) => $expand!(96, 8), + (97, 4) => $expand!(97, 4), + (97, 8) => $expand!(97, 8), + (98, 4) => $expand!(98, 4), + (98, 8) => $expand!(98, 8), + (99, 4) => $expand!(99, 4), + (99, 8) => $expand!(99, 8), + (100, 4) => $expand!(100, 4), + (100, 8) => $expand!(100, 8), + (101, 4) => $expand!(101, 4), + (101, 8) => $expand!(101, 8), + (102, 4) => $expand!(102, 4), + (102, 8) => $expand!(102, 8), + (103, 4) => $expand!(103, 4), + (103, 8) => $expand!(103, 8), + (104, 4) => $expand!(104, 4), + (104, 8) => $expand!(104, 8), + (105, 4) => $expand!(105, 4), + (105, 8) => $expand!(105, 8), + (106, 4) => $expand!(106, 4), + (106, 8) => $expand!(106, 8), + (107, 4) => $expand!(107, 4), + (107, 8) => $expand!(107, 8), + (108, 4) => $expand!(108, 4), + (108, 8) => $expand!(108, 8), + (109, 4) => $expand!(109, 4), + (109, 8) => $expand!(109, 8), + (110, 4) => $expand!(110, 4), + (110, 8) => $expand!(110, 8), + (111, 4) => $expand!(111, 4), + (111, 8) => $expand!(111, 8), + (112, 4) => $expand!(112, 4), + (112, 8) => $expand!(112, 8), + (113, 4) => $expand!(113, 4), + (113, 8) => $expand!(113, 8), + (114, 4) => $expand!(114, 4), + (114, 8) => $expand!(114, 8), + (115, 4) => $expand!(115, 4), + (115, 8) 
=> $expand!(115, 8), + (116, 4) => $expand!(116, 4), + (116, 8) => $expand!(116, 8), + (117, 4) => $expand!(117, 4), + (117, 8) => $expand!(117, 8), + (118, 4) => $expand!(118, 4), + (118, 8) => $expand!(118, 8), + (119, 4) => $expand!(119, 4), + (119, 8) => $expand!(119, 8), + (120, 4) => $expand!(120, 4), + (120, 8) => $expand!(120, 8), + (121, 4) => $expand!(121, 4), + (121, 8) => $expand!(121, 8), + (122, 4) => $expand!(122, 4), + (122, 8) => $expand!(122, 8), + (123, 4) => $expand!(123, 4), + (123, 8) => $expand!(123, 8), + (124, 4) => $expand!(124, 4), + (124, 8) => $expand!(124, 8), + (125, 4) => $expand!(125, 4), + (125, 8) => $expand!(125, 8), + (126, 4) => $expand!(126, 4), + (126, 8) => $expand!(126, 8), + (127, 4) => $expand!(127, 4), + (127, 8) => $expand!(127, 8), + (128, 4) => $expand!(128, 4), + (128, 8) => $expand!(128, 8), + (129, 4) => $expand!(129, 4), + (129, 8) => $expand!(129, 8), + (130, 4) => $expand!(130, 4), + (130, 8) => $expand!(130, 8), + (131, 4) => $expand!(131, 4), + (131, 8) => $expand!(131, 8), + (132, 4) => $expand!(132, 4), + (132, 8) => $expand!(132, 8), + (133, 4) => $expand!(133, 4), + (133, 8) => $expand!(133, 8), + (134, 4) => $expand!(134, 4), + (134, 8) => $expand!(134, 8), + (135, 4) => $expand!(135, 4), + (135, 8) => $expand!(135, 8), + (136, 4) => $expand!(136, 4), + (136, 8) => $expand!(136, 8), + (137, 4) => $expand!(137, 4), + (137, 8) => $expand!(137, 8), + (138, 4) => $expand!(138, 4), + (138, 8) => $expand!(138, 8), + (139, 4) => $expand!(139, 4), + (139, 8) => $expand!(139, 8), + (140, 4) => $expand!(140, 4), + (140, 8) => $expand!(140, 8), + (141, 4) => $expand!(141, 4), + (141, 8) => $expand!(141, 8), + (142, 4) => $expand!(142, 4), + (142, 8) => $expand!(142, 8), + (143, 4) => $expand!(143, 4), + (143, 8) => $expand!(143, 8), + (144, 4) => $expand!(144, 4), + (144, 8) => $expand!(144, 8), + (145, 4) => $expand!(145, 4), + (145, 8) => $expand!(145, 8), + (146, 4) => $expand!(146, 4), + (146, 8) => $expand!(146, 8), + (147, 4) => $expand!(147, 4), + (147, 8) => $expand!(147, 8), + (148, 4) => $expand!(148, 4), + (148, 8) => $expand!(148, 8), + (149, 4) => $expand!(149, 4), + (149, 8) => $expand!(149, 8), + (150, 4) => $expand!(150, 4), + (150, 8) => $expand!(150, 8), + (151, 4) => $expand!(151, 4), + (151, 8) => $expand!(151, 8), + (152, 4) => $expand!(152, 4), + (152, 8) => $expand!(152, 8), + (153, 4) => $expand!(153, 4), + (153, 8) => $expand!(153, 8), + (154, 4) => $expand!(154, 4), + (154, 8) => $expand!(154, 8), + (155, 4) => $expand!(155, 4), + (155, 8) => $expand!(155, 8), + (156, 4) => $expand!(156, 4), + (156, 8) => $expand!(156, 8), + (157, 4) => $expand!(157, 4), + (157, 8) => $expand!(157, 8), + (158, 4) => $expand!(158, 4), + (158, 8) => $expand!(158, 8), + (159, 4) => $expand!(159, 4), + (159, 8) => $expand!(159, 8), + (160, 4) => $expand!(160, 4), + (160, 8) => $expand!(160, 8), + (161, 4) => $expand!(161, 4), + (161, 8) => $expand!(161, 8), + (162, 4) => $expand!(162, 4), + (162, 8) => $expand!(162, 8), + (163, 4) => $expand!(163, 4), + (163, 8) => $expand!(163, 8), + (164, 4) => $expand!(164, 4), + (164, 8) => $expand!(164, 8), + (165, 4) => $expand!(165, 4), + (165, 8) => $expand!(165, 8), + (166, 4) => $expand!(166, 4), + (166, 8) => $expand!(166, 8), + (167, 4) => $expand!(167, 4), + (167, 8) => $expand!(167, 8), + (168, 4) => $expand!(168, 4), + (168, 8) => $expand!(168, 8), + (169, 4) => $expand!(169, 4), + (169, 8) => $expand!(169, 8), + (170, 4) => $expand!(170, 4), + (170, 8) => $expand!(170, 8), + (171, 4) => 
$expand!(171, 4), + (171, 8) => $expand!(171, 8), + (172, 4) => $expand!(172, 4), + (172, 8) => $expand!(172, 8), + (173, 4) => $expand!(173, 4), + (173, 8) => $expand!(173, 8), + (174, 4) => $expand!(174, 4), + (174, 8) => $expand!(174, 8), + (175, 4) => $expand!(175, 4), + (175, 8) => $expand!(175, 8), + (176, 4) => $expand!(176, 4), + (176, 8) => $expand!(176, 8), + (177, 4) => $expand!(177, 4), + (177, 8) => $expand!(177, 8), + (178, 4) => $expand!(178, 4), + (178, 8) => $expand!(178, 8), + (179, 4) => $expand!(179, 4), + (179, 8) => $expand!(179, 8), + (180, 4) => $expand!(180, 4), + (180, 8) => $expand!(180, 8), + (181, 4) => $expand!(181, 4), + (181, 8) => $expand!(181, 8), + (182, 4) => $expand!(182, 4), + (182, 8) => $expand!(182, 8), + (183, 4) => $expand!(183, 4), + (183, 8) => $expand!(183, 8), + (184, 4) => $expand!(184, 4), + (184, 8) => $expand!(184, 8), + (185, 4) => $expand!(185, 4), + (185, 8) => $expand!(185, 8), + (186, 4) => $expand!(186, 4), + (186, 8) => $expand!(186, 8), + (187, 4) => $expand!(187, 4), + (187, 8) => $expand!(187, 8), + (188, 4) => $expand!(188, 4), + (188, 8) => $expand!(188, 8), + (189, 4) => $expand!(189, 4), + (189, 8) => $expand!(189, 8), + (190, 4) => $expand!(190, 4), + (190, 8) => $expand!(190, 8), + (191, 4) => $expand!(191, 4), + (191, 8) => $expand!(191, 8), + (192, 4) => $expand!(192, 4), + (192, 8) => $expand!(192, 8), + (193, 4) => $expand!(193, 4), + (193, 8) => $expand!(193, 8), + (194, 4) => $expand!(194, 4), + (194, 8) => $expand!(194, 8), + (195, 4) => $expand!(195, 4), + (195, 8) => $expand!(195, 8), + (196, 4) => $expand!(196, 4), + (196, 8) => $expand!(196, 8), + (197, 4) => $expand!(197, 4), + (197, 8) => $expand!(197, 8), + (198, 4) => $expand!(198, 4), + (198, 8) => $expand!(198, 8), + (199, 4) => $expand!(199, 4), + (199, 8) => $expand!(199, 8), + (200, 4) => $expand!(200, 4), + (200, 8) => $expand!(200, 8), + (201, 4) => $expand!(201, 4), + (201, 8) => $expand!(201, 8), + (202, 4) => $expand!(202, 4), + (202, 8) => $expand!(202, 8), + (203, 4) => $expand!(203, 4), + (203, 8) => $expand!(203, 8), + (204, 4) => $expand!(204, 4), + (204, 8) => $expand!(204, 8), + (205, 4) => $expand!(205, 4), + (205, 8) => $expand!(205, 8), + (206, 4) => $expand!(206, 4), + (206, 8) => $expand!(206, 8), + (207, 4) => $expand!(207, 4), + (207, 8) => $expand!(207, 8), + (208, 4) => $expand!(208, 4), + (208, 8) => $expand!(208, 8), + (209, 4) => $expand!(209, 4), + (209, 8) => $expand!(209, 8), + (210, 4) => $expand!(210, 4), + (210, 8) => $expand!(210, 8), + (211, 4) => $expand!(211, 4), + (211, 8) => $expand!(211, 8), + (212, 4) => $expand!(212, 4), + (212, 8) => $expand!(212, 8), + (213, 4) => $expand!(213, 4), + (213, 8) => $expand!(213, 8), + (214, 4) => $expand!(214, 4), + (214, 8) => $expand!(214, 8), + (215, 4) => $expand!(215, 4), + (215, 8) => $expand!(215, 8), + (216, 4) => $expand!(216, 4), + (216, 8) => $expand!(216, 8), + (217, 4) => $expand!(217, 4), + (217, 8) => $expand!(217, 8), + (218, 4) => $expand!(218, 4), + (218, 8) => $expand!(218, 8), + (219, 4) => $expand!(219, 4), + (219, 8) => $expand!(219, 8), + (220, 4) => $expand!(220, 4), + (220, 8) => $expand!(220, 8), + (221, 4) => $expand!(221, 4), + (221, 8) => $expand!(221, 8), + (222, 4) => $expand!(222, 4), + (222, 8) => $expand!(222, 8), + (223, 4) => $expand!(223, 4), + (223, 8) => $expand!(223, 8), + (224, 4) => $expand!(224, 4), + (224, 8) => $expand!(224, 8), + (225, 4) => $expand!(225, 4), + (225, 8) => $expand!(225, 8), + (226, 4) => $expand!(226, 4), + (226, 8) => 
$expand!(226, 8), + (227, 4) => $expand!(227, 4), + (227, 8) => $expand!(227, 8), + (228, 4) => $expand!(228, 4), + (228, 8) => $expand!(228, 8), + (229, 4) => $expand!(229, 4), + (229, 8) => $expand!(229, 8), + (230, 4) => $expand!(230, 4), + (230, 8) => $expand!(230, 8), + (231, 4) => $expand!(231, 4), + (231, 8) => $expand!(231, 8), + (232, 4) => $expand!(232, 4), + (232, 8) => $expand!(232, 8), + (233, 4) => $expand!(233, 4), + (233, 8) => $expand!(233, 8), + (234, 4) => $expand!(234, 4), + (234, 8) => $expand!(234, 8), + (235, 4) => $expand!(235, 4), + (235, 8) => $expand!(235, 8), + (236, 4) => $expand!(236, 4), + (236, 8) => $expand!(236, 8), + (237, 4) => $expand!(237, 4), + (237, 8) => $expand!(237, 8), + (238, 4) => $expand!(238, 4), + (238, 8) => $expand!(238, 8), + (239, 4) => $expand!(239, 4), + (239, 8) => $expand!(239, 8), + (240, 4) => $expand!(240, 4), + (240, 8) => $expand!(240, 8), + (241, 4) => $expand!(241, 4), + (241, 8) => $expand!(241, 8), + (242, 4) => $expand!(242, 4), + (242, 8) => $expand!(242, 8), + (243, 4) => $expand!(243, 4), + (243, 8) => $expand!(243, 8), + (244, 4) => $expand!(244, 4), + (244, 8) => $expand!(244, 8), + (245, 4) => $expand!(245, 4), + (245, 8) => $expand!(245, 8), + (246, 4) => $expand!(246, 4), + (246, 8) => $expand!(246, 8), + (247, 4) => $expand!(247, 4), + (247, 8) => $expand!(247, 8), + (248, 4) => $expand!(248, 4), + (248, 8) => $expand!(248, 8), + (249, 4) => $expand!(249, 4), + (249, 8) => $expand!(249, 8), + (250, 4) => $expand!(250, 4), + (250, 8) => $expand!(250, 8), + (251, 4) => $expand!(251, 4), + (251, 8) => $expand!(251, 8), + (252, 4) => $expand!(252, 4), + (252, 8) => $expand!(252, 8), + (253, 4) => $expand!(253, 4), + (253, 8) => $expand!(253, 8), + (254, 4) => $expand!(254, 4), + (254, 8) => $expand!(254, 8), + (255, 4) => $expand!(255, 4), + (255, 8) => $expand!(255, 8), + (_, _) => panic!("Invalid sae value"), + } + }; +} + #[cfg(test)] macro_rules! 
assert_approx_eq { ($a:expr, $b:expr, $eps:expr) => {{ diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 1eca091c6d..6efe80b62a 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -2686,6 +2686,35 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_round_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_roundscale_round_pd(a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_round_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_mask_roundscale_round_pd(a, 0, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(1.1); + assert_eq_m512d(r, e); + let r = _mm512_mask_roundscale_round_pd(a, 0b11111111, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_round_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_maskz_roundscale_round_pd(0, a, 0, _MM_FROUND_CUR_DIRECTION); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_roundscale_round_pd(0b11111111, a, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_pd() { let a = _mm512_set1_pd(10.); From 751acda5253fc0dcf3e20c1c6cf1927bb5c0c0bd Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 13:37:24 +0000 Subject: [PATCH 02/25] roundscale: ps,pd --- crates/core_arch/avx512f.md | 12 +- crates/core_arch/src/x86/avx512f.rs | 208 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 29 ++++ 3 files changed, 243 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 58be7ab632..50f8179955 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -607,8 +607,8 @@ * [x] [`_mm512_mask_ror_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi64&expand=5236) * [x] [`_mm512_mask_rorv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi32&expand=5236) * [x] [`_mm512_mask_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi64&expand=5236) - * [ ] [`_mm512_mask_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=5236) - * [ ] [`_mm512_mask_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=5236) + * [x] [`_mm512_mask_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=5236) + * [x] [`_mm512_mask_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=5236) * [x] [`_mm512_mask_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=5236) * [x] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236) * [x] [`_mm512_mask_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=5236) @@ -879,8 +879,8 @@ * [x] 
[`_mm512_maskz_ror_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi64&expand=5236) * [x] [`_mm512_maskz_rorv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi32&expand=5236) * [x] [`_mm512_maskz_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi64&expand=5236) - * [ ] [`_mm512_maskz_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=5236) - * [ ] [`_mm512_maskz_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=5236) + * [x] [`_mm512_maskz_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=5236) + * [x] [`_mm512_maskz_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=5236) * [x] [`_mm512_maskz_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=5236) * [x] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236) * [x] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236) @@ -1017,8 +1017,8 @@ * [x] [`_mm512_ror_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi64&expand=5236) * [x] [`_mm512_rorv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi32&expand=5236) * [x] [`_mm512_rorv_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi64&expand=5236) - * [ ] [`_mm512_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=5236) - * [ ] [`_mm512_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=5236) + * [x] [`_mm512_roundscale_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=5236) + * [x] [`_mm512_roundscale_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=5236) * [x] [`_mm512_roundscale_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=5236) * [x] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236) * [x] [`_mm512_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 90b707d93c..1077f18c44 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -1969,6 +1969,185 @@ pub unsafe fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d { )) } +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst. 
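+///
+/// A minimal usage sketch (assuming the `avx512f` target feature is enabled; the values mirror the tests added in this patch):
+///     let a = _mm512_set1_ps(1.1);
+///     let r = _mm512_roundscale_ps(a, 0); // imm8 = 0 keeps no fraction bits, so r == _mm512_set1_ps(1.0)
+///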
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_ps&expand=4784) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_roundscale_ps(a: __m512, imm8: i32) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vrndscaleps( + a.as_f32x16(), + $imm8, + _mm512_setzero_ps().as_f32x16(), + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_ps&expand=4782) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_roundscale_ps(src: __m512, k: __mmask16, a: __m512, imm8: i32) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vrndscaleps( + a.as_f32x16(), + $imm8, + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_ps&expand=4783) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_roundscale_ps(k: __mmask16, a: __m512, imm8: i32) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vrndscaleps( + a.as_f32x16(), + $imm8, + _mm512_setzero_ps().as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst. 
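+///
+/// A minimal usage sketch (assuming the `avx512f` target feature is enabled; the values mirror the x86_64 tests added in this patch):
+///     let a = _mm512_set1_pd(1.1);
+///     let r = _mm512_roundscale_pd(a, 0); // every lane rounds to 1.0
+///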
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_pd&expand=4775) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))] +#[rustc_args_required_const(1)] +pub unsafe fn _mm512_roundscale_pd(a: __m512d, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vrndscalepd( + a.as_f64x8(), + $imm8, + _mm512_setzero_pd().as_f64x8(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_pd&expand=4773) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_mask_roundscale_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + imm8: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vrndscalepd( + a.as_f64x8(), + $imm8, + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_pd&expand=4774) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vrndscalepd( + a.as_f64x8(), + $imm8, + _mm512_setzero_pd().as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -19470,6 +19649,35 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_roundscale_ps(a, 0); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_mask_roundscale_ps(a, 0, a, 0); + let e = _mm512_set1_ps(1.1); + assert_eq_m512(r, e); + let r = _mm512_mask_roundscale_ps(a, 0b11111111_11111111, a, 0); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_ps() { + let a = _mm512_set1_ps(1.1); + let r = _mm512_maskz_roundscale_ps(0, a, 0); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_roundscale_ps(0b11111111_11111111, a, 0); + let e = _mm512_set1_ps(1.0); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_ps() { let a = _mm512_set1_ps(10.); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 6efe80b62a..b653a55bcb 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -995,6 +995,35 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_roundscale_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_roundscale_pd(a, 0); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_roundscale_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_mask_roundscale_pd(a, 0, a, 0); + let e = _mm512_set1_pd(1.1); + assert_eq_m512d(r, e); + let r = _mm512_mask_roundscale_pd(a, 0b11111111, a, 0); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_roundscale_pd() { + let a = _mm512_set1_pd(1.1); + let r = _mm512_maskz_roundscale_pd(0, a, 0); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_roundscale_pd(0b11111111, a, 0); + let e = _mm512_set1_pd(1.0); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_pd() { let a = _mm512_set1_pd(10.); From b9800ad39407eeb105fa5cb5f9475ae3af5ed0c7 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 14:07:20 +0000 Subject: [PATCH 03/25] scalef_round: ps,pd; scalef: ps,pd --- crates/core_arch/avx512f.md | 24 +- crates/core_arch/src/x86/avx512f.rs | 373 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 75 +++++ 3 files changed, 460 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 50f8179955..ab9e4d9545 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -613,10 +613,10 @@ * [x] [`_mm512_mask_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=5236) * [x] [`_mm512_mask_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=5236) * [x] [`_mm512_mask_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_ps&expand=5236) - * [ ] 
[`_mm512_mask_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=5236) - * [ ] [`_mm512_mask_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=5236) - * [ ] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236) - * [ ] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236) + * [x] [`_mm512_mask_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=5236) + * [x] [`_mm512_mask_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=5236) + * [x] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236) + * [x] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236) * [ ] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) * [ ] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236) * [x] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236) @@ -885,10 +885,10 @@ * [x] [`_mm512_maskz_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=5236) * [x] [`_mm512_maskz_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=5236) * [x] [`_mm512_maskz_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_ps&expand=5236) - * [ ] [`_mm512_maskz_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=5236) - * [ ] [`_mm512_maskz_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=5236) - * [ ] [`_mm512_maskz_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=5236) - * [ ] [`_mm512_maskz_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=5236) + * [x] [`_mm512_maskz_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=5236) + * [x] [`_mm512_maskz_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=5236) + * [x] [`_mm512_maskz_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=5236) + * [x] [`_mm512_maskz_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=5236) * [ ] [`_mm512_maskz_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=5236) * [ ] [`_mm512_maskz_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=5236) * [x] 
[`_mm512_maskz_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5236) @@ -1023,10 +1023,10 @@ * [x] [`_mm512_roundscale_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=5236) * [x] [`_mm512_rsqrt14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=5236) * [x] [`_mm512_rsqrt14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_ps&expand=5236) - * [ ] [`_mm512_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=5236) - * [ ] [`_mm512_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ps&expand=5236) - * [ ] [`_mm512_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_pd&expand=5236) - * [ ] [`_mm512_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ps&expand=5236) + * [x] [`_mm512_scalef_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=5236) + * [x] [`_mm512_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ps&expand=5236) + * [x] [`_mm512_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_pd&expand=5236) + * [x] [`_mm512_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ps&expand=5236) * [x] [`_mm512_set1_epi16`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi16&expand=5236) * [x] [`_mm512_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi32&expand=5236) * [x] [`_mm512_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi64&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 1077f18c44..0a91c92706 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2148,6 +2148,102 @@ pub unsafe fn _mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d, imm8: i32) -> transmute(r) } +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_ps&expand=4883) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub unsafe fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + _mm512_setzero_ps().as_f32x16(), + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). 
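+///
+/// A minimal usage sketch (assuming the `avx512f` target feature is enabled; the values are illustrative and rely on scalef computing a * 2^floor(b) per lane):
+///     let a = _mm512_set1_ps(1.);
+///     let b = _mm512_set1_ps(3.);
+///     let r = _mm512_mask_scalef_ps(a, 0b11111111_11111111, a, b); // every lane becomes 8.0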
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_ps&expand=4881) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub unsafe fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + src.as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_ps&expand=4882) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps))] +pub unsafe fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 { + transmute(vscalefps( + a.as_f32x16(), + b.as_f32x16(), + _mm512_setzero_ps().as_f32x16(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_pd&expand=4874) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub unsafe fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + _mm512_setzero_pd().as_f64x8(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_pd&expand=4872) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub unsafe fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + src.as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_pd&expand=4873) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd))] +pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d { + transmute(vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + _mm512_setzero_pd().as_f64x8(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -5214,6 +5310,196 @@ pub unsafe fn _mm512_maskz_roundscale_round_pd( transmute(r) } +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_round_ps&expand=4889) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_scalef_round_ps(a: __m512, b: __m512, rounding: i32) -> __m512 { + macro_rules! call { + ($imm4:expr) => { + vscalefps( + a.as_f32x16(), + b.as_f32x16(), + _mm512_setzero_ps().as_f32x16(), + 0b11111111_11111111, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_round_ps&expand=4887) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_scalef_round_ps( + src: __m512, + k: __mmask16, + a: __m512, + b: __m512, + rounding: i32, +) -> __m512 { + macro_rules! call { + ($imm4:expr) => { + vscalefps(a.as_f32x16(), b.as_f32x16(), src.as_f32x16(), k, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
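// Illustration only, not one of the patch hunks: the `rounding` argument of the *_round intrinsics
// added here is a rounding-direction constant OR'ed with _MM_FROUND_NO_EXC, or
// _MM_FROUND_CUR_DIRECTION on its own, exactly as the tests later in this patch pass it.
// A minimal call-site sketch (hypothetical function name; assumes an AVX512F-capable CPU):
#[target_feature(enable = "avx512f")]
unsafe fn scalef_round_ps_sketch() -> __m512 {
    let a = _mm512_set1_ps(1.);
    let b = _mm512_set1_ps(3.);
    // round to nearest with exceptions suppressed; every lane becomes 1.0 * 2^3 = 8.0
    _mm512_scalef_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
}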
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_round_ps&expand=4888) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefps, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_scalef_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + rounding: i32, +) -> __m512 { + macro_rules! call { + ($imm4:expr) => { + vscalefps( + a.as_f32x16(), + b.as_f32x16(), + _mm512_setzero_ps().as_f32x16(), + k, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_round_pd&expand=4886) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm512_scalef_round_pd(a: __m512d, b: __m512d, rounding: i32) -> __m512d { + macro_rules! call { + ($imm4:expr) => { + vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + _mm512_setzero_pd().as_f64x8(), + 0b11111111, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_round_pd&expand=4884) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_scalef_round_pd( + src: __m512d, + k: __mmask8, + a: __m512d, + b: __m512d, + rounding: i32, +) -> __m512d { + macro_rules! 
call { + ($imm4:expr) => { + vscalefpd(a.as_f64x8(), b.as_f64x8(), src.as_f64x8(), k, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_round_pd&expand=4885) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_maskz_scalef_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + rounding: i32, +) -> __m512d { + macro_rules! call { + ($imm4:expr) => { + vscalefpd( + a.as_f64x8(), + b.as_f64x8(), + _mm512_setzero_pd().as_f64x8(), + k, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -17579,6 +17865,10 @@ extern "C" { fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16; #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"] fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.scalef.ps.512"] + fn vscalefps(a: f32x16, b: f32x16, src: f32x16, mask: u16, rounding: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"] + fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8; #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; @@ -19678,6 +19968,41 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_scalef_ps(a, b); + let e = _mm512_set1_ps(8.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_mask_scalef_ps(a, 0, a, b); + assert_eq_m512(r, a); + let r = _mm512_mask_scalef_ps(a, 0b11111111_00000000, a, b); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_maskz_scalef_ps(0, a, b); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_scalef_ps(0b11111111_00000000, a, b); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_ps() { let a = _mm512_set1_ps(10.); @@ -21064,6 +21389,54 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = _mm512_scalef_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm512_set1_ps(8.); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = + _mm512_mask_scalef_round_ps(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512(r, a); + let r = _mm512_mask_scalef_round_ps( + a, + 0b11111111_00000000, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_round_ps() { + let a = _mm512_set1_ps(1.); + let b = _mm512_set1_ps(3.); + let r = + _mm512_maskz_scalef_round_ps(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_scalef_round_ps( + 0b11111111_00000000, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_set_ps( + 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_ps() { let a = _mm512_set1_ps(10.); diff --git 
a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index b653a55bcb..f6f80a323c 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -1024,6 +1024,37 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_scalef_pd(a, b); + let e = _mm512_set1_pd(8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_mask_scalef_pd(a, 0, a, b); + assert_eq_m512d(r, a); + let r = _mm512_mask_scalef_pd(a, 0b11110000, a, b); + let e = _mm512_set_pd(8., 8., 8., 8., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_maskz_scalef_pd(0, a, b); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_scalef_pd(0b11110000, a, b); + let e = _mm512_set_pd(8., 8., 8., 8., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_pd() { let a = _mm512_set1_pd(10.); @@ -2744,6 +2775,50 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_scalef_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = _mm512_scalef_round_pd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm512_set1_pd(8.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_scalef_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = + _mm512_mask_scalef_round_pd(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512d(r, a); + let r = _mm512_mask_scalef_round_pd( + a, + 0b11110000, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_set_pd(8., 8., 8., 8., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_scalef_round_pd() { + let a = _mm512_set1_pd(1.); + let b = _mm512_set1_pd(3.); + let r = + _mm512_maskz_scalef_round_pd(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_scalef_round_pd( + 0b11110000, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm512_set_pd(8., 8., 8., 8., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_pd() { let a = _mm512_set1_pd(10.); From a6274653d84895583c23f9e84ef853144ef59b67 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 15:49:39 +0000 Subject: [PATCH 04/25] reduce_mul: epi64, reduce_max: epi64,epu64, reduce_min: epi64,epu64, reduce_and: epi64, reduce_or: epi64 --- crates/core_arch/avx512f.md | 32 ++-- crates/core_arch/src/x86/avx512f.rs | 178 ++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 105 +++++++++++++ crates/stdarch-verify/tests/x86-intel.rs | 17 ++- 4 files changed, 310 insertions(+), 22 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index ab9e4d9545..4184997d23 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -576,29 +576,29 @@ * [x] 
[`_mm512_mask_rcp14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_pd&expand=5236) * [x] [`_mm512_mask_rcp14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_ps&expand=5236) * [x] [`_mm512_mask_reduce_add_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi64&expand=5236) * [x] [`_mm512_mask_reduce_add_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_pd&expand=5236) * [x] [`_mm512_mask_reduce_add_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_ps&expand=5236) * [x] [`_mm512_mask_reduce_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi64&expand=5236) * [x] [`_mm512_mask_reduce_max_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi64&expand=5236) * [x] [`_mm512_mask_reduce_max_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu32&expand=5236) - * [ ] [`_mm512_mask_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu64&expand=5236) + * [x] [`_mm512_mask_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu64&expand=5236) * [x] [`_mm512_mask_reduce_max_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_pd&expand=5236) * [x] [`_mm512_mask_reduce_max_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_ps&expand=5236) * [x] [`_mm512_mask_reduce_min_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi64&expand=5236) * [x] [`_mm512_mask_reduce_min_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu32&expand=5236) - * [ ] [`_mm512_mask_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu64&expand=5236) + * [x] [`_mm512_mask_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu64&expand=5236) * [x] 
[`_mm512_mask_reduce_min_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_pd&expand=5236) * [x] [`_mm512_mask_reduce_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_ps&expand=5236) * [x] [`_mm512_mask_reduce_mul_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi64&expand=5236) * [x] [`_mm512_mask_reduce_mul_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_pd&expand=5236) * [x] [`_mm512_mask_reduce_mul_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_ps&expand=5236) * [x] [`_mm512_mask_reduce_or_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi32&expand=5236) - * [ ] [`_mm512_mask_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi64&expand=5236) + * [x] [`_mm512_mask_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi64&expand=5236) * [x] [`_mm512_mask_rol_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi32&expand=5236) * [x] [`_mm512_mask_rol_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi64&expand=5236) * [x] [`_mm512_mask_rolv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rolv_epi32&expand=5236) @@ -986,29 +986,29 @@ * [x] [`_mm512_rcp14_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_pd&expand=5236) * [x] [`_mm512_rcp14_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_ps&expand=5236) * [x] [`_mm512_reduce_add_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi32&expand=5236) - * [ ] [`_mm512_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi64&expand=5236) + * [x] [`_mm512_reduce_add_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi64&expand=5236) * [x] [`_mm512_reduce_add_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_pd&expand=5236) * [x] [`_mm512_reduce_add_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ps&expand=5236) * [x] [`_mm512_reduce_and_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi32&expand=5236) - * [ ] [`_mm512_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi64&expand=5236) + * [x] [`_mm512_reduce_and_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi64&expand=5236) * [x] [`_mm512_reduce_max_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi32&expand=5236) - * [ ] [`_mm512_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi64&expand=5236) + * [x] 
[`_mm512_reduce_max_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi64&expand=5236) * [x] [`_mm512_reduce_max_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu32&expand=5236) - * [ ] [`_mm512_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu64&expand=5236) + * [x] [`_mm512_reduce_max_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu64&expand=5236) * [x] [`_mm512_reduce_max_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_pd&expand=5236) * [x] [`_mm512_reduce_max_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ps&expand=5236) * [x] [`_mm512_reduce_min_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi32&expand=5236) - * [ ] [`_mm512_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi64&expand=5236) + * [x] [`_mm512_reduce_min_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi64&expand=5236) * [x] [`_mm512_reduce_min_epu32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu32&expand=5236) - * [ ] [`_mm512_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu64&expand=5236) + * [x] [`_mm512_reduce_min_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu64&expand=5236) * [x] [`_mm512_reduce_min_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_pd&expand=5236) * [x] [`_mm512_reduce_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ps&expand=5236) * [x] [`_mm512_reduce_mul_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi32&expand=5236) - * [ ] [`_mm512_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi64&expand=5236) + * [x] [`_mm512_reduce_mul_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi64&expand=5236) * [x] [`_mm512_reduce_mul_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_pd&expand=5236) * [x] [`_mm512_reduce_mul_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ps&expand=5236) * [x] [`_mm512_reduce_or_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi32&expand=5236) - * [ ] [`_mm512_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi64&expand=5236) + * [x] [`_mm512_reduce_or_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi64&expand=5236) * [x] [`_mm512_rol_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi32&expand=5236) * [x] [`_mm512_rol_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi64&expand=5236) * [x] [`_mm512_rolv_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rolv_epi32&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 0a91c92706..95c3794b09 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ 
b/crates/core_arch/src/x86/avx512f.rs @@ -16843,6 +16843,19 @@ pub unsafe fn _mm512_reduce_add_epi32(a: __m512i) -> i32 { simd_reduce_add_unordered(a.as_i32x16()) } +/// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi32&expand=4555) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 { + simd_reduce_add_unordered(simd_select_bitmask( + k, + a.as_i32x16(), + _mm512_setzero_si512().as_i32x16(), + )) +} + /// Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_add_epi64&expand=4558) @@ -16852,16 +16865,16 @@ pub unsafe fn _mm512_reduce_add_epi64(a: __m512i) -> i64 { simd_reduce_add_unordered(a.as_i64x8()) } -/// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a. +/// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi32&expand=4555) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi64&expand=4557) #[inline] #[target_feature(enable = "avx512f")] -pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 { +pub unsafe fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 { simd_reduce_add_unordered(simd_select_bitmask( k, - a.as_i32x16(), - _mm512_setzero_si512().as_i32x16(), + a.as_i64x8(), + _mm512_setzero_si512().as_i64x8(), )) } @@ -16931,6 +16944,28 @@ pub unsafe fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 { )) } +/// Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_mul_epi64&expand=4602) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 { + simd_reduce_mul_unordered(a.as_i64x8()) +} + +/// Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_mul_epi64&expand=4601) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 { + simd_reduce_mul_unordered(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(1).as_i64x8(), + )) +} + /// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_mul_ps&expand=4606) @@ -16997,6 +17032,28 @@ pub unsafe fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 { )) } +/// Reduce the packed signed 64-bit integers in a by maximum. Returns the maximum of all elements in a. 
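// Illustration only, not one of the patch hunks: the masked reductions above substitute the
// operation's identity element into the lanes that k leaves inactive before folding (0 for add,
// 1 for multiply), so only the active lanes influence the result. A scalar model of
// _mm512_mask_reduce_add_epi64 under the assumption that bit i of k selects lane i
// (hypothetical helper, not part of the crate):
fn mask_reduce_add_epi64_model(k: u8, lanes: [i64; 8]) -> i64 {
    lanes
        .iter()
        .enumerate()
        // inactive lanes contribute the additive identity, 0
        .map(|(i, &x)| if k & (1 << i) != 0 { x } else { 0 })
        .sum()
}
// e.g. mask_reduce_add_epi64_model(0b11110000, [1; 8]) == 4, matching test_mm512_mask_reduce_add_epi64.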
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epi64&expand=4578) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_max_epi64(a: __m512i) -> i64 { + simd_reduce_max(a.as_i64x8()) +} + +/// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_epi64&expand=4577) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 { + simd_reduce_max(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(i64::MIN).as_i64x8(), + )) +} + /// Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epu32&expand=4580) @@ -17019,6 +17076,28 @@ pub unsafe fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 { )) } +/// Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epu64&expand=4582) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_max_epu64(a: __m512i) -> u64 { + simd_reduce_max(a.as_u64x8()) +} + +/// Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_epu64&expand=4581) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 { + simd_reduce_max(simd_select_bitmask( + k, + a.as_u64x8(), + _mm512_set1_epi64(0).as_u64x8(), + )) +} + /// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_ps&expand=4586) @@ -17085,6 +17164,28 @@ pub unsafe fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 { )) } +/// Reduce the packed signed 64-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epi64&expand=4590) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_min_epi64(a: __m512i) -> i64 { + simd_reduce_min(a.as_i64x8()) +} + +/// Reduce the packed signed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_epi64&expand=4589) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 { + simd_reduce_min(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(i64::MAX).as_i64x8(), + )) +} + /// Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a.
/// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epu32&expand=4592) @@ -17107,6 +17208,28 @@ pub unsafe fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 { )) } +/// Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epu64&expand=4594) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_min_epu64(a: __m512i) -> u64 { + simd_reduce_min(a.as_u64x8()) +} + +/// Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_epu64&expand=4593) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 { + simd_reduce_min(simd_select_bitmask( + k, + a.as_u64x8(), + _mm512_set1_epi64(-1).as_u64x8(), + )) +} + /// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_ps&expand=4598) @@ -17191,6 +17314,29 @@ pub unsafe fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 { )) } +/// Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_and_epi64&expand=4566) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_and_epi64(a: __m512i) -> i64 { + simd_reduce_and(a.as_i64x8()) +} + +/// Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_and_epi64&expand=4565) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 { + simd_reduce_and(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_set1_epi64(-1) + .as_i64x8(), + )) +} + /// Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_or_epi32&expand=4608) @@ -17213,6 +17359,28 @@ pub unsafe fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 { )) } +/// Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_or_epi64&expand=4610) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_reduce_or_epi64(a: __m512i) -> i64 { + simd_reduce_or(a.as_i64x8()) +} + +/// Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_or_epi64&expand=4609) +#[inline] +#[target_feature(enable = "avx512f")] +pub unsafe fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 { + simd_reduce_or(simd_select_bitmask( + k, + a.as_i64x8(), + _mm512_setzero_si512().as_i64x8(), + )) +} + /// Returns vector of type `__m512d` with undefined elements. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index f6f80a323c..92c300ca17 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -5814,6 +5814,13 @@ mod tests { assert_eq!(8, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_add_epi64() { + let a = _mm512_set1_epi64(1); + let e: i64 = _mm512_mask_reduce_add_epi64(0b11110000, a); + assert_eq!(4, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_add_pd() { let a = _mm512_set1_pd(1.); @@ -5828,6 +5835,20 @@ mod tests { assert_eq!(4., e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_mul_epi64() { + let a = _mm512_set1_epi64(2); + let e: i64 = _mm512_reduce_mul_epi64(a); + assert_eq!(256, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_mul_epi64() { + let a = _mm512_set1_epi64(2); + let e: i64 = _mm512_mask_reduce_mul_epi64(0b11110000, a); + assert_eq!(16, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_mul_pd() { let a = _mm512_set1_pd(2.); @@ -5842,6 +5863,34 @@ mod tests { assert_eq!(16., e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_reduce_max_epi64(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_mask_reduce_max_epi64(0b11110000, a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_max_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = _mm512_reduce_max_epu64(a); + assert_eq!(7, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_max_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = _mm512_mask_reduce_max_epu64(0b11110000, a); + assert_eq!(3, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_max_pd() { let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); @@ -5856,6 +5905,34 @@ mod tests { assert_eq!(3., e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_reduce_min_epi64(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_epi64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: i64 = _mm512_mask_reduce_min_epi64(0b11110000, a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_min_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = _mm512_reduce_min_epu64(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_min_epu64() { + let a = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + let e: u64 = 
_mm512_mask_reduce_min_epu64(0b11110000, a); + assert_eq!(0, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_min_pd() { let a = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.); @@ -5870,6 +5947,34 @@ mod tests { assert_eq!(0., e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_and_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_reduce_and_epi64(a); + assert_eq!(0, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_and_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_mask_reduce_and_epi64(0b11110000, a); + assert_eq!(1, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_reduce_or_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_reduce_or_epi64(a); + assert_eq!(3, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_reduce_or_epi64() { + let a = _mm512_set_epi64(1, 1, 1, 1, 2, 2, 2, 2); + let e: i64 = _mm512_mask_reduce_or_epi64(0b11110000, a); + assert_eq!(1, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_extractf64x4_pd() { let a = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.); diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index b9aba9461a..d5096a6903 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -577,7 +577,22 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { | "_mm512_setr4_epi64" | "_mm512_set_epi64" | "_mm512_setr_epi64" - | "_mm512_reduce_add_epi64" => true, + | "_mm512_reduce_add_epi64" + | "_mm512_mask_reduce_add_epi64" + | "_mm512_reduce_mul_epi64" + | "_mm512_mask_reduce_mul_epi64" + | "_mm512_reduce_max_epi64" + | "_mm512_mask_reduce_max_epi64" + | "_mm512_reduce_max_epu64" + | "_mm512_mask_reduce_max_epu64" + | "_mm512_reduce_min_epi64" + | "_mm512_mask_reduce_min_epi64" + | "_mm512_reduce_min_epu64" + | "_mm512_mask_reduce_min_epu64" + | "_mm512_reduce_and_epi64" + | "_mm512_mask_reduce_and_epi64" + | "_mm512_reduce_or_epi64" + | "_mm512_mask_reduce_or_epi64" => true, // These return a 64-bit argument but they're assembled from other // 32-bit registers, so these work on 32-bit just fine. 
See #308 for From 1f662e9c324a330b44b5af2080ef4037e890cc0b Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 17:41:25 +0000 Subject: [PATCH 05/25] fixupimm_round_ps --- crates/core_arch/avx512f.md | 2 +- crates/core_arch/src/x86/avx512f.rs | 176 ++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 4184997d23..742fd7bda1 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -162,7 +162,7 @@ * [ ] [`_mm512_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=5236) * [ ] [`_mm512_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=5236) * [ ] [`_mm512_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=5236) - * [ ] [`_mm512_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=5236) + * [x] [`_mm512_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=5236) * [x] [`_mm512_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pd&expand=5236) * [x] [`_mm512_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ps&expand=5236) * [x] [`_mm512_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 95c3794b09..38b3177e56 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -5500,6 +5500,95 @@ pub unsafe fn _mm512_maskz_scalef_round_pd( transmute(r) } +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_round_ps&expand=2505) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm512_fixupimm_round_ps( + a: __m512, + b: __m512, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vfixupimmps( + a.as_f32x16(), + b.as_f32x16(), + c.as_i32x16(), + $imm8, + 0b11111111_11111111, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
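// Illustration only, not one of the patch hunks: a call-site sketch that mirrors
// test_mm512_fixupimm_round_ps further down. With imm8 = 5 and every lane of c set to i32::MAX,
// the NaN lanes of a come back as +0.0; the precise meaning of imm8 and of the per-lane tokens in c
// follows Intel's VFIXUPIMM table, which this sketch does not try to reproduce.
// (hypothetical function name; assumes an AVX512F-capable CPU)
#[target_feature(enable = "avx512f")]
unsafe fn fixupimm_round_ps_sketch() -> __m512 {
    let a = _mm512_set1_ps(f32::NAN);
    let b = _mm512_set1_ps(f32::MAX);
    let c = _mm512_set1_epi32(i32::MAX);
    // the test added in this patch asserts that this returns _mm512_set1_ps(0.0)
    _mm512_fixupimm_round_ps(a, b, c, 5, _MM_FROUND_CUR_DIRECTION)
}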
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_round_ps&expand=2506) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm512_mask_fixupimm_round_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vfixupimmps(a.as_f32x16(), b.as_f32x16(), c.as_i32x16(), $imm8, k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_round_ps&expand=2507) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm512_maskz_fixupimm_round_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vfixupimmps( + a.as_f32x16(), + b.as_f32x16(), + c.as_i32x16(), + $imm8, + 0b11111111_11111111, + $imm4, + ) + }; + } + let r: f32x16 = constify_imm8_roundscale!(imm8, sae, call); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, r, zero)) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -18038,6 +18127,11 @@ extern "C" { #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"] fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.512"] + fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"] + fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"] @@ -21605,6 +21699,88 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_round_ps() { + let a = _mm512_set1_ps(f32::NAN); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_fixupimm_round_ps(a, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_ps(0.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_round_ps() { + let a = _mm512_set_ps( + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + 1., + 1., + 1., + 1., + 1., + 1., + 1., + 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_mask_fixupimm_round_ps( + a, + 0b11111111_00000000, + b, + c, + 5, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_round_ps() { + let a = _mm512_set_ps( + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + 1., + 1., + 1., + 1., + 1., + 1., + 1., + 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_maskz_fixupimm_round_ps( + 0b11111111_00000000, + a, + b, + c, + 5, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_ps() { let a = _mm512_set1_ps(10.); From 942481b419976f19dbb61ba9c86c2b2b742a1d88 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 20:19:48 +0000 Subject: [PATCH 06/25] fixupimm_round_pd --- crates/core_arch/avx512f.md | 2 +- crates/core_arch/src/x86/avx512f.rs | 93 +++++++++++++++++++++++--- crates/core_arch/src/x86_64/avx512f.rs | 20 ++++++ 3 files changed, 106 insertions(+), 9 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 742fd7bda1..ab7519d643 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -778,7 +778,7 @@ * [ ] [`_mm512_maskz_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=5236) * [ ] [`_mm512_maskz_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=5236) * [ ] [`_mm512_maskz_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=5236) - * [ ] 
[`_mm512_maskz_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=5236) + * [x] [`_mm512_maskz_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=5236) * [x] [`_mm512_maskz_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pd&expand=5236) * [x] [`_mm512_maskz_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ps&expand=5236) * [x] [`_mm512_maskz_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 38b3177e56..a08845476d 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -5574,19 +5574,92 @@ pub unsafe fn _mm512_maskz_fixupimm_round_ps( ) -> __m512 { macro_rules! call { ($imm8:expr, $imm4:expr) => { - vfixupimmps( - a.as_f32x16(), - b.as_f32x16(), - c.as_i32x16(), + vfixupimmpsz(a.as_f32x16(), b.as_f32x16(), c.as_i32x16(), $imm8, k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_round_pd&expand=2502) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm512_fixupimm_round_pd( + a: __m512d, + b: __m512d, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vfixupimmpd( + a.as_f64x8(), + b.as_f64x8(), + c.as_i64x8(), $imm8, - 0b11111111_11111111, + 0b11111111, $imm4, ) }; } - let r: f32x16 = constify_imm8_roundscale!(imm8, sae, call); - let zero = _mm512_setzero_ps().as_f32x16(); - transmute(simd_select_bitmask(k, r, zero)) + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_round_pd&expand=2503) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm512_mask_fixupimm_round_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512d { + macro_rules! 
call { + ($imm8:expr, $imm4:expr) => { + vfixupimmpd(a.as_f64x8(), b.as_f64x8(), c.as_i64x8(), $imm8, k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_round_pd&expand=2504) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm512_maskz_fixupimm_round_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512i, + imm8: i32, + sae: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vfixupimmpdz(a.as_f64x8(), b.as_f64x8(), c.as_i64x8(), $imm8, k, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) } /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. @@ -18131,6 +18204,10 @@ extern "C" { fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"] fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.512"] + fn vfixupimmpsz(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16; + #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"] + fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 92c300ca17..bb359f1e38 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -1055,6 +1055,26 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_round_pd() { + let a = _mm512_set1_pd(f64::NAN); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_fixupimm_round_pd(a, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(0.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_round_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_mask_fixupimm_round_pd(a, 0b11110000, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_pd() { let a = _mm512_set1_pd(10.); From 66e2d9b59d18fd6fe8cb533bb9fc603f2a92114c Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 22:35:17 +0000 Subject: [PATCH 07/25] fixupimm: ps,pd --- crates/core_arch/avx512f.md | 20 +-- 
crates/core_arch/src/x86/avx512f.rs | 236 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 48 ++++- 3 files changed, 290 insertions(+), 14 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index ab7519d643..eafcd9291b 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -159,9 +159,9 @@ * [x] [`_mm512_extractf64x4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extractf64x4_pd&expand=5236) * [x] [`_mm512_extracti32x4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti32x4_epi32&expand=5236) * [x] [`_mm512_extracti64x4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti64x4_epi64&expand=5236) - * [ ] [`_mm512_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=5236) - * [ ] [`_mm512_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=5236) - * [ ] [`_mm512_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=5236) + * [x] [`_mm512_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=5236) + * [x] [`_mm512_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=5236) + * [x] [`_mm512_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=5236) * [x] [`_mm512_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=5236) * [x] [`_mm512_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pd&expand=5236) * [x] [`_mm512_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ps&expand=5236) @@ -450,10 +450,10 @@ * [x] [`_mm512_mask_extractf64x4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extractf64x4_pd&expand=5236) * [x] [`_mm512_mask_extracti32x4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti32x4_epi32&expand=5236) * [x] [`_mm512_mask_extracti64x4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti64x4_epi64&expand=5236) - * [ ] [`_mm512_mask_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_pd&expand=5236) - * [ ] [`_mm512_mask_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_ps&expand=5236) - * [ ] [`_mm512_mask_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_pd&expand=5236) - * [ ] [`_mm512_mask_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_ps&expand=5236) + * [x] [`_mm512_mask_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_pd&expand=5236) + * [x] [`_mm512_mask_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_ps&expand=5236) + * [x] [`_mm512_mask_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_pd&expand=5236) + * [x] 
[`_mm512_mask_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_ps&expand=5236) * [x] [`_mm512_mask_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pd&expand=5236) * [x] [`_mm512_mask_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ps&expand=5236) * [x] [`_mm512_mask_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pd&expand=5236) @@ -775,9 +775,9 @@ * [x] [`_mm512_maskz_extractf64x4_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extractf64x4_pd&expand=5236) * [x] [`_mm512_maskz_extracti32x4_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti32x4_epi32&expand=5236) * [x] [`_mm512_maskz_extracti64x4_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti64x4_epi64&expand=5236) - * [ ] [`_mm512_maskz_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=5236) - * [ ] [`_mm512_maskz_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=5236) - * [ ] [`_mm512_maskz_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=5236) + * [x] [`_mm512_maskz_fixupimm_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=5236) + * [x] [`_mm512_maskz_fixupimm_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=5236) + * [x] [`_mm512_maskz_fixupimm_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=5236) * [x] [`_mm512_maskz_fixupimm_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=5236) * [x] [`_mm512_maskz_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pd&expand=5236) * [x] [`_mm512_maskz_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ps&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a08845476d..164630ff51 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2244,6 +2244,174 @@ pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m )) } +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_ps&expand=2499) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_fixupimm_ps(a: __m512, b: __m512, c: __m512i, imm8: i32) -> __m512 { + macro_rules! 
call { + ($imm8:expr) => { + vfixupimmps( + a.as_f32x16(), + b.as_f32x16(), + c.as_i32x16(), + $imm8, + 0b11111111_11111111, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_ps&expand=2500) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_fixupimm_ps( + a: __m512, + k: __mmask16, + b: __m512, + c: __m512i, + imm8: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vfixupimmps( + a.as_f32x16(), + b.as_f32x16(), + c.as_i32x16(), + $imm8, + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_ps&expand=2501) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_maskz_fixupimm_ps( + k: __mmask16, + a: __m512, + b: __m512, + c: __m512i, + imm8: i32, +) -> __m512 { + macro_rules! call { + ($imm8:expr) => { + vfixupimmpsz( + a.as_f32x16(), + b.as_f32x16(), + c.as_i32x16(), + $imm8, + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_pd&expand=2490) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_fixupimm_pd(a: __m512d, b: __m512d, c: __m512i, imm8: i32) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vfixupimmpd( + a.as_f64x8(), + b.as_f64x8(), + c.as_i64x8(), + $imm8, + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_pd&expand=2491) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_fixupimm_pd( + a: __m512d, + k: __mmask8, + b: __m512d, + c: __m512i, + imm8: i32, +) -> __m512d { + macro_rules! 
call { + ($imm8:expr) => { + vfixupimmpd( + a.as_f64x8(), + b.as_f64x8(), + c.as_i64x8(), + $imm8, + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_pd&expand=2492) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_maskz_fixupimm_pd( + k: __mmask8, + a: __m512d, + b: __m512d, + c: __m512i, + imm8: i32, +) -> __m512d { + macro_rules! call { + ($imm8:expr) => { + vfixupimmpdz( + a.as_f64x8(), + b.as_f64x8(), + c.as_i64x8(), + $imm8, + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. /// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -20342,6 +20510,74 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_ps() { + let a = _mm512_set1_ps(f32::NAN); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_fixupimm_ps(a, b, c, 5); + let e = _mm512_set1_ps(0.0); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_ps() { + let a = _mm512_set_ps( + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + 1., + 1., + 1., + 1., + 1., + 1., + 1., + 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_mask_fixupimm_ps(a, 0b11111111_00000000, b, c, 5); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., + ); + assert_eq_m512(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_ps() { + let a = _mm512_set_ps( + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + f32::NAN, + 1., + 1., + 1., + 1., + 1., + 1., + 1., + 1., + ); + let b = _mm512_set1_ps(f32::MAX); + let c = _mm512_set1_epi32(i32::MAX); + let r = _mm512_maskz_fixupimm_ps(0b11111111_00000000, a, b, c, 5); + let e = _mm512_set_ps( + 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., + ); + assert_eq_m512(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_ps() { let a = _mm512_set1_ps(10.); diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index bb359f1e38..734470a9bb 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -1056,25 +1056,35 @@ mod tests { } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_fixupimm_round_pd() { + unsafe fn test_mm512_fixupimm_pd() { let a = _mm512_set1_pd(f64::NAN); let b = _mm512_set1_pd(f64::MAX); let c = _mm512_set1_epi64(i32::MAX as i64); - let r = 
_mm512_fixupimm_round_pd(a, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_fixupimm_pd(a, b, c, 5); let e = _mm512_set1_pd(0.0); assert_eq_m512d(r, e); } #[simd_test(enable = "avx512f")] - unsafe fn test_mm512_mask_fixupimm_round_pd() { + unsafe fn test_mm512_mask_fixupimm_pd() { let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); let b = _mm512_set1_pd(f64::MAX); let c = _mm512_set1_epi64(i32::MAX as i64); - let r = _mm512_mask_fixupimm_round_pd(a, 0b11110000, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let r = _mm512_mask_fixupimm_pd(a, 0b11110000, b, c, 5); let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.); assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_maskz_fixupimm_pd(0b11110000, a, b, c, 5); + let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_pd() { let a = _mm512_set1_pd(10.); @@ -2839,6 +2849,36 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_fixupimm_round_pd() { + let a = _mm512_set1_pd(f64::NAN); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_fixupimm_round_pd(a, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set1_pd(0.0); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_fixupimm_round_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_mask_fixupimm_round_pd(a, 0b11110000, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set_pd(0., 0., 0., 0., 1., 1., 1., 1.); + assert_eq_m512d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_fixupimm_round_pd() { + let a = _mm512_set_pd(f64::NAN, f64::NAN, f64::NAN, f64::NAN, 1., 1., 1., 1.); + let b = _mm512_set1_pd(f64::MAX); + let c = _mm512_set1_epi64(i32::MAX as i64); + let r = _mm512_maskz_fixupimm_round_pd(0b11110000, a, b, c, 5, _MM_FROUND_CUR_DIRECTION); + let e = _mm512_set_pd(0., 0., 0., 0., 0., 0., 0., 0.); + assert_eq_m512d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_round_pd() { let a = _mm512_set1_pd(10.); From aa32a1f233dbba9b789388ce1f1cfa2ae602a427 Mon Sep 17 00:00:00 2001 From: jironglin Date: Sun, 11 Oct 2020 23:51:44 +0000 Subject: [PATCH 08/25] ternarylogic_epi32 --- crates/core_arch/avx512f.md | 6 +- crates/core_arch/src/x86/avx512f.rs | 103 ++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index eafcd9291b..055e3df356 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -662,7 +662,7 @@ * [x] [`_mm512_mask_sub_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ps&expand=5236) * [x] [`_mm512_mask_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_pd&expand=5236) * [x] [`_mm512_mask_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5236) - * [ ] 
[`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236) + * [x] [`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236) * [ ] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236) * [ ] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236) * [ ] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236) @@ -926,7 +926,7 @@ * [x] [`_mm512_maskz_sub_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ps&expand=5236) * [x] [`_mm512_maskz_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_pd&expand=5236) * [x] [`_mm512_maskz_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ps&expand=5236) - * [ ] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236) + * [x] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236) * [ ] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236) * [x] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236) * [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236) @@ -1106,7 +1106,7 @@ * [x] [`_mm512_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_pd&expand=5236) * [x] [`_mm512_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ps&expand=5236) * [ ] [`_mm512_svml_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_svml_round_pd&expand=5236) - * [ ] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236) + * [x] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236) * [ ] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236) * [ ] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236) * [ ] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 164630ff51..080de6a1cc 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2412,6 +2412,70 @@ pub unsafe fn _mm512_maskz_fixupimm_pd( transmute(r) } +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. 
For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_ternarylogic_epi32&expand=5867) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_ternarylogic_epi32(a: __m512i, b: __m512i, c: __m512i, imm8: i32) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16(), $imm8) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_ternarylogic_epi32&expand=5865) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_ternarylogic_epi32( + src: __m512i, + k: __mmask16, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogd(src.as_i32x16(), a.as_i32x16(), b.as_i32x16(), $imm8) + }; + } + let ternarylogic = constify_imm8_sae!(imm8, call); + transmute(simd_select_bitmask(k, ternarylogic, src.as_i32x16())) +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_ternarylogic_epi32&expand=5866) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_maskz_ternarylogic_epi32( + k: __mmask16, + a: __m512i, + b: __m512i, + c: __m512i, + imm8: i32, +) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16(), $imm8) + }; + } + let ternarylogic = constify_imm8_sae!(imm8, call); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, ternarylogic, zero)) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. 
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -18377,6 +18441,11 @@ extern "C" { #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"] fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8; + #[link_name = "llvm.x86.avx512.pternlog.d.512"] + fn vpternlogd(a: i32x16, b: i32x16, c: i32x16, sae: i32) -> i32x16; + #[link_name = "llvm.x86.avx512.pternlog.q.512"] + fn vpternlogq(a: i64x8, b: i64x8, c: i64x8, sae: i32) -> i64x8; + #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"] fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16; #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"] @@ -20578,6 +20647,40 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_ternarylogic_epi32() { + let a = _mm512_set1_epi32(1 << 2); + let b = _mm512_set1_epi32(1 << 1); + let c = _mm512_set1_epi32(1 << 0); + let r = _mm512_ternarylogic_epi32(a, b, c, 8); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_ternarylogic_epi32() { + let src = _mm512_set1_epi32(1 << 2); + let a = _mm512_set1_epi32(1 << 1); + let b = _mm512_set1_epi32(1 << 0); + let r = _mm512_mask_ternarylogic_epi32(src, 0, a, b, 8); + assert_eq_m512i(r, src); + let r = _mm512_mask_ternarylogic_epi32(src, 0b11111111_11111111, a, b, 8); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_ternarylogic_epi32() { + let a = _mm512_set1_epi32(1 << 2); + let b = _mm512_set1_epi32(1 << 1); + let c = _mm512_set1_epi32(1 << 0); + let r = _mm512_maskz_ternarylogic_epi32(0, a, b, c, 9); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_ternarylogic_epi32(0b11111111_11111111, a, b, c, 8); + let e = _mm512_set1_epi32(0); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_ps() { let a = _mm512_set1_ps(10.); From a25eaf3022bf1f9283a44a7a2c5b5caaf2c31680 Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 12 Oct 2020 00:03:59 +0000 Subject: [PATCH 09/25] ternarylogic_epi64 --- crates/core_arch/avx512f.md | 6 +-- crates/core_arch/src/x86/avx512f.rs | 66 ++++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 34 +++++++++++++ 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 055e3df356..443adee4b9 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -663,7 +663,7 @@ * [x] [`_mm512_mask_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_pd&expand=5236) * [x] [`_mm512_mask_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5236) * [x] [`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236) - * [ ] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236) + * [x] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236) * [ ] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236) * [ ] 
[`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236) * [ ] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236) @@ -927,7 +927,7 @@ * [x] [`_mm512_maskz_sub_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_pd&expand=5236) * [x] [`_mm512_maskz_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ps&expand=5236) * [x] [`_mm512_maskz_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5236) - * [ ] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236) + * [x] [`_mm512_maskz_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5236) * [x] [`_mm512_maskz_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=5236) * [x] [`_mm512_maskz_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=5236) * [x] [`_mm512_maskz_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=5236) @@ -1107,7 +1107,7 @@ * [x] [`_mm512_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ps&expand=5236) * [ ] [`_mm512_svml_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_svml_round_pd&expand=5236) * [x] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236) - * [ ] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236) + * [x] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236) * [ ] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236) * [ ] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236) * [ ] [`_mm512_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 080de6a1cc..c7156b6d4c 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2476,6 +2476,72 @@ pub unsafe fn _mm512_maskz_ternarylogic_epi32( transmute(simd_select_bitmask(k, ternarylogic, zero)) } +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst. 
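// Illustrative sketch (not part of the patch): a scalar model of the per-bit rule described above
// for vpternlogd/vpternlogq. The bits of a, b and c at each position form a 3-bit index into imm8,
// and that bit of imm8 becomes the result bit; imm8 = 0xE8, for example, computes the bitwise
// majority of the three inputs.
fn ternarylogic_scalar(a: u64, b: u64, c: u64, imm8: u8) -> u64 {
    let mut dst = 0u64;
    for bit in 0..64 {
        let idx = (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
        dst |= (((imm8 as u64) >> idx) & 1) << bit;
    }
    dst
}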
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_ternarylogic_epi64&expand=5876) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm512_ternarylogic_epi64(a: __m512i, b: __m512i, c: __m512i, imm8: i32) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8(), $imm8) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_ternarylogic_epi64&expand=5874) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_mask_ternarylogic_epi64( + src: __m512i, + k: __mmask8, + a: __m512i, + b: __m512i, + imm8: i32, +) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogq(src.as_i64x8(), a.as_i64x8(), b.as_i64x8(), $imm8) + }; + } + let ternarylogic = constify_imm8_sae!(imm8, call); + transmute(simd_select_bitmask(k, ternarylogic, src.as_i64x8())) +} + +/// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_ternarylogic_epi64&expand=5875) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm512_maskz_ternarylogic_epi64( + k: __mmask8, + a: __m512i, + b: __m512i, + c: __m512i, + imm8: i32, +) -> __m512i { + macro_rules! call { + ($imm8:expr) => { + vpternlogq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8(), $imm8) + }; + } + let ternarylogic = constify_imm8_sae!(imm8, call); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, ternarylogic, zero)) +} + /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.
/// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 734470a9bb..3f7d67dc04 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -1085,6 +1085,40 @@ mod tests { assert_eq_m512d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_ternarylogic_epi64() { + let a = _mm512_set1_epi64(1 << 2); + let b = _mm512_set1_epi64(1 << 1); + let c = _mm512_set1_epi64(1 << 0); + let r = _mm512_ternarylogic_epi64(a, b, c, 8); + let e = _mm512_set1_epi64(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_ternarylogic_epi64() { + let src = _mm512_set1_epi64(1 << 2); + let a = _mm512_set1_epi64(1 << 1); + let b = _mm512_set1_epi64(1 << 0); + let r = _mm512_mask_ternarylogic_epi64(src, 0, a, b, 8); + assert_eq_m512i(r, src); + let r = _mm512_mask_ternarylogic_epi64(src, 0b11111111, a, b, 8); + let e = _mm512_set1_epi64(0); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_ternarylogic_epi64() { + let a = _mm512_set1_epi64(1 << 2); + let b = _mm512_set1_epi64(1 << 1); + let c = _mm512_set1_epi64(1 << 0); + let r = _mm512_maskz_ternarylogic_epi64(0, a, b, c, 9); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_ternarylogic_epi64(0b11111111, a, b, c, 8); + let e = _mm512_set1_epi64(0); + assert_eq_m512i(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_getmant_pd() { let a = _mm512_set1_pd(10.); From 7624f5bfa772518bcf4ab8edd2e0fe236a4d4765 Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 12 Oct 2020 21:16:37 +0000 Subject: [PATCH 10/25] int2mask, mask2int, stream: ps,pd,si512 --- crates/core_arch/avx512f.md | 10 +-- crates/core_arch/src/x86/avx512f.rs | 86 ++++++++++++++++++++++++++ crates/core_arch/src/x86/test.rs | 15 +++++ crates/core_arch/src/x86_64/avx512f.rs | 30 +++++++++ 4 files changed, 136 insertions(+), 5 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 443adee4b9..184fb92aa8 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -226,7 +226,7 @@ * [x] [`_mm512_insertf64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=5236) * [x] [`_mm512_inserti32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=5236) * [x] [`_mm512_inserti64x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=5236) - * [ ] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236) + * [x] [`_mm512_int2mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_int2mask&expand=5236) * [x] [`_mm512_kand`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kand&expand=5236) * [x] [`_mm512_kandn`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kandn&expand=5236) * [x] [`_mm512_kmov`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_kmov&expand=5236) @@ -251,7 +251,7 @@ * [x] [`_mm512_mask2_permutex2var_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi64&expand=5236) * [x] 
[`_mm512_mask2_permutex2var_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_pd&expand=5236) * [x] [`_mm512_mask2_permutex2var_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_ps&expand=5236) - * [ ] [`_mm512_mask2int`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2int&expand=5236) + * [x] [`_mm512_mask2int`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2int&expand=5236) * [x] [`_mm512_mask3_fmadd_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pd&expand=5236) * [x] [`_mm512_mask3_fmadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ps&expand=5236) * [x] [`_mm512_mask3_fmadd_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pd&expand=5236) @@ -1096,9 +1096,9 @@ * [x] [`_mm512_storeu_epi64`] * [x] [`_mm512_storeu_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_si512&expand=5236) * [ ] [`_mm512_stream_load_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_load_si512&expand=5236) - * [ ] [`_mm512_stream_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_pd&expand=5236) - * [ ] [`_mm512_stream_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_ps&expand=5236) - * [ ] [`_mm512_stream_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_si512&expand=5236) + * [x] [`_mm512_stream_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_pd&expand=5236) + * [x] [`_mm512_stream_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_ps&expand=5236) + * [x] [`_mm512_stream_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_si512&expand=5236) * [x] [`_mm512_sub_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi32&expand=5236) * [x] [`_mm512_sub_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi64&expand=5236) * [x] [`_mm512_sub_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_pd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index c7156b6d4c..c5bfc267db 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -15812,6 +15812,61 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 { transmute(r) } +/// Converts integer mask into bitmask, storing the result in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_int2mask&expand=3189) +#[inline] +#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw +pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { + assert!(mask >= 0); + let r: u16 = mask as u16; + transmute(r) +} + +/// Converts bit mask k1 into an integer value, storing the results in dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2int&expand=3544) +#[inline] +#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw +#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw +pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 { + let r: i32 = k1 as i32; + transmute(r) +} + +/// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_ps&expand=5671) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovntps))] +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) { + intrinsics::nontemporal_store(mem_addr as *mut __m512, a); +} + +/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_pd&expand=5667) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) { + intrinsics::nontemporal_store(mem_addr as *mut __m512d, a); +} + +/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_si512&expand=5675) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq +#[allow(clippy::cast_ptr_alignment)] +pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) { + intrinsics::nontemporal_store(mem_addr as *mut __m512i, a); +} + /// Sets packed 32-bit integers in `dst` with the supplied values. 
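// Illustrative sketch (not part of the patch): callers of the stream stores above must themselves
// guarantee the documented 64-byte alignment, e.g. with a #[repr(align(64))] wrapper, and would
// typically issue a store fence before the data is read elsewhere. `stream_store_example` is a
// hypothetical helper, not an API added by this change.
#[repr(align(64))]
struct Aligned64([f32; 16]);

#[target_feature(enable = "avx512f")]
unsafe fn stream_store_example(a: __m512) {
    let mut buf = Aligned64([0.0; 16]);
    _mm512_stream_ps(buf.0.as_mut_ptr(), a);
    _mm_sfence(); // make the non-temporal store globally visible
}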
/// /// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps) @@ -27213,6 +27268,37 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_int2mask() { + let a: i32 = 0b11001100_00110011; + let r = _mm512_int2mask(a); + let e: u16 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask2int() { + let k1: __mmask16 = 0b11001100_00110011; + let r = _mm512_mask2int(k1); + let e: i32 = 0b11001100_00110011; + assert_eq!(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_stream_ps() { + #[repr(align(64))] + struct Memory { + pub data: [f32; 16], + } + let a = _mm512_set1_ps(7.0); + let mut mem = Memory { data: [-1.0; 16] }; + + _mm512_stream_ps(&mut mem.data[0] as *mut f32, a); + for i in 0..16 { + assert_eq!(mem.data[i], get_m512(a, i)); + } + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_reduce_add_epi32() { let a = _mm512_set1_epi32(1); diff --git a/crates/core_arch/src/x86/test.rs b/crates/core_arch/src/x86/test.rs index 594f680b73..9014d66fd0 100644 --- a/crates/core_arch/src/x86/test.rs +++ b/crates/core_arch/src/x86/test.rs @@ -71,6 +71,21 @@ pub unsafe fn get_m256(a: __m256, idx: usize) -> f32 { transmute::<_, [f32; 8]>(a)[idx] } +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512(a: __m512, idx: usize) -> f32 { + transmute::<_, [f32; 16]>(a)[idx] +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512d(a: __m512d, idx: usize) -> f64 { + transmute::<_, [f64; 8]>(a)[idx] +} + +#[target_feature(enable = "avx512f")] +pub unsafe fn get_m512i(a: __m512i, idx: usize) -> i64 { + transmute::<_, [i64; 8]>(a)[idx] +} + // These intrinsics doesn't exist on x86 b/c it requires a 64-bit register, // which doesn't exist on x86!
#[cfg(target_arch = "x86")] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 3f7d67dc04..3d5b22c64b 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -6257,4 +6257,34 @@ mod tests { _mm512_store_pd(&mut r as *mut _ as *mut f64, a); assert_eq_m512d(r, a); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_stream_pd() { + #[repr(align(64))] + struct Memory { + pub data: [f64; 8], + } + let a = _mm512_set1_pd(7.0); + let mut mem = Memory { data: [-1.0; 8] }; + + _mm512_stream_pd(&mut mem.data[0] as *mut f64, a); + for i in 0..8 { + assert_eq!(mem.data[i], get_m512d(a, i)); + } + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_stream_si512() { + #[repr(align(64))] + struct Memory { + pub data: [i64; 8], + } + let a = _mm512_set1_epi64(7); + let mut mem = Memory { data: [-1; 8] }; + + _mm512_stream_si512(&mut mem.data[0] as *mut i64, a); + for i in 0..8 { + assert_eq!(mem.data[i], get_m512i(a, i)); + } + } } From c0051783d8bb30d5f301d1d6fc2f694a91ff113f Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 12 Oct 2020 22:03:55 +0000 Subject: [PATCH 11/25] mask_set1: epi32,epi64, maskz_set1: epi32,epi64 --- crates/core_arch/avx512f.md | 6 +-- crates/core_arch/src/x86/avx512f.rs | 67 ++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 21 ++++++++ crates/stdarch-verify/tests/x86-intel.rs | 4 +- 4 files changed, 94 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 184fb92aa8..2534fb7a5f 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -617,7 +617,7 @@ * [x] [`_mm512_mask_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=5236) * [x] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236) * [x] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236) - * [ ] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) + * [x] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) * [ ] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236) * [x] [`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236) * [x] [`_mm512_mask_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5236) @@ -889,8 +889,8 @@ * [x] [`_mm512_maskz_scalef_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=5236) * [x] [`_mm512_maskz_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=5236) * [x] [`_mm512_maskz_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=5236) - * [ ] [`_mm512_maskz_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=5236) - * [ ] [`_mm512_maskz_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=5236) + * [x] 
[`_mm512_maskz_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=5236) + * [x] [`_mm512_maskz_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=5236) * [x] [`_mm512_maskz_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5236) * [x] [`_mm512_maskz_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32x4&expand=5236) * [x] [`_mm512_maskz_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index c5bfc267db..18b0568f08 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -15990,6 +15990,29 @@ pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i { transmute(i32x16::splat(a)) } +/// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_set1_epi32&expand=4951) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub unsafe fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i { + let r = _mm512_set1_epi32(a).as_i32x16(); + transmute(simd_select_bitmask(k, r, src.as_i32x16())) +} + +/// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_set1_epi32&expand=4952) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpbroadcastd))] +pub unsafe fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i { + let r = _mm512_set1_epi32(a).as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, r, zero)) +} + /// Broadcast 64-bit integer `a` to all elements of `dst`. #[inline] #[target_feature(enable = "avx512f")] @@ -15997,6 +16020,29 @@ pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i { transmute(i64x8::splat(a)) } +/// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_set1_epi64&expand=4959) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub unsafe fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i { + let r = _mm512_set1_epi64(a).as_i64x8(); + transmute(simd_select_bitmask(k, r, src.as_i64x8())) +} + +/// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
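// Illustrative sketch (not part of the patch): how the writemask and zeromask broadcasts above
// differ. Mask bit i selects lane i; unselected lanes keep `src` in the mask variant and become
// zero in the maskz variant.
#[target_feature(enable = "avx512f")]
unsafe fn masked_broadcast_example() {
    let src = _mm512_set1_epi32(-1);
    let r = _mm512_mask_set1_epi32(src, 0b00000000_11111111, 7); // lanes 0..=7 are 7, the rest stay -1
    let z = _mm512_maskz_set1_epi32(0b00000000_11111111, 7); // lanes 0..=7 are 7, the rest are 0
    let _ = (r, z);
}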
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_set1_epi64&expand=4960) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vpbroadcastq))] +pub unsafe fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i { + let r = _mm512_set1_epi64(a).as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, r, zero)) +} + /// Set packed 64-bit integers in dst with the repeated 4 element sequence. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set4_epi64&expand=4983) @@ -27669,4 +27715,25 @@ mod tests { _mm512_store_ps(&mut r as *mut _ as *mut f32, a); assert_eq_m512(r, a); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_set1_epi32() { + let src = _mm512_set1_epi32(2); + let a: i32 = 11; + let r = _mm512_mask_set1_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_set1_epi32(src, 0b11111111_11111111, a); + let e = _mm512_set1_epi32(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_set1_epi32() { + let a: i32 = 11; + let r = _mm512_maskz_set1_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi32(0b11111111_11111111, a); + let e = _mm512_set1_epi32(11); + assert_eq_m512i(r, e); + } } diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 3d5b22c64b..3dbbc77a89 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -6287,4 +6287,25 @@ mod tests { assert_eq!(mem.data[i], get_m512i(a, i)); } } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_set1_epi64() { + let src = _mm512_set1_epi64(2); + let a: i64 = 11; + let r = _mm512_mask_set1_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_set1_epi64(src, 0b11111111, a); + let e = _mm512_set1_epi64(11); + assert_eq_m512i(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_set1_epi64() { + let a: i64 = 11; + let r = _mm512_maskz_set1_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_set1_epi64(0b11111111, a); + let e = _mm512_set1_epi64(11); + assert_eq_m512i(r, e); + } } diff --git a/crates/stdarch-verify/tests/x86-intel.rs b/crates/stdarch-verify/tests/x86-intel.rs index d5096a6903..0e44f73a7f 100644 --- a/crates/stdarch-verify/tests/x86-intel.rs +++ b/crates/stdarch-verify/tests/x86-intel.rs @@ -592,7 +592,9 @@ fn matches(rust: &Function, intel: &Intrinsic) -> Result<(), String> { | "_mm512_reduce_and_epi64" | "_mm512_mask_reduce_and_epi64" | "_mm512_reduce_or_epi64" - | "_mm512_mask_reduce_or_epi64" => true, + | "_mm512_mask_reduce_or_epi64" + | "_mm512_mask_set1_epi64" + | "_mm512_maskz_set1_epi64" => true, // These return a 64-bit argument but they're assembled from other // 32-bit registers, so these work on 32-bit just fine. 
See #308 for From 4091be50ab71f34ae4f44813f40ecddf9e9315a2 Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 12 Oct 2020 23:03:04 +0000 Subject: [PATCH 12/25] test_epi32_mask, test_epi64_mask, testn_epi32_mask, testn_epi64_mask --- crates/core_arch/avx512f.md | 16 +-- crates/core_arch/src/x86/avx512f.rs | 138 ++++++++++++++++++++++++- crates/core_arch/src/x86_64/avx512f.rs | 40 +++++++ 3 files changed, 185 insertions(+), 9 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 2534fb7a5f..43b2597510 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -664,10 +664,10 @@ * [x] [`_mm512_mask_sub_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5236) * [x] [`_mm512_mask_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5236) * [x] [`_mm512_mask_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5236) - * [ ] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236) - * [ ] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236) - * [ ] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236) - * [ ] [`_mm512_mask_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5236) + * [x] [`_mm512_mask_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5236) + * [x] [`_mm512_mask_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5236) + * [x] [`_mm512_mask_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5236) + * [x] [`_mm512_mask_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5236) * [x] [`_mm512_mask_unpackhi_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=5236) * [x] [`_mm512_mask_unpackhi_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=5236) * [x] [`_mm512_mask_unpackhi_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=5236) @@ -1108,10 +1108,10 @@ * [ ] [`_mm512_svml_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_svml_round_pd&expand=5236) * [x] [`_mm512_ternarylogic_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5236) * [x] [`_mm512_ternarylogic_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5236) - * [ ] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236) - * [ ] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236) - * [ ] 
[`_mm512_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5236) - * [ ] [`_mm512_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi64_mask&expand=5236) + * [x] [`_mm512_test_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5236) + * [x] [`_mm512_test_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5236) + * [x] [`_mm512_testn_epi32_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5236) + * [x] [`_mm512_testn_epi64_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi64_mask&expand=5236) * [x] [`_mm512_undefined_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_epi32&expand=5236) * [x] [`_mm512_undefined_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd&expand=5236) * [x] [`_mm512_undefined_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ps&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 18b0568f08..1ef726fba5 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -15827,13 +15827,109 @@ pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2int&expand=3544) #[inline] -#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw +#[target_feature(enable = "avx512f")] #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 { let r: i32 = k1 as i32; transmute(r) } +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_test_epi32_mask&expand=5890) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub unsafe fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi32_mask(and, zero) +} + +/// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_test_epi32_mask&expand=5889) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestmd))] +pub unsafe fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero. 
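+///
+/// A minimal usage sketch, mirroring the unit test added later in this patch:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(1 << 0);
+/// let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+/// // a & b is non-zero in every lane, so every bit of the result mask is set
+/// assert_eq!(_mm512_test_epi64_mask(a, b), 0b11111111);
+/// ```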
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_test_epi64_mask&expand=5896) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub unsafe fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpneq_epi64_mask(and, zero) +} + +/// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_test_epi64_mask&expand=5895) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestmq))] +pub unsafe fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpneq_epi64_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_testn_epi32_mask&expand=5921) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub unsafe fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi32_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_testn_epi32_mask&expand=5920) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestnmd))] +pub unsafe fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 { + let and = _mm512_and_epi32(a, b); + let zero = _mm512_setzero_si512(); + _mm512_mask_cmpeq_epi32_mask(k, and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_testn_epi64_mask&expand=5927) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vptestnmq))] +pub unsafe fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 { + let and = _mm512_and_epi64(a, b); + let zero = _mm512_setzero_si512(); + _mm512_cmpeq_epi64_mask(and, zero) +} + +/// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero. 
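+///
+/// A minimal usage sketch, mirroring the unit test added later in this patch:
+///
+/// ```ignore
+/// let a = _mm512_set1_epi64(1 << 0);
+/// let b = _mm512_set1_epi64(1 << 1);
+/// // a & b is zero in every lane, but only mask bits allowed by the writemask k survive
+/// assert_eq!(_mm512_mask_testn_epi64_mask(0, a, b), 0);
+/// assert_eq!(_mm512_mask_testn_epi64_mask(0b11111111, a, b), 0b11111111);
+/// ```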
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_testn_epi64_mask&expand=5926)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vptestnmq))]
+pub unsafe fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
+    let and = _mm512_and_epi64(a, b);
+    let zero = _mm512_setzero_si512();
+    _mm512_mask_cmpeq_epi64_mask(k, and, zero)
+}
+
 /// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
 ///
 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_ps&expand=5671)
@@ -27330,6 +27426,46 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_test_epi32_mask() {
+        let a = _mm512_set1_epi32(1 << 0);
+        let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+        let r = _mm512_test_epi32_mask(a, b);
+        let e: __mmask16 = 0b11111111_11111111;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_test_epi32_mask() {
+        let a = _mm512_set1_epi32(1 << 0);
+        let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+        let r = _mm512_mask_test_epi32_mask(0, a, b);
+        assert_eq!(r, 0);
+        let r = _mm512_mask_test_epi32_mask(0b11111111_11111111, a, b);
+        let e: __mmask16 = 0b11111111_11111111;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_testn_epi32_mask() {
+        let a = _mm512_set1_epi32(1 << 0);
+        let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
+        let r = _mm512_testn_epi32_mask(a, b);
+        let e: __mmask16 = 0b00000000_00000000;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_testn_epi32_mask() {
+        let a = _mm512_set1_epi32(1 << 0);
+        let b = _mm512_set1_epi32(1 << 1);
+        let r = _mm512_mask_testn_epi32_mask(0, a, b);
+        assert_eq!(r, 0);
+        let r = _mm512_mask_testn_epi32_mask(0b11111111_11111111, a, b);
+        let e: __mmask16 = 0b11111111_11111111;
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "avx512f")]
     unsafe fn test_mm512_stream_ps() {
         #[repr(align(32))]
diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs
index 3dbbc77a89..758a2b563c 100644
--- a/crates/core_arch/src/x86_64/avx512f.rs
+++ b/crates/core_arch/src/x86_64/avx512f.rs
@@ -6258,6 +6258,46 @@ mod tests {
         assert_eq_m512d(r, a);
     }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_test_epi64_mask() {
+        let a = _mm512_set1_epi64(1 << 0);
+        let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+        let r = _mm512_test_epi64_mask(a, b);
+        let e: __mmask8 = 0b11111111;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_test_epi64_mask() {
+        let a = _mm512_set1_epi64(1 << 0);
+        let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+        let r = _mm512_mask_test_epi64_mask(0, a, b);
+        assert_eq!(r, 0);
+        let r = _mm512_mask_test_epi64_mask(0b11111111, a, b);
+        let e: __mmask8 = 0b11111111;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_testn_epi64_mask() {
+        let a = _mm512_set1_epi64(1 << 0);
+        let b = _mm512_set1_epi64(1 << 0 | 1 << 1);
+        let r = _mm512_testn_epi64_mask(a, b);
+        let e: __mmask8 = 0b00000000;
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "avx512f")]
+    unsafe fn test_mm512_mask_testn_epi64_mask() {
+        let a = _mm512_set1_epi64(1 << 0);
+        let b = _mm512_set1_epi64(1 << 1);
+        let r =
_mm512_mask_testn_epi64_mask(0, a, b); + assert_eq!(r, 0); + let r = _mm512_mask_testn_epi64_mask(0b11111111, a, b); + let e: __mmask8 = 0b11111111; + assert_eq!(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_stream_pd() { #[repr(align(64))] From 71764a1817bee74ca790cf5257d431f3dcee52e4 Mon Sep 17 00:00:00 2001 From: jironglin Date: Mon, 12 Oct 2020 23:37:07 +0000 Subject: [PATCH 13/25] mask_mov: epi32,epi64,ps,pd; maskz_mov: epi32,epi64,ps,pd --- crates/core_arch/avx512f.md | 18 ++-- crates/core_arch/src/x86/avx512f.rs | 130 +++++++++++++++++++++++++ crates/core_arch/src/x86_64/avx512f.rs | 38 ++++++++ 3 files changed, 177 insertions(+), 9 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 43b2597510..57b3266a0d 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -541,10 +541,10 @@ * [x] [`_mm512_mask_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ps&expand=5236) * [x] [`_mm512_mask_min_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_pd&expand=5236) * [x] [`_mm512_mask_min_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ps&expand=5236) - * [ ] [`_mm512_mask_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi32&expand=5236) - * [ ] [`_mm512_mask_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi64&expand=5236) - * [ ] [`_mm512_mask_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_pd&expand=5236) - * [ ] [`_mm512_mask_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_ps&expand=5236) + * [x] [`_mm512_mask_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi32&expand=5236) + * [x] [`_mm512_mask_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi64&expand=5236) + * [x] [`_mm512_mask_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_pd&expand=5236) + * [x] [`_mm512_mask_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_ps&expand=5236) * [x] [`_mm512_mask_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movedup_pd&expand=5236) * [x] [`_mm512_mask_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movehdup_ps&expand=5236) * [x] [`_mm512_mask_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_moveldup_ps&expand=5236) @@ -618,7 +618,7 @@ * [x] [`_mm512_mask_scalef_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=5236) * [x] [`_mm512_mask_scalef_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=5236) * [x] [`_mm512_mask_set1_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=5236) - * [ ] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236) + * [x] [`_mm512_mask_set1_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=5236) * [x] 
[`_mm512_mask_shuffle_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5236) * [x] [`_mm512_mask_shuffle_f32x4`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5236) * [x] [`_mm512_mask_shuffle_f64x2`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5236) @@ -839,10 +839,10 @@ * [x] [`_mm512_maskz_min_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ps&expand=5236) * [x] [`_mm512_maskz_min_round_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_pd&expand=5236) * [x] [`_mm512_maskz_min_round_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ps&expand=5236) - * [ ] [`_mm512_maskz_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi32&expand=5236) - * [ ] [`_mm512_maskz_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi64&expand=5236) - * [ ] [`_mm512_maskz_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_pd&expand=5236) - * [ ] [`_mm512_maskz_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_ps&expand=5236) + * [x] [`_mm512_maskz_mov_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi32&expand=5236) + * [x] [`_mm512_maskz_mov_epi64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi64&expand=5236) + * [x] [`_mm512_maskz_mov_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_pd&expand=5236) + * [x] [`_mm512_maskz_mov_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_ps&expand=5236) * [x] [`_mm512_maskz_movedup_pd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movedup_pd&expand=5236) * [x] [`_mm512_maskz_movehdup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movehdup_ps&expand=5236) * [x] [`_mm512_maskz_moveldup_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_moveldup_ps&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 1ef726fba5..2ebfcca5b9 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -135,6 +135,98 @@ pub unsafe fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m5 transmute(simd_select_bitmask(k, abs, src.as_f64x8())) } +/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_epi32&expand=3801) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub unsafe fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i { + let mov = a.as_i32x16(); + transmute(simd_select_bitmask(k, mov, src.as_i32x16())) +} + +/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
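+///
+/// A minimal usage sketch, mirroring the unit test added later in this patch
+/// (`assert_eq_m512i` is the helper used by this crate's test module):
+///
+/// ```ignore
+/// let a = _mm512_set1_epi32(2);
+/// // an all-zero mask zeroes every element; an all-ones mask passes a through unchanged
+/// assert_eq_m512i(_mm512_maskz_mov_epi32(0, a), _mm512_setzero_si512());
+/// assert_eq_m512i(_mm512_maskz_mov_epi32(0b11111111_11111111, a), a);
+/// ```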
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_epi32&expand=3802) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa32))] +pub unsafe fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i { + let mov = a.as_i32x16(); + let zero = _mm512_setzero_si512().as_i32x16(); + transmute(simd_select_bitmask(k, mov, zero)) +} + +/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_epi64&expand=3807) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub unsafe fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i { + let mov = a.as_i64x8(); + transmute(simd_select_bitmask(k, mov, src.as_i64x8())) +} + +/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_epi64&expand=3808) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovdqa64))] +pub unsafe fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i { + let mov = a.as_i64x8(); + let zero = _mm512_setzero_si512().as_i64x8(); + transmute(simd_select_bitmask(k, mov, zero)) +} + +/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_ps&expand=3825) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub unsafe fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 { + let mov = a.as_f32x16(); + transmute(simd_select_bitmask(k, mov, src.as_f32x16())) +} + +/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_ps&expand=3826) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovaps))] +pub unsafe fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 { + let mov = a.as_f32x16(); + let zero = _mm512_setzero_ps().as_f32x16(); + transmute(simd_select_bitmask(k, mov, zero)) +} + +/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_pd&expand=3819) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub unsafe fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d { + let mov = a.as_f64x8(); + transmute(simd_select_bitmask(k, mov, src.as_f64x8())) +} + +/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). 
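+///
+/// A minimal usage sketch, mirroring the unit test added later in this patch
+/// (`assert_eq_m512d` is the helper used by this crate's test module):
+///
+/// ```ignore
+/// let a = _mm512_set1_pd(2.);
+/// assert_eq_m512d(_mm512_maskz_mov_pd(0, a), _mm512_setzero_pd());
+/// assert_eq_m512d(_mm512_maskz_mov_pd(0b11111111, a), a);
+/// ```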
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_pd&expand=3820) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovapd))] +pub unsafe fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d { + let mov = a.as_f64x8(); + let zero = _mm512_setzero_pd().as_f64x8(); + transmute(simd_select_bitmask(k, mov, zero)) +} + /// Add packed 32-bit integers in a and b, and store the results in dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_add_epi32&expand=100) @@ -19110,6 +19202,44 @@ mod tests { assert_eq_m512(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_epi32() { + let src = _mm512_set1_epi32(1); + let a = _mm512_set1_epi32(2); + let r = _mm512_mask_mov_epi32(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi32(src, 0b11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_epi32() { + let a = _mm512_set1_epi32(2); + let r = _mm512_maskz_mov_epi32(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi32(0b11111111_11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_ps() { + let src = _mm512_set1_ps(1.); + let a = _mm512_set1_ps(2.); + let r = _mm512_mask_mov_ps(src, 0, a); + assert_eq_m512(r, src); + let r = _mm512_mask_mov_ps(src, 0b11111111_11111111, a); + assert_eq_m512(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_ps() { + let a = _mm512_set1_ps(2.); + let r = _mm512_maskz_mov_ps(0, a); + assert_eq_m512(r, _mm512_setzero_ps()); + let r = _mm512_maskz_mov_ps(0b11111111_11111111, a); + assert_eq_m512(r, a); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_add_epi32() { let a = _mm512_setr_epi32( diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index 758a2b563c..1b7c4ee088 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -68,6 +68,44 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_epi64() { + let src = _mm512_set1_epi64(1); + let a = _mm512_set1_epi64(2); + let r = _mm512_mask_mov_epi64(src, 0, a); + assert_eq_m512i(r, src); + let r = _mm512_mask_mov_epi64(src, 0b11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_epi64() { + let a = _mm512_set1_epi64(2); + let r = _mm512_maskz_mov_epi64(0, a); + assert_eq_m512i(r, _mm512_setzero_si512()); + let r = _mm512_maskz_mov_epi64(0b11111111, a); + assert_eq_m512i(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_mask_mov_pd() { + let src = _mm512_set1_pd(1.); + let a = _mm512_set1_pd(2.); + let r = _mm512_mask_mov_pd(src, 0, a); + assert_eq_m512d(r, src); + let r = _mm512_mask_mov_pd(src, 0b11111111, a); + assert_eq_m512d(r, a); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm512_maskz_mov_pd() { + let a = _mm512_set1_pd(2.); + let r = _mm512_maskz_mov_pd(0, a); + assert_eq_m512d(r, _mm512_setzero_pd()); + let r = _mm512_maskz_mov_pd(0b11111111, a); + assert_eq_m512d(r, a); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm512_mask_add_epi64() { let a = _mm512_setr_epi64(0, 1, -1, i64::MAX, i64::MIN, 100, -100, -32); From be334bb202593eaf848600ee7fb49ab715ce6cd2 Mon Sep 17 00:00:00 2001 From: jironglin 
Date: Thu, 15 Oct 2020 13:31:26 +0000 Subject: [PATCH 14/25] mask_add: ss,sd --- crates/core_arch/avx512f.md | 8 +- crates/core_arch/src/x86/avx512f.rs | 120 ++ examples/connect5_14_avx512.rs | 2100 +++++++++++++++++++++++++++ 3 files changed, 2224 insertions(+), 4 deletions(-) create mode 100644 examples/connect5_14_avx512.rs diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 57b3266a0d..01328a43f8 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1240,8 +1240,8 @@ * [ ] [`_mm_mask3_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ss&expand=5236) * [ ] [`_mm_mask_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sd&expand=5236) * [ ] [`_mm_mask_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_ss&expand=5236) - * [ ] [`_mm_mask_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sd&expand=5236) - * [ ] [`_mm_mask_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ss&expand=5236) + * [x] [`_mm_mask_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sd&expand=5236) + * [x] [`_mm_mask_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ss&expand=5236) * [x] [`_mm_mask_cmp_round_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=5236) * [x] [`_mm_mask_cmp_round_ss_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_ss_mask&expand=5236) * [x] [`_mm_mask_cmp_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sd_mask&expand=5236) @@ -1322,8 +1322,8 @@ * [ ] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236) * [ ] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236) * [ ] [`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236) - * [ ] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236) - * [ ] [`_mm_maskz_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ss&expand=5236) + * [x] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236) + * [x] [`_mm_maskz_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ss&expand=5236) * [ ] [`_mm_maskz_cvt_roundsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_ss&expand=5236) * [ ] [`_mm_maskz_cvt_roundss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sd&expand=5236) * [ ] [`_mm_maskz_cvtsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_ss&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 2ebfcca5b9..7804a8ce44 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18417,6 +18417,76 @@ pub unsafe fn _mm512_set_pd( _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) } +/// Add the lower single-precision (32-bit) floating-point element in a and 
b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss))] +pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta + extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_ss&expand=160) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss))] +pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta + extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_sd&expand=155) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd))] +pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta + extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
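+///
+/// A minimal usage sketch, mirroring the unit test added later in this patch;
+/// only bit 0 of k is consulted (`assert_eq_m128d` is the crate's test helper):
+///
+/// ```ignore
+/// let a = _mm_set_pd(1., 2.);
+/// let b = _mm_set_pd(3., 4.);
+/// assert_eq_m128d(_mm_maskz_add_sd(0, a, b), _mm_set_pd(1., 0.));
+/// assert_eq_m128d(_mm_maskz_add_sd(0b11111111, a, b), _mm_set_pd(1., 6.));
+/// ```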
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_sd&expand=156) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd))] +pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta + extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + /// Equal pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; /// Less-than @@ -28002,4 +28072,54 @@ mod tests { let e = _mm512_set1_epi32(11); assert_eq_m512i(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_add_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_add_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_add_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_add_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_add_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_add_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_add_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_add_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } } diff --git a/examples/connect5_14_avx512.rs b/examples/connect5_14_avx512.rs new file mode 100644 index 0000000000..696adfc70c --- /dev/null +++ b/examples/connect5_14_avx512.rs @@ -0,0 +1,2100 @@ +#![feature(stdsimd, avx512_target_feature)] + +#[cfg(target_arch = "x86")] +use {core_arch::arch::x86::*}; +#[cfg(target_arch = "x86_64")] +use {core_arch::arch::x86_64::*}; + + +use rand::seq::SliceRandom; +use rand::thread_rng; +use rand::Rng; + +use std::cmp; + +use std::time::{Duration, Instant}; + +// types + +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum Color { + Black, + White, + Empty, +} + +type Square = i32; +type Move = i32; +type Side = Color; +type Piece = Color; + +// constants + +const FILE_SIZE: i32 = 15; +const RANK_SIZE: i32 = 15; +const SQUARE_SIZE: i32 = (FILE_SIZE + 4) * (FILE_SIZE + 4 * 2 ) + 4; + +const EVAL_INF: i32 = (FILE_SIZE * RANK_SIZE * 100); +const MOVE_NONE: Move = -1; +const SCORE_NONE: i32 = -EVAL_INF - 1; + +const ENDCHECK: [[i32; 4]; 20] = [ [-4, -3, -2, -1], + [-3, -2, -1, 1], + [-2, -1, 1, 2], + [-1, 1, 2, 3], + [ 1, 2, 3, 4], + + [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 4 * (-FILE_SIZE - 4)], + [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4)], + [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4)], + [1 * 
(-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4)], + [1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4), 4 * ( FILE_SIZE + 4)], + + [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 4 * (-FILE_SIZE - 5)], + [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5)], + [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5)], + [1 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5)], + [1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5), 4 * ( FILE_SIZE + 5)], + + [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 4 * (-FILE_SIZE - 3)], + [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3)], + [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3)], + [1 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3)], + [1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3), 4 * ( FILE_SIZE + 3)] + ]; + +const PATTERNFILE4: [i32; 7] = [0, 1, 2, 3, 4, 5, 6]; +const PATTERNRANK4: [i32; 7] = [0, 1 * (FILE_SIZE + 4), 2 * (FILE_SIZE + 4), 3 * (FILE_SIZE + 4), 4 * (FILE_SIZE + 4), 5 * (FILE_SIZE + 4), 6 * (FILE_SIZE + 4)]; +const PATTERNDIAL4: [i32; 7] = [0, 1 * (FILE_SIZE + 5), 2 * (FILE_SIZE + 5), 3 * (FILE_SIZE + 5), 4 * (FILE_SIZE + 5), 5 * (FILE_SIZE + 5), 6 * (FILE_SIZE + 5)]; +const PATTERNDIAR4: [i32; 7] = [0, 1 * (FILE_SIZE + 3), 2 * (FILE_SIZE + 3), 3 * (FILE_SIZE + 3), 4 * (FILE_SIZE + 3), 5 * (FILE_SIZE + 3), 6 * (FILE_SIZE + 3)]; + +const MAPMOVEVALUE: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 
1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, + 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, + 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, + 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, + 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, + 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, + 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, + 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, + 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, + 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, + 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, + 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, + 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, + 0, 0, 0, 0, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, + 0, 0, 0, 0, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 0, 0, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 0, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 0, 0, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 0, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 
1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, + 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, + 0, 0, 0, 0, 0, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<20, 1<<20, 1<<20, + 0, 0, 0, 0, 0, 0, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<19, 1<<19, + 0, 0, 0, 0, 0, 0, 0, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<18, + 0, 0, 0, 0, 0, 0, 0, 0, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, + 0, 0, 0, 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<31, + 0, 0, 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, + 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 0, + 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 0, 0, + 0, 0, 0, 0, 1<<18, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 0, 0, 0, + 0, 0, 0, 0, 1<<17, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 0, 0, 0, 0] + ]; + +const MAPMOVEIDX: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, + 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, + 0, 0, 0, 0, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, + 0, 0, 0, 0, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + 0, 0, 0, 0, 15, 14, 13, 6, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + 0, 0, 0, 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 0, 0, 0, 0, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, + 0, 0, 0, 0, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, + 0, 0, 0, 0, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, + 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, + 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, + 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, + 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9 + 0, 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10], + + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 0, 0, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 0, 
0, 0, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 0, 0, 0, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, + 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, + 0, 0, 0, 0, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, + 0, 0, 0, 0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, 0] + ]; + +// variables + +static mut Endgame: bool = false; + +// structures + +pub struct Pos { // position + state: [Color; SQUARE_SIZE as usize], + p_turn: Side, + p_last: Move, + + bitboard: [[[i32; 20]; 4]; 3], + +} + +impl Pos { + + pub fn init(&mut self) { // starting position + for i in 0..SQUARE_SIZE as usize { + self.state[i] = Color::Empty; + } + + self.p_turn = Color::Black; + self.p_last = square_make(0, 0); + + //-------------------------------------------- + + for i in 0..4 { + for j in 0..20 { + self.bitboard[Color::Black as usize][i][j] = 0; + } + } + + for i in 0..4 { + for j in 0..20 { + self.bitboard[Color::White as usize][i][j] = 0; + } + } + + for i in 0..2 { + for j in 0..20 { + self.bitboard[Color::Empty as usize][i][j] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); + } + } + + self.bitboard[Color::Empty as usize][2][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); + self.bitboard[Color::Empty as usize][2][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); + self.bitboard[Color::Empty as usize][2][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); + self.bitboard[Color::Empty as usize][2][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); + self.bitboard[Color::Empty as usize][2][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); + self.bitboard[Color::Empty as usize][2][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); + self.bitboard[Color::Empty as usize][2][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); + self.bitboard[Color::Empty as usize][2][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); + self.bitboard[Color::Empty as usize][2][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19); + self.bitboard[Color::Empty as usize][2][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); + self.bitboard[Color::Empty as usize][2][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); + self.bitboard[Color::Empty as usize][2][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); + self.bitboard[Color::Empty as usize][2][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); + self.bitboard[Color::Empty as usize][2][13] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); + self.bitboard[Color::Empty as usize][2][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); + self.bitboard[Color::Empty as usize][2][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); + 
self.bitboard[Color::Empty as usize][2][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); + self.bitboard[Color::Empty as usize][2][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); + self.bitboard[Color::Empty as usize][2][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); + self.bitboard[Color::Empty as usize][2][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); + + self.bitboard[Color::Empty as usize][3][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); + self.bitboard[Color::Empty as usize][3][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); + self.bitboard[Color::Empty as usize][3][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); + self.bitboard[Color::Empty as usize][3][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); + self.bitboard[Color::Empty as usize][3][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); + self.bitboard[Color::Empty as usize][3][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); + self.bitboard[Color::Empty as usize][3][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); + self.bitboard[Color::Empty as usize][3][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); + self.bitboard[Color::Empty as usize][3][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19); + self.bitboard[Color::Empty as usize][3][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); + self.bitboard[Color::Empty as usize][3][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); + self.bitboard[Color::Empty as usize][3][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); + self.bitboard[Color::Empty as usize][3][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); + self.bitboard[Color::Empty as usize][3][13] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); + self.bitboard[Color::Empty as usize][3][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); + self.bitboard[Color::Empty as usize][3][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); + self.bitboard[Color::Empty as usize][3][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); + self.bitboard[Color::Empty as usize][3][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); + self.bitboard[Color::Empty as usize][3][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); + self.bitboard[Color::Empty as usize][3][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); + } + + pub fn do_move(&mut self, mv: Move) { + + let atk: Side = self.p_turn; + let def: Side = side_opp(atk); + + match self.p_turn { + Color::Black => { self.state[mv as usize] = Color::Black; + + for i in 0..4 { + self.bitboard[Color::Black as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= MAPMOVEVALUE[i][mv as usize]; + self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize]; + } + }, + + Color::White => { self.state[mv as usize] = Color::White; + + for i in 0..4 { + self.bitboard[Color::White as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= MAPMOVEVALUE[i][mv as 
usize]; + self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize]; + } + }, + + Color::Empty => {}, + } + + self.p_last = mv; + + self.p_turn = def; + } + + fn turn(&self) -> Side { + self.p_turn + } + + pub fn can_play(&self, from: Square) -> bool { + + if self.state[from as usize] == Color::Empty { true } else { false } + } + + pub fn count(&self, pc: Piece) -> i32 { + + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq: Square = square_make(fl, rk); + if self.state[sq as usize] == pc { n += 1; } + } + } + n + } +} + +pub struct List { // legal move list + p_move: [Move; (FILE_SIZE * RANK_SIZE) as usize], + p_size: i32, +} + +impl List { + + pub fn clear(&mut self) { + self.p_size = 0; + } + + pub fn add(&mut self, mv: Move) { + self.p_move[self.p_size as usize] = mv; + self.p_size += 1; + } + + pub fn size(&self) -> i32 { + self.p_size + } + + pub fn shuffle(&mut self) { + + let mut rng = thread_rng(); + + let num = self.p_size; + + let mut new_move: Vec = vec![]; + + for x in 0..(num as usize) { + new_move.push(self.p_move[x]); + } + + new_move.shuffle(&mut rng); + + for x in 0..(self.p_size as usize) { + self.p_move[x] = new_move[x]; + } + } + + //pub fn move(&self, i: i32) -> Move { + // self.p_move[i as usize] + //} +} + +// functions +// +fn square_make(fl: i32, rk: i32) -> Square { + (rk + 4) * (FILE_SIZE + 4) + (fl + 4) +} + +fn square_file(sq: Square) -> i32 { + sq % (FILE_SIZE + 4) - 4 +} + +fn square_rank(sq: Square) -> i32 { + sq / (FILE_SIZE + 4) - 4 +} + +fn side_opp(sd: Side) -> Side { + + let mut out: Side; + + match sd { + Side::White => out = Side::Black, + Side::Black => out = Side::White, + Side::Empty => panic!(""), + } + + out +} + +fn pos_is_winner(pos : &Pos) -> bool { + + let current_side = side_opp(pos.p_turn); + + let mut found : bool = true; + + for x in 0..20 { + for y in 0..4 { + + found = true; + + let adj = pos.p_last + ENDCHECK[x][y]; + + if pos.state[adj as usize] != current_side { found = false; break } + } + if found == true { break; } + } + + found +} + +fn pos_is_winner_scan(pos : &Pos) -> bool { + + let current_side = side_opp(pos.p_turn); + + if check_patternfile5(&pos, current_side) || + check_patternrank5(&pos, current_side) || + check_patterndial5(&pos, current_side) || + check_patterndiar5(&pos, current_side) { return true } + + false +} + +fn pos_is_draw(pos : &Pos) -> bool { + + + let mut found : bool = true; + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + + let sq: Square = square_make(fl, rk); + if pos.can_play(sq) { + found = false; + break; + } + + if found == false { break;} + } + } +/* + + let mut test: bool = false; + + if pos.bitboard[Color::Empty as usize][0][0] == 0 && + pos.bitboard[Color::Empty as usize][0][1] == 0 && + pos.bitboard[Color::Empty as usize][0][2] == 0 && + pos.bitboard[Color::Empty as usize][0][3] == 0 && + pos.bitboard[Color::Empty as usize][0][4] == 0 && + pos.bitboard[Color::Empty as usize][0][5] == 0 && + pos.bitboard[Color::Empty as usize][0][6] == 0 && + pos.bitboard[Color::Empty as usize][0][7] == 0 && + pos.bitboard[Color::Empty as usize][0][8] == 0 { test = true; } else { test = false; } +*/ + //if test != found { println!("bitboard!!!!!!!!!!!!!!!!!!!! 
pos_is_draw wrong!!!!!!!!!!!!!!"); } + + let mut out: bool = false; + + //if test && unsafe {!pos_is_winner_avx512(pos)} { out = true; } + if found == true && !pos_is_winner_scan(pos) { out = true; } + + out +} + +fn pos_is_end(pos : &Pos) -> bool { + + if pos_is_winner_scan(pos) || pos_is_draw(pos) { + true + } else { + false + } +} + +fn pos_disp(pos: &Pos) { + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + + let sq: Square = square_make(fl, rk); + + match pos.state[sq as usize] { + Color::Black => print!("# "), + Color::White => print!("O "), + Color::Empty => print!("- "), + } + } + + println!(""); + } + + match pos.turn() { + Color::Black => println!("black to play"), + Color::White => println!("white to play"), + _ => (), + } +} + +fn gen_moves(list : &mut List, pos: &Pos) { + + list.clear(); + + for rk in 0..RANK_SIZE { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + if pos.can_play(sq) { list.add(sq); } + } + } + +} + +fn search(pos : &Pos, depth: i32, endgame: i32) -> Move { + + //println!("call search"); + + let mut new_depth = depth; + + let empties: i32 = pos.count(Color::Empty); + if (empties <= endgame || new_depth > empties ) { new_depth = empties; } + + if(new_depth == empties) { unsafe { Endgame = true; } } + + search_real(pos, -EVAL_INF, EVAL_INF, new_depth, 0) + +} + +fn search_real(pos: &Pos, alpha: i32, beta: i32, depth: i32, ply: i32) -> i32 { + + + assert!(-EVAL_INF <= alpha && alpha < beta && beta <= EVAL_INF); + //println!("call search_real"); + //println!("depth = {}", depth); + //println!("ply = {}", ply); + // leaf? + + //if unsafe { pos_is_winner_avx512(&pos) } != pos_is_winner(&pos) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!!"); } + if pos_is_winner_scan(&pos) { return -EVAL_INF + ply } + //if unsafe { pos_is_winner_avx512(&pos) } { return -EVAL_INF + ply } + + + if pos_is_draw(&pos) { return 0 } + + if depth == 0 { + return eval(&pos, ply) + } + + let p_move_new : [Move; (FILE_SIZE * RANK_SIZE) as usize] = [0; (FILE_SIZE * RANK_SIZE) as usize]; + + let mut list = List { + p_move: p_move_new, + p_size: 0, + }; + + let mut bm: Move = MOVE_NONE; + let mut bs: i32 = SCORE_NONE; + + gen_moves(&mut list, &pos); + + // move loop + + if ply == 0 { list.shuffle(); } + + for i in 0..list.size() { + + if bs < beta { + + let mv: Move = list.p_move[i as usize]; + + let mut new_pos = Pos { + state: pos.state, + p_turn: pos.p_turn, + p_last: pos.p_last, + + bitboard: pos.bitboard, + }; + + //println!("p_last = {}", new_pos.p_last); + + new_pos.do_move(mv); + + //println!("After do _move p_last = {}", new_pos.p_last); + + let sc: i32 = -search_real(&new_pos, -beta, -cmp::max(alpha, bs), depth - 1, ply + 1); + + + //if sc >= 410 || sc <= -410 { + //println!("sc = {} depth = {}-------------------------------", sc, depth); + + //pos_disp(&new_pos); + //} + + + if sc > bs { bm = mv; bs = sc; } + + } + } + + assert!(bm != MOVE_NONE); + assert!(bs >= -EVAL_INF && bs <= EVAL_INF); + + if ply == 0 { bm } else { bs } //best move at the root node, best score elsewhere + //bs +} + +fn result(pos: &Pos) -> i32 { + + if(pos_is_winner_scan(pos)) { + -(FILE_SIZE*RANK_SIZE*100) + } else { + 0 + } +} + + +fn eval(pos: &Pos, ply: i32) -> i32 { + + let atk: Side = pos.turn(); + let def: Side = side_opp(atk); + + //let mut sc: i32 = 0; + + let check_live4: Side = def; + let check_live4_opp: Side = atk; + + //if ply % 2 == 1 { check_live4 = def; check_live4_opp = atk; } else { check_live4 = atk; check_live4_opp = def; } + //if ply % 2 == 0 { check_live4 = 
def; check_live4_opp = atk; } else { check_live4 = atk; check_live4_opp = def; } +/* + if unsafe { check_pattern4_once_avx512(&pos, check_live4) } != (check_patternfile4_once(&pos, check_live4) || check_patternrank4_once(&pos, check_live4) || check_patterndial4_once(&pos, check_live4) || check_patterndiar4_once(&pos, check_live4) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! self "); pos_disp(&pos); } + if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } != (check_patternfile4_once(&pos, check_live4_opp) || check_patternrank4_once(&pos, check_live4_opp) || check_patterndial4_once(&pos, check_live4_opp) || check_patterndiar4_once(&pos, check_live4_opp) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! opp "); pos_disp(&pos); } + + #[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern4_dead_avx512(&pos, check_live4_opp); + + let mut temp_check: [__mmask16; 5] = [0; 5]; + + for i in 0..5 { + + let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]); + let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]); + let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]); + let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]); + let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]); + let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]); + let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]); + let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]); + let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]); + let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]); + + let check_mask10 = _kor_mask16(check_mask0, check_mask1); + let check_mask11 = _kor_mask16(check_mask2, check_mask3); + let check_mask12 = _kor_mask16(check_mask4, check_mask5); + let check_mask13 = _kor_mask16(check_mask6, check_mask7); + let check_mask14 = _kor_mask16(check_mask8, check_mask9); + + let check_mask16 = _kor_mask16(check_mask10, check_mask11); + let check_mask17 = _kor_mask16(check_mask12, check_mask13); + let check_mask18 = _kor_mask16(check_mask16, check_mask17); + temp_check[i] = _kor_mask16(check_mask18, check_mask14); + + } + + let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]); + let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]); + let check_mask2 = _kor_mask16(check_mask0, check_mask1); + let check_mask3 = _kor_mask16(check_mask2, temp_check[4]); + + let test1: bool = check_patternfile4_dead(&pos, check_live4_opp) || check_patternrank4_dead(&pos, check_live4_opp) || check_patterndial4_dead(&pos, check_live4_opp) || check_patterndiar4_dead(&pos, check_live4_opp); + + let mut test2: bool = true; + + if check_mask3 > 0 { test2 = true; } else { test2 = false; } + + if test1 != test2 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead !!!!!! opp "); pos_disp(&pos); } + } + + #[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern4_dead_avx512(&pos, check_live4); + + let mut count: i32 = 0; + + for i in 0..5 { + for j in 0..4 { + for k in 0..5 { + count += _popcnt32(result[i][j][k] as i32); + } + } + } + + let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4); + let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4); + let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4); + let c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4); + + + if (c4f+c4r+c4dl+c4dr) != count { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead_count !!!!!! 
opp org = {}, new = {}", c4f+c4r+c4dl+c4dr, count); pos_disp(&pos); } + } + + #[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern3_live_avx512(&pos, check_live4); + + let mut count: i32 = 0; + + for i in 0..3 { + for j in 0..4 { + for k in 0..5 { + count += _popcnt32(result[i][j][k] as i32); + } + } + } + + let c3f: i32 = check_patternfile3_live_n(&pos, check_live4); + let c3r: i32 = check_patternrank3_live_n(&pos, check_live4); + let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4); + let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4); + + let mut count1: i32 = 0; + + count1 = c3f+c3r+c3dl+c3dr; + + if count != count1 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! live3_dead !!!!!! self org = {}, new = {}", count1, count); pos_disp(&pos); } + } +*/ + + if check_patternfile4_once(&pos, check_live4) || + check_patternrank4_once(&pos, check_live4) || + check_patterndial4_once(&pos, check_live4) || + check_patterndiar4_once(&pos, check_live4) { return -4096 } + + //if unsafe { check_pattern4_once_avx512(&pos, check_live4) } { return -4096 } + + if check_patternfile4_once(&pos, check_live4_opp) || + check_patternrank4_once(&pos, check_live4_opp) || + check_patterndial4_once(&pos, check_live4_opp) || + check_patterndiar4_once(&pos, check_live4_opp) { return 2560 } + + //if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } { return 2560 } + + if check_patternfile4_dead(&pos, check_live4_opp) || + check_patternrank4_dead(&pos, check_live4_opp) || + check_patterndial4_dead(&pos, check_live4_opp) || + check_patterndiar4_dead(&pos, check_live4_opp) { return 2560 } + + /*#[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern4_dead_avx512(&pos, check_live4_opp); + + let mut temp_check: [__mmask16; 5] = [0; 5]; + + for i in 0..5 { + let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]); + let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]); + let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]); + let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]); + let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]); + let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]); + let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]); + let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]); + let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]); + let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]); + + let check_mask10 = _kor_mask16(check_mask0, check_mask1); + let check_mask11 = _kor_mask16(check_mask2, check_mask3); + let check_mask12 = _kor_mask16(check_mask4, check_mask5); + let check_mask13 = _kor_mask16(check_mask6, check_mask7); + let check_mask14 = _kor_mask16(check_mask8, check_mask9); + + let check_mask16 = _kor_mask16(check_mask10, check_mask11); + let check_mask17 = _kor_mask16(check_mask12, check_mask13); + let check_mask18 = _kor_mask16(check_mask16, check_mask17); + temp_check[i] = _kor_mask16(check_mask18, check_mask14); + } + + let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]); + let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]); + let check_mask2 = _kor_mask16(check_mask0, check_mask1); + let check_mask3 = _kor_mask16(check_mask2, temp_check[4]); + + if check_mask3 > 0 { return 2560 } + } +*/ + // 4,3 + let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4); + let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4); + let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4); + let 
c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4); + + let c3f: i32 = check_patternfile3_live_n(&pos, check_live4); + let c3r: i32 = check_patternrank3_live_n(&pos, check_live4); + let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4); + let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4); + + let n_c4: i32 = c4f + c4r + c4dl + c4dr; + + if n_c4 > 1 { return -2048 } + + if n_c4 == 1 && ( c3f+c3r+c3dl+c3dr > 0 ) { return -3048 } + +/* + #[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern4_dead_avx512(&pos, check_live4); + + let mut count4: i32 = 0; + + for i in 0..5 { + for j in 0..4 { + for k in 0..5 { + count4 += _popcnt32(result[i][j][k] as i32); + } + } + } + + if count4 > 1 { return -2048 } + else if count4 == 1 { + + let result = check_pattern3_live_avx512(&pos, check_live4); + + let mut count3: i32 = 0; + + for i in 0..3 { + for j in 0..4 { + for k in 0..5 { + count3 += _popcnt32(result[i][j][k] as i32); + } + } + } + + if count3 > 0 { return -3048 } + } + } + */ + //--------------------------------------------------------------------------- + + let c3f_opp = check_patternfile3_live_n(&pos, check_live4_opp); + let c3r_opp = check_patternrank3_live_n(&pos, check_live4_opp); + let c3dl_opp = check_patterndial3_live_n(&pos, check_live4_opp); + let c3dr_opp = check_patterndiar3_live_n(&pos, check_live4_opp); + if c3f_opp + c3r_opp + c3dl_opp + c3dr_opp > 1 { return 2560 } + + if c3f + c3r + c3dl + c3dr > 1 { return -2048 } + /* + #[target_feature(enable = "avx512f")] + unsafe { + let result = check_pattern3_live_avx512(&pos, check_live4_opp); + + let mut count: i32 = 0; + + for i in 0..3 { + for j in 0..4 { + for k in 0..5 { + count += _popcnt32(result[i][j][k] as i32); + } + } + } + + let c3f: i32 = check_patternfile3_live_n(&pos, check_live4_opp); + let c3r: i32 = check_patternrank3_live_n(&pos, check_live4_opp); + let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4_opp); + let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4_opp); + + let mut count1: i32 = 0; + + count1 = c3f+c3r+c3dl+c3dr; + + if count1 > 1 { return -2048 } + } +*/ + 0 +} + + +fn check_patternfile4_once(pos: &Pos, sd: Side) -> bool { + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + let idx5 = sq + PATTERNFILE4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patternrank4_once(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + let idx5 = sq + PATTERNRANK4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == 
sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patterndial4_once(pos: &Pos, sd : Side) -> bool { + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + let idx5 = sq + PATTERNDIAL4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patterndiar4_once(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 5) { + for fl in 5..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + let idx5 = sq + PATTERNDIAR4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patternfile4_dead(pos: &Pos, sd: Side) -> bool { + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patternrank4_dead(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } + if val0 == sd && val1 == Color::Empty && 
val2 == sd && val3 == sd && val4 == sd { return true } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patterndial4_dead(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patterndiar4_dead(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 4..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + + +fn check_patternfile4_dead_n(pos: &Pos, sd: Side) -> i32 { + + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + + n +} + +fn check_patternrank4_dead_n(pos: &Pos, sd: Side) -> i32 { + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + 
PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + + n +} + +fn check_patterndial4_dead_n(pos: &Pos, sd: Side) -> i32 { + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + + n +} + +fn check_patterndiar4_dead_n(pos: &Pos, sd: Side) -> i32 { + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 4..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } + if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } + if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } + } + } + + n +} + + +/*fn check_patternfile3_live(pos: &Pos, sd: Side) -> bool { + + let last_move: Move = pos.p_last; + + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as 
usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + } + } + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + let idx5 = sq + PATTERNFILE4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patternrank3_live(pos: &Pos, sd: Side) -> bool { + + let last_move: Move = pos.p_last; + + // let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + let idx5 = sq + PATTERNRANK4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patterndial3_live(pos: &Pos, sd: Side) -> bool { + + let last_move: Move = pos.p_last; + + //let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + let idx5 = sq + 
PATTERNDIAL4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} + +fn check_patterndiar3_live(pos: &Pos, sd: Side) -> bool { + + let last_move: Move = pos.p_last; + + //let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 4..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 5..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + let idx5 = sq + PATTERNDIAR4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } + } + } + + false +} +*/ + +fn check_patternfile3_live_n(pos: &Pos, sd: Side) -> i32 { + + let last_move: Move = pos.p_last; + + let mut n: i32 = 0; + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n +=1 ; } + } + } + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + let idx5 = sq + PATTERNFILE4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == 
Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + + n +} + +fn check_patternrank3_live_n(pos: &Pos, sd: Side) -> i32 { + + let last_move: Move = pos.p_last; + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + let idx5 = sq + PATTERNRANK4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + + n +} + +fn check_patterndial3_live_n(pos: &Pos, sd: Side) -> i32 { + + let last_move: Move = pos.p_last; + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 0..(FILE_SIZE - 5) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + let idx4 = sq + PATTERNDIAL4[4]; + let idx5 = sq + PATTERNDIAL4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + + n +} + +fn check_patterndiar3_live_n(pos: &Pos, sd: Side) -> i32 { + + let last_move: Move = pos.p_last; + + let mut n: i32 = 0; + + for rk in 0..(RANK_SIZE - 4) { + for fl in 4..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + 
PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } + } + } + + for rk in 0..(RANK_SIZE - 5) { + for fl in 5..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + let idx5 = sq + PATTERNDIAR4[5]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + let val5 = pos.state[idx5 as usize]; + + if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } + if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } + } + } + + n +} + +#[target_feature(enable = "avx512f")] +unsafe fn pos_is_winner_avx512(pos : &Pos) -> bool { + + let current_side = side_opp(pos.p_turn); + + let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) ); + + let answer_mask: __mmask16 = 0b01111111_11111111; + + let coloridx = current_side as usize; + + let mut temp_mask: [[__mmask16; 5]; 4] = [[0; 5]; 4]; + + for dir in 0..4 { + let board0 = _mm512_set_epi32(0, pos.bitboard[coloridx][dir][14], pos.bitboard[coloridx][dir][13], pos.bitboard[coloridx][dir][12], pos.bitboard[coloridx][dir][11], pos.bitboard[coloridx][dir][10], pos.bitboard[coloridx][dir][9], pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); + + let boardf = _mm512_and_epi32(answer, board0); + + temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + + for i in 1..5 { + + let board1 = _mm512_rol_epi32(board0, i); + + let boardf = _mm512_and_epi32(answer, board1); + + temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + } + } + + let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]); + let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]); + let check_mask2: __mmask16 = _kor_mask16(temp_mask[0][4], temp_mask[1][0]); + let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][1], temp_mask[1][2]); + let check_mask4: __mmask16 = _kor_mask16(temp_mask[1][3], temp_mask[1][4]); + let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]); + let check_mask6: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]); + let check_mask7: __mmask16 = _kor_mask16(temp_mask[2][4], temp_mask[3][0]); + let check_mask8: __mmask16 = _kor_mask16(temp_mask[3][1], temp_mask[3][2]); + let check_mask9: __mmask16 = _kor_mask16(temp_mask[3][3], temp_mask[3][4]); + + let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1); + let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3); + let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5); + let check_mask13: 
__mmask16 = _kor_mask16(check_mask6, check_mask7); + let check_mask14: __mmask16 = _kor_mask16(check_mask8, check_mask9); + + let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11); + let check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13); + let check_mask18: __mmask16 = _kor_mask16(check_mask16, check_mask17); + let check_mask19: __mmask16 = _kor_mask16(check_mask18, check_mask14); + + if check_mask19 > 0 { return true } else { return false } +} + +#[target_feature(enable = "avx512f")] +unsafe fn check_pattern4_once_avx512(pos : &Pos, sd: Side) -> bool { + + //let current_side = side_opp(sd); + + let answer_color = _mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) ); + let answer_empty = _mm512_set1_epi32( (1<<31)| (1<<26) ); + let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)); + + let answer_mask: __mmask16 = 0b00000001_11111111; + + //let coloridx = current_side as usize; + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + let mut temp_mask: [[__mmask16; 4]; 4] = [[0; 4]; 4]; + + for dir in 0..4 { + + let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); + + let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); + + let boardf1 = _mm512_and_epi32(answer_color, board0);// check sd + let boardf2 = _mm512_and_epi32(answer_empty, board1);// check empty + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + + for i in 1..4 { //only move 3 times + + let board2 = _mm512_rol_epi32(board0, i);//rot sd + let board3 = _mm512_rol_epi32(board1, i);//rot empty + + let boardf1 = _mm512_and_epi32(answer_color, board2); + let boardf2 = _mm512_and_epi32(answer_empty, board3); + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + } + } + + let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]); + let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]); + let check_mask2: __mmask16 = _kor_mask16(temp_mask[1][0], temp_mask[1][1]); + let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][2], temp_mask[1][3]); + let check_mask4: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]); + let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]); + let check_mask6: __mmask16 = _kor_mask16(temp_mask[3][0], temp_mask[3][1]); + let check_mask7: __mmask16 = _kor_mask16(temp_mask[3][2], temp_mask[3][3]); + + let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1); + let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3); + let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5); + let check_mask13: __mmask16 = _kor_mask16(check_mask6, check_mask7); + + let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11); + let 
check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13); + let check_mask19: __mmask16 = _kor_mask16(check_mask16, check_mask17); + + if check_mask19 > 0 { return true } else { return false } +} + +#[target_feature(enable = "avx512f")] +unsafe fn check_pattern4_dead_avx512(pos : &Pos, sd: Side) -> [[[__mmask16; 5]; 4]; 5] { + + //let current_side = side_opp(sd); + + let answer_color: [__m512i; 5] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) ), + _mm512_set1_epi32( (1<<31)| (1<<29)|(1<<28)|(1<<27) ), + _mm512_set1_epi32( (1<<31)|(1<<30) |(1<<28)|(1<<27) ), + _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29) |(1<<27) ), + _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28) )]; + + let answer_empty: [__m512i; 5]= [_mm512_set1_epi32( (1<<31) ), + _mm512_set1_epi32( (1<<30) ), + _mm512_set1_epi32( (1<<29) ), + _mm512_set1_epi32( (1<<28) ), + _mm512_set1_epi32( (1<<27) )]; + + let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)); + + let answer_mask: __mmask16 = 0b00000001_11111111; + + //let coloridx = current_side as usize; + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + let mut temp_mask: [[[__mmask16; 5]; 4]; 5] = [[[0; 5]; 4]; 5]; + + for pattern in 0..5 { + + for dir in 0..4 { + + let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); + + let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); + + let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd + let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + + for i in 1..5 { //only move 4 times + + let board2 = _mm512_rol_epi32(board0, i);//rot sd + let board3 = _mm512_rol_epi32(board1, i);//rot empty + + let boardf1 = _mm512_and_epi32(answer_color[pattern], board2); + let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3); + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above + } + } + } + + temp_mask +} + + +#[target_feature(enable = "avx512f")] +unsafe fn check_pattern3_live_avx512(pos : &Pos, sd: Side) -> [[[__mmask16; 5]; 4]; 3] { + + //let current_side = side_opp(sd); + + let answer_color: [__m512i; 3] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28) ), + _mm512_set1_epi32( (1<<30)| (1<<28)|(1<<27) ), + _mm512_set1_epi32( (1<<30)|(1<<29) |(1<<27) )]; + + let answer_empty: [__m512i; 3]= [_mm512_set1_epi32( (1<<31)| (1<<27) ), + _mm512_set1_epi32( (1<<31)| (1<<29)| (1<<26) ), + _mm512_set1_epi32( (1<<31)| (1<<28)| (1<<26) )]; + + //let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)); + let answer: [__m512i; 3] = [_mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) ), + _mm512_set1_epi32( 
(1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) ), + _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) )]; + + let answer_mask: __mmask16 = 0b00000001_11111111; + + //let coloridx = current_side as usize; + let coloridx = sd as usize; + let emptyidx = Color::Empty as usize; + + let mut temp_mask: [[[__mmask16; 5]; 4]; 3] = [[[0; 5]; 4]; 3]; + + for pattern in 0..3 { + + for dir in 0..4 { + + let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); + + let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); + + let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd + let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0);//no need answer_mask, because and above + + for i in 1..5 { //only move 4 times + + let board2 = _mm512_rol_epi32(board0, i);//rot sd + let board3 = _mm512_rol_epi32(board1, i);//rot empty + + let boardf1 = _mm512_and_epi32(answer_color[pattern], board2); + let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3); + let boardf = _mm512_or_epi32(boardf1, boardf2); + + temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0);//no need answer_mask, because and above + } + } + } + + temp_mask +} + +fn check_patternfile5(pos: &Pos, sd: Side) -> bool { + + for rk in 0..RANK_SIZE { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNFILE4[0]; + let idx1 = sq + PATTERNFILE4[1]; + let idx2 = sq + PATTERNFILE4[2]; + let idx3 = sq + PATTERNFILE4[3]; + let idx4 = sq + PATTERNFILE4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patternrank5(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNRANK4[0]; + let idx1 = sq + PATTERNRANK4[1]; + let idx2 = sq + PATTERNRANK4[2]; + let idx3 = sq + PATTERNRANK4[3]; + let idx4 = sq + PATTERNRANK4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patterndial5(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 0..(FILE_SIZE - 4) { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAL4[0]; + let idx1 = sq + PATTERNDIAL4[1]; + let idx2 = sq + PATTERNDIAL4[2]; + let idx3 = sq + PATTERNDIAL4[3]; + 
let idx4 = sq + PATTERNDIAL4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn check_patterndiar5(pos: &Pos, sd: Side) -> bool { + + for rk in 0..(RANK_SIZE - 4) { + for fl in 4..FILE_SIZE { + let sq : Square = square_make(fl, rk); + + let idx0 = sq + PATTERNDIAR4[0]; + let idx1 = sq + PATTERNDIAR4[1]; + let idx2 = sq + PATTERNDIAR4[2]; + let idx3 = sq + PATTERNDIAR4[3]; + let idx4 = sq + PATTERNDIAR4[4]; + + let val0 = pos.state[idx0 as usize]; + let val1 = pos.state[idx1 as usize]; + let val2 = pos.state[idx2 as usize]; + let val3 = pos.state[idx3 as usize]; + let val4 = pos.state[idx4 as usize]; + + if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } + } + } + + false +} + +fn main() { + + loop + { + + let start = Instant::now(); + + println!("Hello, this is connect 6!"); + + //unsafe { test_avx512(); } + + let test_state: [Color; SQUARE_SIZE as usize] = [Color::Empty; SQUARE_SIZE as usize]; + + let test_bitboard: [[[i32; FILE_SIZE as usize]; 4]; 3] = [[[0; FILE_SIZE as usize]; 4]; 3]; + + let mut test1 = Pos { + state: test_state, + p_turn: Color::Black, + p_last: square_make(5,5), + + bitboard: test_bitboard, + }; + + test1.init(); + + //pos_disp(&test1); + + for i in 0..(FILE_SIZE*RANK_SIZE) { + + // println!("----------------------------------------\n\n\n\n"); + // println!("MOVE {}!!!!\n\n\n\n", i); + + + let mut d = 2; + let mut e = 4; + + //if i < 6 { d = 1; e = 2; } + + let next_move: Move = search(&test1, d, e); + //println!("next move is {}", next_move); + //println!("file is {}", square_file(next_move)); + //println!("rank is {}", square_rank(next_move)); + + test1.do_move(next_move); + + //pos_disp(&test1); + + if pos_is_end(&test1) { + + println!("Game over!!!!!!"); + println!("MOVE {}!!!!\n", i); + //pos_disp(&test1); + + break; } + } + + + let duration = start.elapsed(); + + println!("Time elapsed in expensive_function() is: {:?}", duration); + } + + +} From b4023d1bdb6fd4e0a3c1c5cf282724567953c14e Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 13:32:07 +0000 Subject: [PATCH 15/25] mask_add: ss,sd --- examples/connect5_14_avx512.rs | 2100 -------------------------------- 1 file changed, 2100 deletions(-) delete mode 100644 examples/connect5_14_avx512.rs diff --git a/examples/connect5_14_avx512.rs b/examples/connect5_14_avx512.rs deleted file mode 100644 index 696adfc70c..0000000000 --- a/examples/connect5_14_avx512.rs +++ /dev/null @@ -1,2100 +0,0 @@ -#![feature(stdsimd, avx512_target_feature)] - -#[cfg(target_arch = "x86")] -use {core_arch::arch::x86::*}; -#[cfg(target_arch = "x86_64")] -use {core_arch::arch::x86_64::*}; - - -use rand::seq::SliceRandom; -use rand::thread_rng; -use rand::Rng; - -use std::cmp; - -use std::time::{Duration, Instant}; - -// types - -#[derive(Clone, Copy, PartialEq, Eq)] -pub enum Color { - Black, - White, - Empty, -} - -type Square = i32; -type Move = i32; -type Side = Color; -type Piece = Color; - -// constants - -const FILE_SIZE: i32 = 15; -const RANK_SIZE: i32 = 15; -const SQUARE_SIZE: i32 = (FILE_SIZE + 4) * (FILE_SIZE + 4 * 2 ) + 4; - -const EVAL_INF: i32 = (FILE_SIZE * RANK_SIZE * 100); -const MOVE_NONE: Move = -1; -const SCORE_NONE: i32 = -EVAL_INF - 1; - -const ENDCHECK: [[i32; 4]; 20] = [ [-4, -3, -2, -1], - [-3, -2, -1, 
1], - [-2, -1, 1, 2], - [-1, 1, 2, 3], - [ 1, 2, 3, 4], - - [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 4 * (-FILE_SIZE - 4)], - [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 3 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4)], - [1 * (-FILE_SIZE - 4), 2 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4)], - [1 * (-FILE_SIZE - 4), 1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4)], - [1 * ( FILE_SIZE + 4), 2 * ( FILE_SIZE + 4), 3 * ( FILE_SIZE + 4), 4 * ( FILE_SIZE + 4)], - - [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 4 * (-FILE_SIZE - 5)], - [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 3 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5)], - [1 * (-FILE_SIZE - 5), 2 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5)], - [1 * (-FILE_SIZE - 5), 1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5)], - [1 * ( FILE_SIZE + 5), 2 * ( FILE_SIZE + 5), 3 * ( FILE_SIZE + 5), 4 * ( FILE_SIZE + 5)], - - [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 4 * (-FILE_SIZE - 3)], - [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 3 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3)], - [1 * (-FILE_SIZE - 3), 2 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3)], - [1 * (-FILE_SIZE - 3), 1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3)], - [1 * ( FILE_SIZE + 3), 2 * ( FILE_SIZE + 3), 3 * ( FILE_SIZE + 3), 4 * ( FILE_SIZE + 3)] - ]; - -const PATTERNFILE4: [i32; 7] = [0, 1, 2, 3, 4, 5, 6]; -const PATTERNRANK4: [i32; 7] = [0, 1 * (FILE_SIZE + 4), 2 * (FILE_SIZE + 4), 3 * (FILE_SIZE + 4), 4 * (FILE_SIZE + 4), 5 * (FILE_SIZE + 4), 6 * (FILE_SIZE + 4)]; -const PATTERNDIAL4: [i32; 7] = [0, 1 * (FILE_SIZE + 5), 2 * (FILE_SIZE + 5), 3 * (FILE_SIZE + 5), 4 * (FILE_SIZE + 5), 5 * (FILE_SIZE + 5), 6 * (FILE_SIZE + 5)]; -const PATTERNDIAR4: [i32; 7] = [0, 1 * (FILE_SIZE + 3), 2 * (FILE_SIZE + 3), 3 * (FILE_SIZE + 3), 4 * (FILE_SIZE + 3), 5 * (FILE_SIZE + 3), 6 * (FILE_SIZE + 3)]; - -const MAPMOVEVALUE: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 
1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21. 1<<20, 1<<19, 1<<18, 1<<17], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, - 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, - 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, - 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, - 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, - 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, - 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, - 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, - 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, - 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, - 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, - 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, 1<<20, - 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, 1<<19, - 0, 0, 0, 0, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, 1<<18, - 0, 0, 0, 0, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17, 1<<17], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 0, 0, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 0, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 0, 0, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 0, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<26, 1<<26, 
1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, - 0, 0, 0, 0, 1<<31, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, - 0, 0, 0, 0, 0, 1<<30, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<20, 1<<20, 1<<20, - 0, 0, 0, 0, 0, 0, 1<<29, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<19, 1<<19, - 0, 0, 0, 0, 0, 0, 0, 1<<28, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<18, - 0, 0, 0, 0, 0, 0, 0, 0, 1<<27, 1<<26, 1<<25, 1<<24, 1<<23, 1<<22, 1<<21, 1<<20, 1<<19, 1<<18, 1<<17], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, 1<<31, - 0, 0, 0, 0, 0, 0, 0, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<30, 1<<31, - 0, 0, 0, 0, 0, 0, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 0, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<21, 1<<21, 1<<21, 1<<21, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31, - 0, 0, 0, 0, 1<<20, 1<<20, 1<<20, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 0, - 0, 0, 0, 0, 1<<19, 1<<19, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 0, 0, - 0, 0, 0, 0, 1<<18, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 0, 0, 0, - 0, 0, 0, 0, 1<<17, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 0, 0, 0, 0] - ]; - -const MAPMOVEIDX: [[i32; 367]; 4] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 3, - 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 0, 0, 0, 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 0, 0, 0, 0, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, - 0, 0, 0, 0, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, - 0, 0, 0, 0, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, - 0, 0, 0, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - 0, 0, 0, 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, - 0, 0, 0, 0, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, - 0, 0, 0, 0, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, - 0, 0, 0, 0, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, - 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, - 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, - 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, - 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, - 0, 0, 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10], - - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 0, 0, 0, 0, 0, 0, 0, 0, 1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, - 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 0, 0, 0, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 0, 0, 0, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, - 0, 0, 0, 0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 0, 0, 0, 0, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, - 0, 0, 0, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, - 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, - 0, 0, 0, 0, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, - 0, 0, 0, 0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 0, 0, 0, 0] - ]; - -// variables - -static mut Endgame: bool = false; - -// structures - -pub struct Pos { // position - state: [Color; SQUARE_SIZE as usize], - p_turn: Side, - p_last: Move, - - bitboard: [[[i32; 20]; 4]; 3], - -} - -impl Pos { - - pub fn init(&mut self) { // starting position - for i in 0..SQUARE_SIZE as usize { - self.state[i] = Color::Empty; - } - - self.p_turn = Color::Black; - self.p_last = square_make(0, 0); - - //-------------------------------------------- - - for i in 0..4 { - for j in 0..20 { - self.bitboard[Color::Black as usize][i][j] = 0; - } - } - - for i in 0..4 { - for j in 0..20 { - self.bitboard[Color::White as usize][i][j] = 0; - } - } - - for i in 0..2 { - for j in 0..20 { - self.bitboard[Color::Empty as usize][i][j] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); - } - } - - self.bitboard[Color::Empty as usize][2][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); - self.bitboard[Color::Empty as usize][2][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); - self.bitboard[Color::Empty as usize][2][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); - self.bitboard[Color::Empty as usize][2][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); - self.bitboard[Color::Empty as usize][2][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); - self.bitboard[Color::Empty as usize][2][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); - self.bitboard[Color::Empty as usize][2][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); - self.bitboard[Color::Empty as usize][2][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); - self.bitboard[Color::Empty as usize][2][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19); - self.bitboard[Color::Empty as usize][2][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); - self.bitboard[Color::Empty as usize][2][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); - self.bitboard[Color::Empty as usize][2][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); - self.bitboard[Color::Empty as usize][2][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); - self.bitboard[Color::Empty as usize][2][13] = 
(1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); - self.bitboard[Color::Empty as usize][2][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); - self.bitboard[Color::Empty as usize][2][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); - self.bitboard[Color::Empty as usize][2][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); - self.bitboard[Color::Empty as usize][2][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); - self.bitboard[Color::Empty as usize][2][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); - self.bitboard[Color::Empty as usize][2][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); - - self.bitboard[Color::Empty as usize][3][0] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); - self.bitboard[Color::Empty as usize][3][1] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); - self.bitboard[Color::Empty as usize][3][2] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); - self.bitboard[Color::Empty as usize][3][3] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); - self.bitboard[Color::Empty as usize][3][4] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); - self.bitboard[Color::Empty as usize][3][5] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); - self.bitboard[Color::Empty as usize][3][6] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); - self.bitboard[Color::Empty as usize][3][7] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); - self.bitboard[Color::Empty as usize][3][8] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19); - self.bitboard[Color::Empty as usize][3][9] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); - self.bitboard[Color::Empty as usize][3][10] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18)|(1<<17); - self.bitboard[Color::Empty as usize][3][11] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20)|(1<<19)|(1<<18); - self.bitboard[Color::Empty as usize][3][12] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21)|(1<<20); - self.bitboard[Color::Empty as usize][3][13] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22)|(1<<21); - self.bitboard[Color::Empty as usize][3][14] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23)|(1<<22); - self.bitboard[Color::Empty as usize][3][15] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24)|(1<<23); - self.bitboard[Color::Empty as usize][3][16] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25)|(1<<24); - self.bitboard[Color::Empty as usize][3][17] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)|(1<<25); - self.bitboard[Color::Empty as usize][3][18] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26); - self.bitboard[Color::Empty as usize][3][19] = (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27); - } - - pub fn do_move(&mut self, mv: Move) { - - let atk: Side = self.p_turn; - let def: Side = side_opp(atk); - - match self.p_turn { - Color::Black => { self.state[mv as usize] = Color::Black; - - for i in 0..4 { - self.bitboard[Color::Black as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= 
MAPMOVEVALUE[i][mv as usize]; - self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize]; - } - }, - - Color::White => { self.state[mv as usize] = Color::White; - - for i in 0..4 { - self.bitboard[Color::White as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] |= MAPMOVEVALUE[i][mv as usize]; - self.bitboard[Color::Empty as usize][i][ MAPMOVEIDX[i][mv as usize] as usize ] ^= MAPMOVEVALUE[i][mv as usize]; - } - }, - - Color::Empty => {}, - } - - self.p_last = mv; - - self.p_turn = def; - } - - fn turn(&self) -> Side { - self.p_turn - } - - pub fn can_play(&self, from: Square) -> bool { - - if self.state[from as usize] == Color::Empty { true } else { false } - } - - pub fn count(&self, pc: Piece) -> i32 { - - let mut n: i32 = 0; - - for rk in 0..RANK_SIZE { - for fl in 0..FILE_SIZE { - let sq: Square = square_make(fl, rk); - if self.state[sq as usize] == pc { n += 1; } - } - } - n - } -} - -pub struct List { // legal move list - p_move: [Move; (FILE_SIZE * RANK_SIZE) as usize], - p_size: i32, -} - -impl List { - - pub fn clear(&mut self) { - self.p_size = 0; - } - - pub fn add(&mut self, mv: Move) { - self.p_move[self.p_size as usize] = mv; - self.p_size += 1; - } - - pub fn size(&self) -> i32 { - self.p_size - } - - pub fn shuffle(&mut self) { - - let mut rng = thread_rng(); - - let num = self.p_size; - - let mut new_move: Vec = vec![]; - - for x in 0..(num as usize) { - new_move.push(self.p_move[x]); - } - - new_move.shuffle(&mut rng); - - for x in 0..(self.p_size as usize) { - self.p_move[x] = new_move[x]; - } - } - - //pub fn move(&self, i: i32) -> Move { - // self.p_move[i as usize] - //} -} - -// functions -// -fn square_make(fl: i32, rk: i32) -> Square { - (rk + 4) * (FILE_SIZE + 4) + (fl + 4) -} - -fn square_file(sq: Square) -> i32 { - sq % (FILE_SIZE + 4) - 4 -} - -fn square_rank(sq: Square) -> i32 { - sq / (FILE_SIZE + 4) - 4 -} - -fn side_opp(sd: Side) -> Side { - - let mut out: Side; - - match sd { - Side::White => out = Side::Black, - Side::Black => out = Side::White, - Side::Empty => panic!(""), - } - - out -} - -fn pos_is_winner(pos : &Pos) -> bool { - - let current_side = side_opp(pos.p_turn); - - let mut found : bool = true; - - for x in 0..20 { - for y in 0..4 { - - found = true; - - let adj = pos.p_last + ENDCHECK[x][y]; - - if pos.state[adj as usize] != current_side { found = false; break } - } - if found == true { break; } - } - - found -} - -fn pos_is_winner_scan(pos : &Pos) -> bool { - - let current_side = side_opp(pos.p_turn); - - if check_patternfile5(&pos, current_side) || - check_patternrank5(&pos, current_side) || - check_patterndial5(&pos, current_side) || - check_patterndiar5(&pos, current_side) { return true } - - false -} - -fn pos_is_draw(pos : &Pos) -> bool { - - - let mut found : bool = true; - - for rk in 0..RANK_SIZE { - for fl in 0..FILE_SIZE { - - let sq: Square = square_make(fl, rk); - if pos.can_play(sq) { - found = false; - break; - } - - if found == false { break;} - } - } -/* - - let mut test: bool = false; - - if pos.bitboard[Color::Empty as usize][0][0] == 0 && - pos.bitboard[Color::Empty as usize][0][1] == 0 && - pos.bitboard[Color::Empty as usize][0][2] == 0 && - pos.bitboard[Color::Empty as usize][0][3] == 0 && - pos.bitboard[Color::Empty as usize][0][4] == 0 && - pos.bitboard[Color::Empty as usize][0][5] == 0 && - pos.bitboard[Color::Empty as usize][0][6] == 0 && - pos.bitboard[Color::Empty as usize][0][7] == 0 && - pos.bitboard[Color::Empty as usize][0][8] == 0 { test = true; } else { 
test = false; } -*/ - //if test != found { println!("bitboard!!!!!!!!!!!!!!!!!!!! pos_is_draw wrong!!!!!!!!!!!!!!"); } - - let mut out: bool = false; - - //if test && unsafe {!pos_is_winner_avx512(pos)} { out = true; } - if found == true && !pos_is_winner_scan(pos) { out = true; } - - out -} - -fn pos_is_end(pos : &Pos) -> bool { - - if pos_is_winner_scan(pos) || pos_is_draw(pos) { - true - } else { - false - } -} - -fn pos_disp(pos: &Pos) { - - for rk in 0..RANK_SIZE { - for fl in 0..FILE_SIZE { - - let sq: Square = square_make(fl, rk); - - match pos.state[sq as usize] { - Color::Black => print!("# "), - Color::White => print!("O "), - Color::Empty => print!("- "), - } - } - - println!(""); - } - - match pos.turn() { - Color::Black => println!("black to play"), - Color::White => println!("white to play"), - _ => (), - } -} - -fn gen_moves(list : &mut List, pos: &Pos) { - - list.clear(); - - for rk in 0..RANK_SIZE { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - if pos.can_play(sq) { list.add(sq); } - } - } - -} - -fn search(pos : &Pos, depth: i32, endgame: i32) -> Move { - - //println!("call search"); - - let mut new_depth = depth; - - let empties: i32 = pos.count(Color::Empty); - if (empties <= endgame || new_depth > empties ) { new_depth = empties; } - - if(new_depth == empties) { unsafe { Endgame = true; } } - - search_real(pos, -EVAL_INF, EVAL_INF, new_depth, 0) - -} - -fn search_real(pos: &Pos, alpha: i32, beta: i32, depth: i32, ply: i32) -> i32 { - - - assert!(-EVAL_INF <= alpha && alpha < beta && beta <= EVAL_INF); - //println!("call search_real"); - //println!("depth = {}", depth); - //println!("ply = {}", ply); - // leaf? - - //if unsafe { pos_is_winner_avx512(&pos) } != pos_is_winner(&pos) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!!"); } - if pos_is_winner_scan(&pos) { return -EVAL_INF + ply } - //if unsafe { pos_is_winner_avx512(&pos) } { return -EVAL_INF + ply } - - - if pos_is_draw(&pos) { return 0 } - - if depth == 0 { - return eval(&pos, ply) - } - - let p_move_new : [Move; (FILE_SIZE * RANK_SIZE) as usize] = [0; (FILE_SIZE * RANK_SIZE) as usize]; - - let mut list = List { - p_move: p_move_new, - p_size: 0, - }; - - let mut bm: Move = MOVE_NONE; - let mut bs: i32 = SCORE_NONE; - - gen_moves(&mut list, &pos); - - // move loop - - if ply == 0 { list.shuffle(); } - - for i in 0..list.size() { - - if bs < beta { - - let mv: Move = list.p_move[i as usize]; - - let mut new_pos = Pos { - state: pos.state, - p_turn: pos.p_turn, - p_last: pos.p_last, - - bitboard: pos.bitboard, - }; - - //println!("p_last = {}", new_pos.p_last); - - new_pos.do_move(mv); - - //println!("After do _move p_last = {}", new_pos.p_last); - - let sc: i32 = -search_real(&new_pos, -beta, -cmp::max(alpha, bs), depth - 1, ply + 1); - - - //if sc >= 410 || sc <= -410 { - //println!("sc = {} depth = {}-------------------------------", sc, depth); - - //pos_disp(&new_pos); - //} - - - if sc > bs { bm = mv; bs = sc; } - - } - } - - assert!(bm != MOVE_NONE); - assert!(bs >= -EVAL_INF && bs <= EVAL_INF); - - if ply == 0 { bm } else { bs } //best move at the root node, best score elsewhere - //bs -} - -fn result(pos: &Pos) -> i32 { - - if(pos_is_winner_scan(pos)) { - -(FILE_SIZE*RANK_SIZE*100) - } else { - 0 - } -} - - -fn eval(pos: &Pos, ply: i32) -> i32 { - - let atk: Side = pos.turn(); - let def: Side = side_opp(atk); - - //let mut sc: i32 = 0; - - let check_live4: Side = def; - let check_live4_opp: Side = atk; - - //if ply % 2 == 1 { check_live4 = def; check_live4_opp = atk; } else { 
check_live4 = atk; check_live4_opp = def; } - //if ply % 2 == 0 { check_live4 = def; check_live4_opp = atk; } else { check_live4 = atk; check_live4_opp = def; } -/* - if unsafe { check_pattern4_once_avx512(&pos, check_live4) } != (check_patternfile4_once(&pos, check_live4) || check_patternrank4_once(&pos, check_live4) || check_patterndial4_once(&pos, check_live4) || check_patterndiar4_once(&pos, check_live4) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! self "); pos_disp(&pos); } - if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } != (check_patternfile4_once(&pos, check_live4_opp) || check_patternrank4_once(&pos, check_live4_opp) || check_patterndial4_once(&pos, check_live4_opp) || check_patterndiar4_once(&pos, check_live4_opp) ) { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_once !!!!!! opp "); pos_disp(&pos); } - - #[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern4_dead_avx512(&pos, check_live4_opp); - - let mut temp_check: [__mmask16; 5] = [0; 5]; - - for i in 0..5 { - - let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]); - let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]); - let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]); - let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]); - let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]); - let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]); - let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]); - let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]); - let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]); - let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]); - - let check_mask10 = _kor_mask16(check_mask0, check_mask1); - let check_mask11 = _kor_mask16(check_mask2, check_mask3); - let check_mask12 = _kor_mask16(check_mask4, check_mask5); - let check_mask13 = _kor_mask16(check_mask6, check_mask7); - let check_mask14 = _kor_mask16(check_mask8, check_mask9); - - let check_mask16 = _kor_mask16(check_mask10, check_mask11); - let check_mask17 = _kor_mask16(check_mask12, check_mask13); - let check_mask18 = _kor_mask16(check_mask16, check_mask17); - temp_check[i] = _kor_mask16(check_mask18, check_mask14); - - } - - let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]); - let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]); - let check_mask2 = _kor_mask16(check_mask0, check_mask1); - let check_mask3 = _kor_mask16(check_mask2, temp_check[4]); - - let test1: bool = check_patternfile4_dead(&pos, check_live4_opp) || check_patternrank4_dead(&pos, check_live4_opp) || check_patterndial4_dead(&pos, check_live4_opp) || check_patterndiar4_dead(&pos, check_live4_opp); - - let mut test2: bool = true; - - if check_mask3 > 0 { test2 = true; } else { test2 = false; } - - if test1 != test2 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead !!!!!! 
opp "); pos_disp(&pos); } - } - - #[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern4_dead_avx512(&pos, check_live4); - - let mut count: i32 = 0; - - for i in 0..5 { - for j in 0..4 { - for k in 0..5 { - count += _popcnt32(result[i][j][k] as i32); - } - } - } - - let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4); - let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4); - let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4); - let c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4); - - - if (c4f+c4r+c4dl+c4dr) != count { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! file4_dead_count !!!!!! opp org = {}, new = {}", c4f+c4r+c4dl+c4dr, count); pos_disp(&pos); } - } - - #[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern3_live_avx512(&pos, check_live4); - - let mut count: i32 = 0; - - for i in 0..3 { - for j in 0..4 { - for k in 0..5 { - count += _popcnt32(result[i][j][k] as i32); - } - } - } - - let c3f: i32 = check_patternfile3_live_n(&pos, check_live4); - let c3r: i32 = check_patternrank3_live_n(&pos, check_live4); - let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4); - let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4); - - let mut count1: i32 = 0; - - count1 = c3f+c3r+c3dl+c3dr; - - if count != count1 { println!("avx512 wrong!!!!!!!!!!!!!!!!!!!!!!!!!! live3_dead !!!!!! self org = {}, new = {}", count1, count); pos_disp(&pos); } - } -*/ - - if check_patternfile4_once(&pos, check_live4) || - check_patternrank4_once(&pos, check_live4) || - check_patterndial4_once(&pos, check_live4) || - check_patterndiar4_once(&pos, check_live4) { return -4096 } - - //if unsafe { check_pattern4_once_avx512(&pos, check_live4) } { return -4096 } - - if check_patternfile4_once(&pos, check_live4_opp) || - check_patternrank4_once(&pos, check_live4_opp) || - check_patterndial4_once(&pos, check_live4_opp) || - check_patterndiar4_once(&pos, check_live4_opp) { return 2560 } - - //if unsafe { check_pattern4_once_avx512(&pos, check_live4_opp) } { return 2560 } - - if check_patternfile4_dead(&pos, check_live4_opp) || - check_patternrank4_dead(&pos, check_live4_opp) || - check_patterndial4_dead(&pos, check_live4_opp) || - check_patterndiar4_dead(&pos, check_live4_opp) { return 2560 } - - /*#[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern4_dead_avx512(&pos, check_live4_opp); - - let mut temp_check: [__mmask16; 5] = [0; 5]; - - for i in 0..5 { - let check_mask0 = _kor_mask16(result[i][0][0], result[i][0][1]); - let check_mask1 = _kor_mask16(result[i][0][2], result[i][0][3]); - let check_mask2 = _kor_mask16(result[i][0][4], result[i][1][0]); - let check_mask3 = _kor_mask16(result[i][1][1], result[i][1][2]); - let check_mask4 = _kor_mask16(result[i][1][3], result[i][1][4]); - let check_mask5 = _kor_mask16(result[i][2][0], result[i][2][1]); - let check_mask6 = _kor_mask16(result[i][2][2], result[i][2][3]); - let check_mask7 = _kor_mask16(result[i][2][4], result[i][3][0]); - let check_mask8 = _kor_mask16(result[i][3][1], result[i][3][2]); - let check_mask9 = _kor_mask16(result[i][3][3], result[i][3][4]); - - let check_mask10 = _kor_mask16(check_mask0, check_mask1); - let check_mask11 = _kor_mask16(check_mask2, check_mask3); - let check_mask12 = _kor_mask16(check_mask4, check_mask5); - let check_mask13 = _kor_mask16(check_mask6, check_mask7); - let check_mask14 = _kor_mask16(check_mask8, check_mask9); - - let check_mask16 = _kor_mask16(check_mask10, check_mask11); - let check_mask17 = 
_kor_mask16(check_mask12, check_mask13); - let check_mask18 = _kor_mask16(check_mask16, check_mask17); - temp_check[i] = _kor_mask16(check_mask18, check_mask14); - } - - let check_mask0 = _kor_mask16(temp_check[0], temp_check[1]); - let check_mask1 = _kor_mask16(temp_check[2], temp_check[3]); - let check_mask2 = _kor_mask16(check_mask0, check_mask1); - let check_mask3 = _kor_mask16(check_mask2, temp_check[4]); - - if check_mask3 > 0 { return 2560 } - } -*/ - // 4,3 - let c4f: i32 = check_patternfile4_dead_n(&pos, check_live4); - let c4r: i32 = check_patternrank4_dead_n(&pos, check_live4); - let c4dl: i32 = check_patterndial4_dead_n(&pos, check_live4); - let c4dr: i32 = check_patterndiar4_dead_n(&pos, check_live4); - - let c3f: i32 = check_patternfile3_live_n(&pos, check_live4); - let c3r: i32 = check_patternrank3_live_n(&pos, check_live4); - let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4); - let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4); - - let n_c4: i32 = c4f + c4r + c4dl + c4dr; - - if n_c4 > 1 { return -2048 } - - if n_c4 == 1 && ( c3f+c3r+c3dl+c3dr > 0 ) { return -3048 } - -/* - #[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern4_dead_avx512(&pos, check_live4); - - let mut count4: i32 = 0; - - for i in 0..5 { - for j in 0..4 { - for k in 0..5 { - count4 += _popcnt32(result[i][j][k] as i32); - } - } - } - - if count4 > 1 { return -2048 } - else if count4 == 1 { - - let result = check_pattern3_live_avx512(&pos, check_live4); - - let mut count3: i32 = 0; - - for i in 0..3 { - for j in 0..4 { - for k in 0..5 { - count3 += _popcnt32(result[i][j][k] as i32); - } - } - } - - if count3 > 0 { return -3048 } - } - } - */ - //--------------------------------------------------------------------------- - - let c3f_opp = check_patternfile3_live_n(&pos, check_live4_opp); - let c3r_opp = check_patternrank3_live_n(&pos, check_live4_opp); - let c3dl_opp = check_patterndial3_live_n(&pos, check_live4_opp); - let c3dr_opp = check_patterndiar3_live_n(&pos, check_live4_opp); - if c3f_opp + c3r_opp + c3dl_opp + c3dr_opp > 1 { return 2560 } - - if c3f + c3r + c3dl + c3dr > 1 { return -2048 } - /* - #[target_feature(enable = "avx512f")] - unsafe { - let result = check_pattern3_live_avx512(&pos, check_live4_opp); - - let mut count: i32 = 0; - - for i in 0..3 { - for j in 0..4 { - for k in 0..5 { - count += _popcnt32(result[i][j][k] as i32); - } - } - } - - let c3f: i32 = check_patternfile3_live_n(&pos, check_live4_opp); - let c3r: i32 = check_patternrank3_live_n(&pos, check_live4_opp); - let c3dl: i32 = check_patterndial3_live_n(&pos, check_live4_opp); - let c3dr: i32 = check_patterndiar3_live_n(&pos, check_live4_opp); - - let mut count1: i32 = 0; - - count1 = c3f+c3r+c3dl+c3dr; - - if count1 > 1 { return -2048 } - } -*/ - 0 -} - - -fn check_patternfile4_once(pos: &Pos, sd: Side) -> bool { - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 5) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - let idx5 = sq + PATTERNFILE4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - 
} - - false -} - -fn check_patternrank4_once(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - let idx5 = sq + PATTERNRANK4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patterndial4_once(pos: &Pos, sd : Side) -> bool { - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..(FILE_SIZE - 5) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - let idx5 = sq + PATTERNDIAL4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patterndiar4_once(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 5) { - for fl in 5..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - let idx5 = sq + PATTERNDIAR4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patternfile4_dead(pos: &Pos, sd: Side) -> bool { - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patternrank4_dead(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let 
idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patterndial4_dead(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patterndiar4_dead(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 4..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { return true } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { return true } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { return true } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - - -fn check_patternfile4_dead_n(pos: &Pos, sd: Side) -> i32 { - - let mut n: i32 = 0; - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && 
val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } - } - } - - n -} - -fn check_patternrank4_dead_n(pos: &Pos, sd: Side) -> i32 { - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } - } - } - - n -} - -fn check_patterndial4_dead_n(pos: &Pos, sd: Side) -> i32 { - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == sd { n += 1; } - } - } - - n -} - -fn check_patterndiar4_dead_n(pos: &Pos, sd: Side) -> i32 { - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 4..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - if val0 == sd && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd { n += 1; } - if val0 == sd && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd { n += 1; } - if val0 == sd && val1 == Color::Empty && val2 == sd && val3 == sd && val4 == sd { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 
== sd && val3 == sd && val4 == sd { n += 1; } - } - } - - n -} - - -/*fn check_patternfile3_live(pos: &Pos, sd: Side) -> bool { - - let last_move: Move = pos.p_last; - - let mut n: i32 = 0; - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - } - } - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 5) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - let idx5 = sq + PATTERNFILE4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patternrank3_live(pos: &Pos, sd: Side) -> bool { - - let last_move: Move = pos.p_last; - - // let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - let idx5 = sq + PATTERNRANK4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patterndial3_live(pos: &Pos, sd: Side) -> bool { - - let last_move: Move = pos.p_last; - - //let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + 
PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..(FILE_SIZE - 5) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - let idx5 = sq + PATTERNDIAL4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} - -fn check_patterndiar3_live(pos: &Pos, sd: Side) -> bool { - - let last_move: Move = pos.p_last; - - //let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 4..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { return true } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 5..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - let idx5 = sq + PATTERNDIAR4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { return true } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { return true } - } - } - - false -} -*/ - -fn check_patternfile3_live_n(pos: &Pos, sd: Side) -> i32 { - - let last_move: Move = pos.p_last; - - let mut n: i32 = 0; - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n +=1 ; } - } - } - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 5) { - 
let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - let idx5 = sq + PATTERNFILE4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } - } - } - - n -} - -fn check_patternrank3_live_n(pos: &Pos, sd: Side) -> i32 { - - let last_move: Move = pos.p_last; - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + PATTERNRANK4[4]; - let idx5 = sq + PATTERNRANK4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } - } - } - - n -} - -fn check_patterndial3_live_n(pos: &Pos, sd: Side) -> i32 { - - let last_move: Move = pos.p_last; - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 0..(FILE_SIZE - 5) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - let idx5 = sq + PATTERNDIAL4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = 
pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } - } - } - - n -} - -fn check_patterndiar3_live_n(pos: &Pos, sd: Side) -> i32 { - - let last_move: Move = pos.p_last; - - let mut n: i32 = 0; - - for rk in 0..(RANK_SIZE - 4) { - for fl in 4..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == sd && val4 == Color::Empty { n += 1; } - } - } - - for rk in 0..(RANK_SIZE - 5) { - for fl in 5..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - let idx5 = sq + PATTERNDIAR4[5]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - let val5 = pos.state[idx5 as usize]; - - if val0 == Color::Empty && val1 == sd && val2 == sd && val3 == Color::Empty && val4 == sd && val5 == Color::Empty { n += 1; } - if val0 == Color::Empty && val1 == sd && val2 == Color::Empty && val3 == sd && val4 == sd && val5 == Color::Empty { n += 1; } - } - } - - n -} - -#[target_feature(enable = "avx512f")] -unsafe fn pos_is_winner_avx512(pos : &Pos) -> bool { - - let current_side = side_opp(pos.p_turn); - - let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) ); - - let answer_mask: __mmask16 = 0b01111111_11111111; - - let coloridx = current_side as usize; - - let mut temp_mask: [[__mmask16; 5]; 4] = [[0; 5]; 4]; - - for dir in 0..4 { - let board0 = _mm512_set_epi32(0, pos.bitboard[coloridx][dir][14], pos.bitboard[coloridx][dir][13], pos.bitboard[coloridx][dir][12], pos.bitboard[coloridx][dir][11], pos.bitboard[coloridx][dir][10], pos.bitboard[coloridx][dir][9], pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); - - let boardf = _mm512_and_epi32(answer, board0); - - temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - - for i in 1..5 { - - let board1 = _mm512_rol_epi32(board0, i); - - let boardf = _mm512_and_epi32(answer, board1); - - temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - } - } - - let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]); - let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]); - let check_mask2: __mmask16 = _kor_mask16(temp_mask[0][4], temp_mask[1][0]); - let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][1], temp_mask[1][2]); - let check_mask4: __mmask16 = 
_kor_mask16(temp_mask[1][3], temp_mask[1][4]); - let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]); - let check_mask6: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]); - let check_mask7: __mmask16 = _kor_mask16(temp_mask[2][4], temp_mask[3][0]); - let check_mask8: __mmask16 = _kor_mask16(temp_mask[3][1], temp_mask[3][2]); - let check_mask9: __mmask16 = _kor_mask16(temp_mask[3][3], temp_mask[3][4]); - - let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1); - let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3); - let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5); - let check_mask13: __mmask16 = _kor_mask16(check_mask6, check_mask7); - let check_mask14: __mmask16 = _kor_mask16(check_mask8, check_mask9); - - let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11); - let check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13); - let check_mask18: __mmask16 = _kor_mask16(check_mask16, check_mask17); - let check_mask19: __mmask16 = _kor_mask16(check_mask18, check_mask14); - - if check_mask19 > 0 { return true } else { return false } -} - -#[target_feature(enable = "avx512f")] -unsafe fn check_pattern4_once_avx512(pos : &Pos, sd: Side) -> bool { - - //let current_side = side_opp(sd); - - let answer_color = _mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) ); - let answer_empty = _mm512_set1_epi32( (1<<31)| (1<<26) ); - let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26)); - - let answer_mask: __mmask16 = 0b00000001_11111111; - - //let coloridx = current_side as usize; - let coloridx = sd as usize; - let emptyidx = Color::Empty as usize; - - let mut temp_mask: [[__mmask16; 4]; 4] = [[0; 4]; 4]; - - for dir in 0..4 { - - let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); - - let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); - - let boardf1 = _mm512_and_epi32(answer_color, board0);// check sd - let boardf2 = _mm512_and_epi32(answer_empty, board1);// check empty - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - - for i in 1..4 { //only move 3 times - - let board2 = _mm512_rol_epi32(board0, i);//rot sd - let board3 = _mm512_rol_epi32(board1, i);//rot empty - - let boardf1 = _mm512_and_epi32(answer_color, board2); - let boardf2 = _mm512_and_epi32(answer_empty, board3); - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - } - } - - let check_mask0: __mmask16 = _kor_mask16(temp_mask[0][0], temp_mask[0][1]); - let check_mask1: __mmask16 = _kor_mask16(temp_mask[0][2], temp_mask[0][3]); - let check_mask2: __mmask16 = _kor_mask16(temp_mask[1][0], temp_mask[1][1]); - let check_mask3: __mmask16 = _kor_mask16(temp_mask[1][2], temp_mask[1][3]); - let 
check_mask4: __mmask16 = _kor_mask16(temp_mask[2][0], temp_mask[2][1]); - let check_mask5: __mmask16 = _kor_mask16(temp_mask[2][2], temp_mask[2][3]); - let check_mask6: __mmask16 = _kor_mask16(temp_mask[3][0], temp_mask[3][1]); - let check_mask7: __mmask16 = _kor_mask16(temp_mask[3][2], temp_mask[3][3]); - - let check_mask10: __mmask16 = _kor_mask16(check_mask0, check_mask1); - let check_mask11: __mmask16 = _kor_mask16(check_mask2, check_mask3); - let check_mask12: __mmask16 = _kor_mask16(check_mask4, check_mask5); - let check_mask13: __mmask16 = _kor_mask16(check_mask6, check_mask7); - - let check_mask16: __mmask16 = _kor_mask16(check_mask10, check_mask11); - let check_mask17: __mmask16 = _kor_mask16(check_mask12, check_mask13); - let check_mask19: __mmask16 = _kor_mask16(check_mask16, check_mask17); - - if check_mask19 > 0 { return true } else { return false } -} - -#[target_feature(enable = "avx512f")] -unsafe fn check_pattern4_dead_avx512(pos : &Pos, sd: Side) -> [[[__mmask16; 5]; 4]; 5] { - - //let current_side = side_opp(sd); - - let answer_color: [__m512i; 5] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28)|(1<<27) ), - _mm512_set1_epi32( (1<<31)| (1<<29)|(1<<28)|(1<<27) ), - _mm512_set1_epi32( (1<<31)|(1<<30) |(1<<28)|(1<<27) ), - _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29) |(1<<27) ), - _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28) )]; - - let answer_empty: [__m512i; 5]= [_mm512_set1_epi32( (1<<31) ), - _mm512_set1_epi32( (1<<30) ), - _mm512_set1_epi32( (1<<29) ), - _mm512_set1_epi32( (1<<28) ), - _mm512_set1_epi32( (1<<27) )]; - - let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)); - - let answer_mask: __mmask16 = 0b00000001_11111111; - - //let coloridx = current_side as usize; - let coloridx = sd as usize; - let emptyidx = Color::Empty as usize; - - let mut temp_mask: [[[__mmask16; 5]; 4]; 5] = [[[0; 5]; 4]; 5]; - - for pattern in 0..5 { - - for dir in 0..4 { - - let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); - - let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); - - let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd - let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - - for i in 1..5 { //only move 4 times - - let board2 = _mm512_rol_epi32(board0, i);//rot sd - let board3 = _mm512_rol_epi32(board1, i);//rot empty - - let boardf1 = _mm512_and_epi32(answer_color[pattern], board2); - let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3); - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer, boardf, 0);//no need answer_mask, because and above - } - } - } - - temp_mask -} - - -#[target_feature(enable = "avx512f")] -unsafe fn check_pattern3_live_avx512(pos : &Pos, sd: 
Side) -> [[[__mmask16; 5]; 4]; 3] { - - //let current_side = side_opp(sd); - - let answer_color: [__m512i; 3] = [_mm512_set1_epi32( (1<<30)|(1<<29)|(1<<28) ), - _mm512_set1_epi32( (1<<30)| (1<<28)|(1<<27) ), - _mm512_set1_epi32( (1<<30)|(1<<29) |(1<<27) )]; - - let answer_empty: [__m512i; 3]= [_mm512_set1_epi32( (1<<31)| (1<<27) ), - _mm512_set1_epi32( (1<<31)| (1<<29)| (1<<26) ), - _mm512_set1_epi32( (1<<31)| (1<<28)| (1<<26) )]; - - //let answer = _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)); - let answer: [__m512i; 3] = [_mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27) ), - _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) ), - _mm512_set1_epi32( (1<<31)|(1<<30)|(1<<29)|(1<<28)|(1<<27)|(1<<26) )]; - - let answer_mask: __mmask16 = 0b00000001_11111111; - - //let coloridx = current_side as usize; - let coloridx = sd as usize; - let emptyidx = Color::Empty as usize; - - let mut temp_mask: [[[__mmask16; 5]; 4]; 3] = [[[0; 5]; 4]; 3]; - - for pattern in 0..3 { - - for dir in 0..4 { - - let board0 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[coloridx][dir][8], pos.bitboard[coloridx][dir][7], pos.bitboard[coloridx][dir][6], pos.bitboard[coloridx][dir][5], pos.bitboard[coloridx][dir][4], pos.bitboard[coloridx][dir][3], pos.bitboard[coloridx][dir][2], pos.bitboard[coloridx][dir][1], pos.bitboard[coloridx][dir][0]); - - let board1 = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, pos.bitboard[emptyidx][dir][8], pos.bitboard[emptyidx][dir][7], pos.bitboard[emptyidx][dir][6], pos.bitboard[emptyidx][dir][5], pos.bitboard[emptyidx][dir][4], pos.bitboard[emptyidx][dir][3], pos.bitboard[emptyidx][dir][2], pos.bitboard[emptyidx][dir][1], pos.bitboard[emptyidx][dir][0]); - - let boardf1 = _mm512_and_epi32(answer_color[pattern], board0);// check sd - let boardf2 = _mm512_and_epi32(answer_empty[pattern], board1);// check empty - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[pattern][dir][0] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0);//no need answer_mask, because and above - - for i in 1..5 { //only move 4 times - - let board2 = _mm512_rol_epi32(board0, i);//rot sd - let board3 = _mm512_rol_epi32(board1, i);//rot empty - - let boardf1 = _mm512_and_epi32(answer_color[pattern], board2); - let boardf2 = _mm512_and_epi32(answer_empty[pattern], board3); - let boardf = _mm512_or_epi32(boardf1, boardf2); - - temp_mask[pattern][dir][i as usize] = _mm512_mask_cmp_epi32_mask(answer_mask, answer[pattern], boardf, 0);//no need answer_mask, because and above - } - } - } - - temp_mask -} - -fn check_patternfile5(pos: &Pos, sd: Side) -> bool { - - for rk in 0..RANK_SIZE { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNFILE4[0]; - let idx1 = sq + PATTERNFILE4[1]; - let idx2 = sq + PATTERNFILE4[2]; - let idx3 = sq + PATTERNFILE4[3]; - let idx4 = sq + PATTERNFILE4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patternrank5(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNRANK4[0]; - let idx1 = sq + PATTERNRANK4[1]; - let idx2 = sq + PATTERNRANK4[2]; - let idx3 = sq + PATTERNRANK4[3]; - let idx4 = sq + 
PATTERNRANK4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patterndial5(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 0..(FILE_SIZE - 4) { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAL4[0]; - let idx1 = sq + PATTERNDIAL4[1]; - let idx2 = sq + PATTERNDIAL4[2]; - let idx3 = sq + PATTERNDIAL4[3]; - let idx4 = sq + PATTERNDIAL4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn check_patterndiar5(pos: &Pos, sd: Side) -> bool { - - for rk in 0..(RANK_SIZE - 4) { - for fl in 4..FILE_SIZE { - let sq : Square = square_make(fl, rk); - - let idx0 = sq + PATTERNDIAR4[0]; - let idx1 = sq + PATTERNDIAR4[1]; - let idx2 = sq + PATTERNDIAR4[2]; - let idx3 = sq + PATTERNDIAR4[3]; - let idx4 = sq + PATTERNDIAR4[4]; - - let val0 = pos.state[idx0 as usize]; - let val1 = pos.state[idx1 as usize]; - let val2 = pos.state[idx2 as usize]; - let val3 = pos.state[idx3 as usize]; - let val4 = pos.state[idx4 as usize]; - - if val0 == sd && val1 == sd && val2 == sd && val3 == sd && val4 == sd { return true } - } - } - - false -} - -fn main() { - - loop - { - - let start = Instant::now(); - - println!("Hello, this is connect 6!"); - - //unsafe { test_avx512(); } - - let test_state: [Color; SQUARE_SIZE as usize] = [Color::Empty; SQUARE_SIZE as usize]; - - let test_bitboard: [[[i32; FILE_SIZE as usize]; 4]; 3] = [[[0; FILE_SIZE as usize]; 4]; 3]; - - let mut test1 = Pos { - state: test_state, - p_turn: Color::Black, - p_last: square_make(5,5), - - bitboard: test_bitboard, - }; - - test1.init(); - - //pos_disp(&test1); - - for i in 0..(FILE_SIZE*RANK_SIZE) { - - // println!("----------------------------------------\n\n\n\n"); - // println!("MOVE {}!!!!\n\n\n\n", i); - - - let mut d = 2; - let mut e = 4; - - //if i < 6 { d = 1; e = 2; } - - let next_move: Move = search(&test1, d, e); - //println!("next move is {}", next_move); - //println!("file is {}", square_file(next_move)); - //println!("rank is {}", square_rank(next_move)); - - test1.do_move(next_move); - - //pos_disp(&test1); - - if pos_is_end(&test1) { - - println!("Game over!!!!!!"); - println!("MOVE {}!!!!\n", i); - //pos_disp(&test1); - - break; } - } - - - let duration = start.elapsed(); - - println!("Time elapsed in expensive_function() is: {:?}", duration); - } - - -} From 38c4f1da99c63acfed6985623074331d1184da5c Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 14:28:45 +0000 Subject: [PATCH 16/25] mm_mask_sub: ss,sd; mm_mask_mul: ss,sd; mm_mask_div: ss,sd --- crates/core_arch/avx512f.md | 24 +- crates/core_arch/src/x86/avx512f.rs | 360 ++++++++++++++++++++++++++++ 2 files changed, 372 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 01328a43f8..1f0f9c1460 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1252,8 +1252,8 @@ * [ ] [`_mm_mask_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sd&expand=5236) * [ ] 
[`_mm_mask_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sd&expand=5236) * [ ] [`_mm_mask_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_ss&expand=5236) - * [ ] [`_mm_mask_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sd&expand=5236) - * [ ] [`_mm_mask_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ss&expand=5236) + * [x] [`_mm_mask_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sd&expand=5236) + * [x] [`_mm_mask_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ss&expand=5236) * [ ] [`_mm_mask_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_sd&expand=5236) * [ ] [`_mm_mask_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_ss&expand=5236) * [ ] [`_mm_mask_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_sd&expand=5236) @@ -1296,8 +1296,8 @@ * [ ] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236) * [ ] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236) * [ ] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236) - * [ ] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236) - * [ ] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236) + * [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236) + * [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236) * [ ] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236) * [ ] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236) * [ ] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236) @@ -1318,8 +1318,8 @@ * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss&expand=5236) * [ ] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236) * [ ] [`_mm_mask_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_ss&expand=5236) - * [ ] [`_mm_mask_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sd&expand=5236) - * [ ] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236) + * [x] [`_mm_mask_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sd&expand=5236) + * [x] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236) * [ ] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236) * [ ] 
[`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236) * [x] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236) @@ -1330,8 +1330,8 @@ * [ ] [`_mm_maskz_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sd&expand=5236) * [ ] [`_mm_maskz_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sd&expand=5236) * [ ] [`_mm_maskz_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_ss&expand=5236) - * [ ] [`_mm_maskz_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sd&expand=5236) - * [ ] [`_mm_maskz_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ss&expand=5236) + * [x] [`_mm_maskz_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sd&expand=5236) + * [x] [`_mm_maskz_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ss&expand=5236) * [ ] [`_mm_maskz_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_sd&expand=5236) * [ ] [`_mm_maskz_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_ss&expand=5236) * [ ] [`_mm_maskz_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_sd&expand=5236) @@ -1374,8 +1374,8 @@ * [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236) * [ ] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236) * [ ] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236) - * [ ] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236) - * [ ] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236) + * [x] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236) + * [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236) * [ ] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236) * [ ] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236) * [ ] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236) @@ -1394,8 +1394,8 @@ * [ ] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236) * [ ] [`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236) * [ ] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236) - * [ ] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236) - * [ ] 
[`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236) + * [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236) + * [x] [`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236) * [ ] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236) * [ ] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236) * [ ] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 7804a8ce44..852abfe349 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18487,6 +18487,216 @@ pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { transmute(r) } +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_ss&expand=5750) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss))] +pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta - extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_ss&expand=5751) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss))] +pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta - extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_sd&expand=5746) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd))] +pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta - extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_sd&expand=5747) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd))] +pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta - extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_ss&expand=3950) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss))] +pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta * extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_ss&expand=3951) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss))] +pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta * extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_sd&expand=3947) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd))] +pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta * extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_sd&expand=3948) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd))] +pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta * extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_ss&expand=2181) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss))] +pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut add: f32 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta / extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_ss&expand=2182) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss))] +pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut add: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + add = extracta / extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_sd&expand=2178) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd))] +pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut add: f64 = extractsrc; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta / extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_sd&expand=2179) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd))] +pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut add: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + add = extracta / extractb; + } + let r = simd_insert(a, 0, add); + transmute(r) +} + /// Equal pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; /// Less-than @@ -28122,4 +28332,154 @@ mod tests { let e = _mm_set_pd(1., 6.); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_sub_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sub_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_sub_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sub_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sub_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sub_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sub_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sub_sd(0b11111111, a, b); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_mul_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_mul_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_mul_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_mul_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_mul_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_mul_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_mul_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_mul_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_div_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_div_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_div_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_div_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_div_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_div_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_div_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_div_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } } From 07a2ee9a1b35d05a6c6adab21c486c4453360a48 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 15:40:40 +0000 Subject: [PATCH 17/25] add_round: ss,sd; sub_round: ss,sd; mul_round: ss,sd; div_round: ss,sd --- crates/core_arch/avx512f.md | 44 +- crates/core_arch/src/x86/avx512f.rs | 1053 +++++++++++++++++++++++++++ crates/core_arch/src/x86/mod.rs | 18 + 3 files changed, 1093 insertions(+), 22 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 1f0f9c1460..cd93fa26b9 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1133,8 +1133,8 @@ * [x] [`_mm512_zextps256_ps512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextps256_ps512&expand=5236) * [x] [`_mm512_zextsi128_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextsi128_si512&expand=5236) * [x] 
[`_mm512_zextsi256_si512`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextsi256_si512&expand=5236) - * [ ] [`_mm_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=5236) - * [ ] [`_mm_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=5236) + * [x] [`_mm_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=5236) + * [x] [`_mm_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=5236) * [x] [`_mm_cmp_round_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sd_mask&expand=5236) * [x] [`_mm_cmp_round_ss_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_ss_mask&expand=5236) * [x] [`_mm_cmp_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd_mask&expand=5236) @@ -1200,8 +1200,8 @@ * [ ] [`_mm_cvtu32_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_ss&expand=5236) * [ ] [`_mm_cvtu64_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_sd&expand=5236) * [ ] [`_mm_cvtu64_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu64_ss&expand=5236) - * [ ] [`_mm_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sd&expand=5236) - * [ ] [`_mm_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_ss&expand=5236) + * [x] [`_mm_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sd&expand=5236) + * [x] [`_mm_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_ss&expand=5236) * [ ] [`_mm_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_round_sd&expand=5236) * [ ] [`_mm_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_round_ss&expand=5236) * [ ] [`_mm_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_sd&expand=5236) @@ -1238,8 +1238,8 @@ * [ ] [`_mm_mask3_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_ss&expand=5236) * [ ] [`_mm_mask3_fnmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sd&expand=5236) * [ ] [`_mm_mask3_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ss&expand=5236) - * [ ] [`_mm_mask_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sd&expand=5236) - * [ ] [`_mm_mask_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_ss&expand=5236) + * [x] [`_mm_mask_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sd&expand=5236) + * [x] [`_mm_mask_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_ss&expand=5236) * [x] [`_mm_mask_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sd&expand=5236) * [x] [`_mm_mask_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ss&expand=5236) * [x] 
[`_mm_mask_cmp_round_sd_mask`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=5236) @@ -1250,8 +1250,8 @@ * [ ] [`_mm_mask_cvt_roundss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sd&expand=5236) * [ ] [`_mm_mask_cvtsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_ss&expand=5236) * [ ] [`_mm_mask_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sd&expand=5236) - * [ ] [`_mm_mask_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sd&expand=5236) - * [ ] [`_mm_mask_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_ss&expand=5236) + * [x] [`_mm_mask_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sd&expand=5236) + * [x] [`_mm_mask_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_ss&expand=5236) * [x] [`_mm_mask_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sd&expand=5236) * [x] [`_mm_mask_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ss&expand=5236) * [ ] [`_mm_mask_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_sd&expand=5236) @@ -1316,20 +1316,20 @@ * [ ] [`_mm_mask_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ss&expand=5236) * [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd&expand=5236) * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss&expand=5236) - * [ ] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236) - * [ ] [`_mm_mask_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_ss&expand=5236) + * [x] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236) + * [x] [`_mm_mask_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_ss&expand=5236) * [x] [`_mm_mask_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sd&expand=5236) * [x] [`_mm_mask_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ss&expand=5236) - * [ ] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236) - * [ ] [`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236) + * [x] [`_mm_maskz_add_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=5236) + * [x] [`_mm_maskz_add_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=5236) * [x] [`_mm_maskz_add_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sd&expand=5236) * [x] [`_mm_maskz_add_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ss&expand=5236) * [ ] 
[`_mm_maskz_cvt_roundsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_ss&expand=5236) * [ ] [`_mm_maskz_cvt_roundss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sd&expand=5236) * [ ] [`_mm_maskz_cvtsd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_ss&expand=5236) * [ ] [`_mm_maskz_cvtss_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sd&expand=5236) - * [ ] [`_mm_maskz_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sd&expand=5236) - * [ ] [`_mm_maskz_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_ss&expand=5236) + * [x] [`_mm_maskz_div_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sd&expand=5236) + * [x] [`_mm_maskz_div_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_ss&expand=5236) * [x] [`_mm_maskz_div_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sd&expand=5236) * [x] [`_mm_maskz_div_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ss&expand=5236) * [ ] [`_mm_maskz_fixupimm_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_sd&expand=5236) @@ -1372,8 +1372,8 @@ * [ ] [`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236) * [ ] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236) * [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236) - * [ ] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236) - * [ ] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236) + * [x] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236) + * [x] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236) * [x] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236) * [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236) * [ ] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236) @@ -1392,16 +1392,16 @@ * [ ] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236) * [ ] [`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236) * [ ] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236) - * [ ] [`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236) - * [ ] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236) + * [x] 
[`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236) + * [x] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236) * [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236) * [x] [`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236) * [ ] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236) * [ ] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236) * [ ] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236) * [ ] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236) - * [ ] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236) - * [ ] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236) + * [x] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236) + * [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236) * [ ] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236) * [ ] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236) * [ ] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236) @@ -1416,7 +1416,7 @@ * [ ] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236) * [ ] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236) * [ ] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236) - * [ ] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236) - * [ ] [`_mm_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_ss&expand=5236) + * [x] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236) + * [x] [`_mm_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_ss&expand=5236)
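A note on usage (editor's addition, not part of the patch): the avx512f.rs hunk that follows adds the explicit-rounding scalar forms (`_mm_add_round_ss`, `_mm_mask_sub_round_sd`, and friends) on top of the masked scalar forms from the previous patch. A minimal sketch of how the write-mask, zero-mask, and rounding parameters combine is shown below; it assumes a nightly toolchain where these core_arch intrinsics and AVX-512F are available, with the signatures exactly as declared in these diffs.

    // Illustrative sketch only, not part of the patch. It assumes the scalar
    // mask and rounding intrinsics added in patches 16-17 are exposed from
    // core::arch::x86_64 on a nightly toolchain with AVX-512F enabled.
    #[target_feature(enable = "avx512f")]
    unsafe fn demo() {
        use core::arch::x86_64::*;

        let src = _mm_set_ss(42.0);
        let a = _mm_set_ss(1.0);
        let b = _mm_set_ss(3.0);

        // Write-mask form: mask bit 0 is clear, so the lower lane is copied
        // from `src` and no division is performed.
        let kept = _mm_mask_div_ss(src, 0b0, a, b); // lower lane == 42.0

        // Zero-mask form: a clear mask bit 0 zeroes the lower lane instead.
        let zeroed = _mm_maskz_div_ss(0b0, a, b); // lower lane == 0.0

        // Explicit-rounding form: the constant selects the rounding mode and
        // suppresses exceptions instead of consulting MXCSR.RC.
        let summed = _mm_add_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);

        let _ = (kept, zeroed, summed);
    }

The `_round` variants in the hunk below follow the same mask conventions as the non-rounding forms; the only difference is the extra rounding immediate, which is threaded through to the underlying `vaddss`/`vsubsd`-style calls via the `constify_imm4_round!` macro.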

diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 852abfe349..4294ec92b5 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18697,6 +18697,722 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { transmute(r) } +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vaddss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_add_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vaddss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vaddsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_add_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! 
call { + ($imm4:expr) => { + vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_add_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vaddsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sub_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sub_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
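+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let a = _mm_set_pd(1., 2.);
+/// let b = _mm_set_pd(3., 4.);
+/// // mask bit 0 clear: lower lane is zeroed, upper lane copied from a
+/// let r = _mm_maskz_sub_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_pd(1., 0.); with mask 0b1 the lower lane is 2. - 4. = -2.
+/// ```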
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sub_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vmulss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
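+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let src = _mm_set_ps(10., 11., 100., 110.);
+/// let a = _mm_set_ps(1., 2., 10., 20.);
+/// let b = _mm_set_ps(3., 4., 30., 40.);
+/// // mask bit 0 clear: lower lane comes from src (110.), upper lanes from a
+/// let r = _mm_mask_mul_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_ps(1., 2., 10., 110.); with mask 0b1 the lower lane is 20. * 40. = 800.
+/// ```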
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_mul_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vmulss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! 
call { + ($imm4:expr) => { + vmulsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_mul_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_mul_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vmulsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
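+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let a = _mm_set_ps(1., 2., 10., 20.);
+/// let b = _mm_set_ps(3., 4., 30., 40.);
+/// // lower lane: 20. / 40. = 0.5, upper lanes copied from a
+/// let r = _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_ps(1., 2., 10., 0.5)
+/// ```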
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vdivss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_div_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vdivss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vdivsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
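+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let src = _mm_set_pd(10., 11.);
+/// let a = _mm_set_pd(1., 2.);
+/// let b = _mm_set_pd(3., 4.);
+/// // mask bit 0 set: lower lane = 2. / 4. = 0.5, upper lane copied from a
+/// let r = _mm_mask_div_round_sd(src, 0b1, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_pd(1., 0.5); with mask 0 the lower lane would be src's 11.
+/// ```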
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_div_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_div_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! 
call { + ($imm4:expr) => { + vdivsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + /// Equal pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; /// Less-than @@ -19299,6 +20015,23 @@ extern "C" { fn vexpandps(a: f32x16, src: f32x16, mask: u16) -> f32x16; #[link_name = "llvm.x86.avx512.mask.expand.pd.512"] fn vexpandpd(a: f64x8, src: f64x8, mask: u8) -> f64x8; + + #[link_name = "llvm.x86.avx512.mask.add.ss.round"] + fn vaddss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.add.sd.round"] + fn vaddsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.sub.ss.round"] + fn vsubss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.sub.sd.round"] + fn vsubsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.mul.ss.round"] + fn vmulss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.mul.sd.round"] + fn vmulsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.div.ss.round"] + fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.div.sd.round"] + fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; } #[cfg(test)] @@ -28482,4 +29215,324 @@ mod tests { let e = _mm_set_pd(1., 0.5); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_add_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_add_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_add_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_add_round_ss( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_add_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_add_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 60.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_add_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_add_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_add_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_add_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_add_round_sd( + src, + 
0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_add_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_add_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_add_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 6.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sub_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_sub_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_sub_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sub_round_ss( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_sub_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sub_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., -20.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sub_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_sub_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sub_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sub_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sub_round_sd( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sub_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sub_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sub_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., -2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mul_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mul_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = 
_mm_mask_mul_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_mul_round_ss( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_mul_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_mul_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 800.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mul_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mul_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_mul_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_mul_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_mul_round_sd( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_mul_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_mul_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_mul_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_div_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_div_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_div_round_ss( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_div_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_div_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_div_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_div_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + 
#[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_div_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_div_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_div_round_sd( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_div_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_div_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_div_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } } diff --git a/crates/core_arch/src/x86/mod.rs b/crates/core_arch/src/x86/mod.rs index 41cbd5029a..80dad4d64e 100644 --- a/crates/core_arch/src/x86/mod.rs +++ b/crates/core_arch/src/x86/mod.rs @@ -451,6 +451,24 @@ impl m128Ext for __m128 { } } +#[allow(non_camel_case_types)] +#[unstable(feature = "stdsimd_internal", issue = "none")] +pub(crate) trait m128dExt: Sized { + fn as_m128d(self) -> __m128d; + + #[inline] + fn as_f64x2(self) -> crate::core_arch::simd::f64x2 { + unsafe { transmute(self.as_m128d()) } + } +} + +impl m128dExt for __m128d { + #[inline] + fn as_m128d(self) -> Self { + self + } +} + #[allow(non_camel_case_types)] #[unstable(feature = "stdsimd_internal", issue = "none")] pub(crate) trait m256Ext: Sized { From 11b2b65badfcba141800a9486ade247a6768bb23 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 16:06:53 +0000 Subject: [PATCH 18/25] mask_sqrt: ss,sd; sqrt_round: ss,sd; --- crates/core_arch/avx512f.md | 20 +- crates/core_arch/src/x86/avx512f.rs | 377 ++++++++++++++++++++++++++++ 2 files changed, 387 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index cd93fa26b9..6224ef09c2 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1310,10 +1310,10 @@ * [ ] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236) * [ ] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236) * [ ] [`_mm_mask_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ss&expand=5236) - * [ ] [`_mm_mask_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sd&expand=5236) - * [ ] [`_mm_mask_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_ss&expand=5236) - * [ ] [`_mm_mask_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sd&expand=5236) - * [ ] [`_mm_mask_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ss&expand=5236) + * [x] [`_mm_mask_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sd&expand=5236) + * [x] [`_mm_mask_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_ss&expand=5236) + * [x] [`_mm_mask_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sd&expand=5236) + * [x] 
[`_mm_mask_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ss&expand=5236) * [ ] [`_mm_mask_store_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sd&expand=5236) * [ ] [`_mm_mask_store_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ss&expand=5236) * [x] [`_mm_mask_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sd&expand=5236) @@ -1388,10 +1388,10 @@ * [ ] [`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236) * [ ] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236) * [ ] [`_mm_maskz_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ss&expand=5236) - * [ ] [`_mm_maskz_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sd&expand=5236) - * [ ] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236) - * [ ] [`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236) - * [ ] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236) + * [x] [`_mm_maskz_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sd&expand=5236) + * [x] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236) + * [x] [`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236) + * [x] [`_mm_maskz_sqrt_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ss&expand=5236) * [x] [`_mm_maskz_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sd&expand=5236) * [x] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236) * [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236) @@ -1414,8 +1414,8 @@ * [ ] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236) * [ ] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236) * [ ] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236) - * [ ] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236) - * [ ] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236) + * [x] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236) + * [x] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236) * [x] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236) * [x] [`_mm_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_ss&expand=5236) diff 
--git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 4294ec92b5..6ec24d2a00 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18697,6 +18697,70 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { transmute(r) } +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss))] +pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_ss&expand=5388) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss))] +pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_sd&expand=5384) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_sd&expand=5385) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd))] +pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + )) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -19413,6 +19477,185 @@ pub unsafe fn _mm_maskz_div_round_sd( transmute(constify_imm4_round!(rounding, call)) } +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sqrt_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
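+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let a = _mm_set_ps(1., 2., 10., 20.);
+/// let b = _mm_set_ps(3., 4., 30., 4.);
+/// // mask bit 0 clear: lower lane is zeroed, upper lanes copied from a
+/// let r = _mm_maskz_sqrt_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_ps(1., 2., 10., 0.); with mask 0b1 the lower lane is sqrt(4.) = 2.
+/// ```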
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsqrtss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
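+///
+/// A minimal usage sketch (values mirror the test added later in this patch):
+///
+/// ```text
+/// let src = _mm_set_pd(10., 11.);
+/// let a = _mm_set_pd(1., 2.);
+/// let b = _mm_set_pd(3., 4.);
+/// // mask bit 0 set: lower lane = sqrt(4.) = 2., upper lane copied from a
+/// let r = _mm_mask_sqrt_round_sd(src, 0b1, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+/// // r == _mm_set_pd(1., 2.); with mask 0 the lower lane would be src's 11.
+/// ```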
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sqrt_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sqrt_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! 
call { + ($imm4:expr) => { + vsqrtsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + /// Equal pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; /// Less-than @@ -20032,6 +20275,10 @@ extern "C" { fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.div.sd.round"] fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.sqrt.ss"] + fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] + fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; } #[cfg(test)] @@ -29216,6 +29463,56 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_sqrt_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sqrt_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_sqrt_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sqrt_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sqrt_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sqrt_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sqrt_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sqrt_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); @@ -29535,4 +29832,84 @@ mod tests { let e = _mm_set_pd(1., 0.5); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sqrt_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_round_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_sqrt_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_sqrt_round_ss( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_round_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = 
_mm_maskz_sqrt_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_sqrt_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 2., 10., 2.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_sqrt_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_sqrt_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_sqrt_round_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_sqrt_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_sqrt_round_sd( + src, + 0b11111111, + a, + b, + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_sqrt_round_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_sqrt_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_sqrt_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 2.); + assert_eq_m128d(r, e); + } } From 1afd1b92d9b2f588338dd9e1fdf8e66ab8841d01 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 16:37:19 +0000 Subject: [PATCH 19/25] mask_max: ss,sd; mask_min: ss,sd; max_round: ss,sd; min_round: ss,sd; --- crates/core_arch/avx512f.md | 44 +- crates/core_arch/src/x86/avx512f.rs | 640 ++++++++++++++++++++++++++++ 2 files changed, 662 insertions(+), 22 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 6224ef09c2..701b65d560 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1284,18 +1284,18 @@ * [ ] [`_mm_mask_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ss&expand=5236) * [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd&expand=5236) * [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss&expand=5236) - * [ ] [`_mm_mask_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sd&expand=5236) - * [ ] [`_mm_mask_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_ss&expand=5236) - * [ ] [`_mm_mask_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sd&expand=5236) - * [ ] [`_mm_mask_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ss&expand=5236) - * [ ] [`_mm_mask_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sd&expand=5236) - * [ ] [`_mm_mask_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_ss&expand=5236) - * [ ] [`_mm_mask_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sd&expand=5236) - * [ ] [`_mm_mask_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ss&expand=5236) + * [x] 
[`_mm_mask_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sd&expand=5236) + * [x] [`_mm_mask_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_ss&expand=5236) + * [x] [`_mm_mask_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sd&expand=5236) + * [x] [`_mm_mask_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ss&expand=5236) + * [x] [`_mm_mask_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sd&expand=5236) + * [x] [`_mm_mask_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_ss&expand=5236) + * [x] [`_mm_mask_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sd&expand=5236) + * [x] [`_mm_mask_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ss&expand=5236) * [ ] [`_mm_mask_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sd&expand=5236) * [ ] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236) - * [ ] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236) - * [ ] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236) + * [x] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236) + * [x] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236) * [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236) * [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236) * [ ] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236) @@ -1362,14 +1362,14 @@ * [ ] [`_mm_maskz_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ss&expand=5236) * [ ] [`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd&expand=5236) * [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss&expand=5236) - * [ ] [`_mm_maskz_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sd&expand=5236) - * [ ] [`_mm_maskz_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_ss&expand=5236) - * [ ] [`_mm_maskz_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sd&expand=5236) - * [ ] [`_mm_maskz_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ss&expand=5236) - * [ ] [`_mm_maskz_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sd&expand=5236) - * [ ] [`_mm_maskz_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_ss&expand=5236) - * [ ] [`_mm_maskz_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sd&expand=5236) - * [ ] 
[`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236) + * [x] [`_mm_maskz_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sd&expand=5236) + * [x] [`_mm_maskz_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_ss&expand=5236) + * [x] [`_mm_maskz_max_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sd&expand=5236) + * [x] [`_mm_maskz_max_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ss&expand=5236) + * [x] [`_mm_maskz_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sd&expand=5236) + * [x] [`_mm_maskz_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_ss&expand=5236) + * [x] [`_mm_maskz_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sd&expand=5236) + * [x] [`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236) * [ ] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236) * [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236) * [x] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236) @@ -1396,10 +1396,10 @@ * [x] [`_mm_maskz_sub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_ss&expand=5236) * [x] [`_mm_maskz_sub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sd&expand=5236) * [x] [`_mm_maskz_sub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ss&expand=5236) - * [ ] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236) - * [ ] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236) - * [ ] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236) - * [ ] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236) + * [x] [`_mm_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sd&expand=5236) + * [x] [`_mm_max_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_ss&expand=5236) + * [x] [`_mm_min_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sd&expand=5236) + * [x] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236) * [x] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236) * [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236) * [ ] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 6ec24d2a00..185fba00e9 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ 
-18697,6 +18697,134 @@ pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { transmute(r) } +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmaxss))] +pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_ss&expand=3673) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmaxss))] +pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vmaxss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_sd&expand=3669) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmaxsd))] +pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmaxsd))] +pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vmaxsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_ss&expand=3786) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminss))] +pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_ss&expand=3787) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminss))] +pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vminss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_sd&expand=3783) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminsd))] +pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_sd&expand=3784) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vminsd))] +pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vminsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387) @@ -19477,6 +19605,282 @@ pub unsafe fn _mm_maskz_div_round_sd( transmute(constify_imm4_round!(rounding, call)) } +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxss(
+                a.as_f32x4(),
+                b.as_f32x4(),
+                _mm_setzero_ps().as_f32x4(),
+                0b1,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_ss&expand=3672)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_max_round_ss(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+    sae: i32,
+) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxss(
+                a.as_f32x4(),
+                b.as_f32x4(),
+                _mm_setzero_ps().as_f32x4(),
+                k,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxsd(
+                a.as_f64x2(),
+                b.as_f64x2(),
+                _mm_setzero_pd().as_f64x2(),
+                0b1,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_max_round_sd(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+    sae: i32,
+) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_sd&expand=3670)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vmaxsd(
+                a.as_f64x2(),
+                b.as_f64x2(),
+                _mm_setzero_pd().as_f64x2(),
+                k,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vminss(
+                a.as_f32x4(),
+                b.as_f32x4(),
+                _mm_setzero_ps().as_f32x4(),
+                0b1,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_ss&expand=3780)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_min_round_ss(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+    sae: i32,
+) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminss, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vminss(
+                a.as_f32x4(),
+                b.as_f32x4(),
+                _mm_setzero_ps().as_f32x4(),
+                k,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[rustc_args_required_const(2)]
+pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vminsd(
+                a.as_f64x2(),
+                b.as_f64x2(),
+                _mm_setzero_pd().as_f64x2(),
+                0b1,
+                $imm4,
+            )
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[rustc_args_required_const(4)]
+pub unsafe fn _mm_mask_min_round_sd(
+    src: __m128d,
+    k: __mmask8,
+    a: __m128d,
+    b: __m128d,
+    sae: i32,
+) -> __m128d {
+    macro_rules! call {
+        ($imm4:expr) => {
+            vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
+        };
+    }
+    transmute(constify_imm4_sae!(sae, call))
+}
+
+/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_sd&expand=3778)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vminsd, sae = 8))]
+#[rustc_args_required_const(3)]
+pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
+    macro_rules!
call { + ($imm4:expr) => { + vminsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_sae!(sae, call)) +} + /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -20275,6 +20679,14 @@ extern "C" { fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.div.sd.round"] fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.max.ss.round"] + fn vmaxss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.max.sd.round"] + fn vmaxsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.min.ss.round"] + fn vminss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.min.sd.round"] + fn vminsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; #[link_name = "llvm.x86.avx512.mask.sqrt.ss"] fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] @@ -29463,6 +29875,102 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_max_ss(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_max_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_max_ss(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_max_ss(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_max_sd(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_max_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_max_sd(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_max_sd(0b11111111, a, b); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_min_ss(a, 0, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_min_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_min_ss(0, a, b); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_min_ss(0b11111111, a, b); + let e = _mm_set_ps(0., 1., 2., 3.); + 
assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_min_sd(a, 0, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_min_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_min_sd(0, a, b); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_min_sd(0b11111111, a, b); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_mask_sqrt_ss() { let src = _mm_set_ps(10., 11., 100., 110.); @@ -29833,6 +30341,138 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_max_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_max_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_max_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_max_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_max_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_max_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_max_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_max_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_max_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_max_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_max_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 3.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_min_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_mask_min_round_ss(a, 0, a, b, 
_MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + let r = _mm_mask_min_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_round_ss() { + let a = _mm_set_ps(0., 1., 2., 3.); + let b = _mm_set_ps(4., 5., 6., 7.); + let r = _mm_maskz_min_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_min_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(0., 1., 2., 3.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_min_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_mask_min_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_min_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_min_round_sd() { + let a = _mm_set_pd(0., 1.); + let b = _mm_set_pd(2., 3.); + let r = _mm_maskz_min_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_min_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(0., 1.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_sqrt_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); From eb58b50a418d547b0a1c518641568e0fa6aaf1fa Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 17:20:07 +0000 Subject: [PATCH 20/25] rsqrt14: ss,sd; rcp14: ss,sd --- crates/core_arch/avx512f.md | 24 +-- crates/core_arch/src/x86/avx512f.rs | 305 ++++++++++++++++++++++++++++ 2 files changed, 317 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 701b65d560..6483ea7460 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1298,14 +1298,14 @@ * [x] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236) * [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236) * [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236) - * [ ] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236) - * [ ] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236) + * [x] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236) + * [x] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236) * [ ] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236) * [ ] 
[`_mm_mask_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_ss&expand=5236) * [ ] [`_mm_mask_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sd&expand=5236) * [ ] [`_mm_mask_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ss&expand=5236) - * [ ] [`_mm_mask_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_sd&expand=5236) - * [ ] [`_mm_mask_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ss&expand=5236) + * [x] [`_mm_mask_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_sd&expand=5236) + * [x] [`_mm_mask_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ss&expand=5236) * [ ] [`_mm_mask_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sd&expand=5236) * [ ] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236) * [ ] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236) @@ -1376,14 +1376,14 @@ * [x] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236) * [x] [`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236) * [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236) - * [ ] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236) - * [ ] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236) + * [x] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236) + * [x] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236) * [ ] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236) * [ ] [`_mm_maskz_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_ss&expand=5236) * [ ] [`_mm_maskz_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sd&expand=5236) * [ ] [`_mm_maskz_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ss&expand=5236) - * [ ] [`_mm_maskz_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_sd&expand=5236) - * [ ] [`_mm_maskz_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ss&expand=5236) + * [x] [`_mm_maskz_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_sd&expand=5236) + * [x] [`_mm_maskz_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ss&expand=5236) * [ ] [`_mm_maskz_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sd&expand=5236) * [ ] 
[`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236) * [ ] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236) @@ -1402,14 +1402,14 @@ * [x] [`_mm_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_ss&expand=5236) * [x] [`_mm_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sd&expand=5236) * [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236) - * [ ] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236) - * [ ] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236) + * [x] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236) + * [x] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236) * [ ] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236) * [ ] [`_mm_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_ss&expand=5236) * [ ] [`_mm_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sd&expand=5236) * [ ] [`_mm_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ss&expand=5236) - * [ ] [`_mm_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_sd&expand=5236) - * [ ] [`_mm_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ss&expand=5236) + * [x] [`_mm_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_sd&expand=5236) + * [x] [`_mm_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ss&expand=5236) * [ ] [`_mm_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sd&expand=5236) * [ ] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236) * [ ] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 185fba00e9..8c93b318dc 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18889,6 +18889,166 @@ pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d )) } +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_ss&expand=4825) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + )) +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_ss&expand=4823) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +} + +/// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_ss&expand=4824) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14ss))] +pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrsqrt14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + )) +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_sd&expand=4822) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + )) +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_sd&expand=4820) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +} + +/// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_sd&expand=4821) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrsqrt14sd))] +pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrsqrt14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + )) +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_ss&expand=4508) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + )) +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_ss&expand=4506) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k)) +} + +/// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_ss&expand=4507) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14ss))] +pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vrcp14ss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + )) +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_sd&expand=4505) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + )) +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_sd&expand=4503) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k)) +} + +/// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_sd&expand=4504) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrcp14sd))] +pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vrcp14sd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + )) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -20691,6 +20851,15 @@ extern "C" { fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.rsqrt14.ss"] + fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.rsqrt14.sd"] + fn vrsqrt14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; + #[link_name = "llvm.x86.avx512.rcp14.ss"] + fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; + #[link_name = "llvm.x86.avx512.rcp14.sd"] + fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; } #[cfg(test)] @@ -30021,6 +30190,142 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rsqrt14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_rsqrt14_ss(a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rsqrt14_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_rsqrt14_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_rsqrt14_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rsqrt14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_rsqrt14_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_rsqrt14_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.5); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rsqrt14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_rsqrt14_sd(a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rsqrt14_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_rsqrt14_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_rsqrt14_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rsqrt14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_rsqrt14_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_rsqrt14_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.5); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rcp14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_rcp14_ss(a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rcp14_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_mask_rcp14_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_rcp14_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + 
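+    // The exact-equality assertions in the rcp14/rsqrt14 tests rely on the inputs being
+    // powers of two, for which the hardware approximations happen to come out exact. A
+    // rough sketch of how a test for an arbitrary input might instead check the documented
+    // 2^-14 relative-error bound (the input value 3.0 here is illustrative only):
+    //
+    //     let b = _mm_set_ps(0., 0., 0., 3.);
+    //     let r = _mm_cvtss_f32(_mm_rcp14_ss(b, b));
+    //     let exact = 1.0_f32 / 3.0;
+    //     assert!(((r - exact) / exact).abs() < 2.0_f32.powi(-14));
+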
#[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rcp14_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 4.); + let r = _mm_maskz_rcp14_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_rcp14_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 0.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_rcp14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_rcp14_sd(a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_rcp14_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_rcp14_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_rcp14_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_rcp14_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_rcp14_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_rcp14_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 0.25); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); From b8a68500fcd74c239820a35023ccfab809ed9f99 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 18:08:10 +0000 Subject: [PATCH 21/25] getexp: ss,sd; getexp_round: ss,sd; getmant: ss,sd; getmant_round: ss,sd; --- crates/core_arch/avx512f.md | 48 +- crates/core_arch/src/x86/avx512f.rs | 1896 +++++++++++++++++++++------ 2 files changed, 1507 insertions(+), 437 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 6483ea7460..5b2aaa1c21 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1214,14 +1214,14 @@ * [ ] [`_mm_fnmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_ss&expand=5236) * [ ] [`_mm_fnmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sd&expand=5236) * [ ] [`_mm_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_ss&expand=5236) - * [ ] [`_mm_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sd&expand=5236) - * [ ] [`_mm_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_ss&expand=5236) - * [ ] [`_mm_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sd&expand=5236) - * [ ] [`_mm_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ss&expand=5236) - * [ ] [`_mm_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sd&expand=5236) - * [ ] [`_mm_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_ss&expand=5236) - * [ ] [`_mm_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sd&expand=5236) - * [ ] [`_mm_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ss&expand=5236) + * [x] [`_mm_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sd&expand=5236) + * [x] 
[`_mm_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_ss&expand=5236) + * [x] [`_mm_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sd&expand=5236) + * [x] [`_mm_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ss&expand=5236) + * [x] [`_mm_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sd&expand=5236) + * [x] [`_mm_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_ss&expand=5236) + * [x] [`_mm_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sd&expand=5236) + * [x] [`_mm_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ss&expand=5236) * [ ] [`_mm_mask3_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sd&expand=5236) * [ ] [`_mm_mask3_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_ss&expand=5236) * [ ] [`_mm_mask3_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sd&expand=5236) @@ -1274,14 +1274,14 @@ * [ ] [`_mm_mask_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_ss&expand=5236) * [ ] [`_mm_mask_fnmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sd&expand=5236) * [ ] [`_mm_mask_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ss&expand=5236) - * [ ] [`_mm_mask_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sd&expand=5236) - * [ ] [`_mm_mask_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_ss&expand=5236) - * [ ] [`_mm_mask_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sd&expand=5236) - * [ ] [`_mm_mask_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ss&expand=5236) - * [ ] [`_mm_mask_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sd&expand=5236) - * [ ] [`_mm_mask_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_ss&expand=5236) - * [ ] [`_mm_mask_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sd&expand=5236) - * [ ] [`_mm_mask_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ss&expand=5236) + * [x] [`_mm_mask_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sd&expand=5236) + * [x] [`_mm_mask_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_ss&expand=5236) + * [x] [`_mm_mask_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sd&expand=5236) + * [x] [`_mm_mask_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ss&expand=5236) + * [x] [`_mm_mask_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sd&expand=5236) + * [x] 
[`_mm_mask_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_ss&expand=5236) + * [x] [`_mm_mask_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sd&expand=5236) + * [x] [`_mm_mask_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ss&expand=5236) * [ ] [`_mm_mask_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sd&expand=5236) * [ ] [`_mm_mask_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ss&expand=5236) * [x] [`_mm_mask_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sd&expand=5236) @@ -1352,14 +1352,14 @@ * [ ] [`_mm_maskz_fnmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_ss&expand=5236) * [ ] [`_mm_maskz_fnmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sd&expand=5236) * [ ] [`_mm_maskz_fnmsub_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ss&expand=5236) - * [ ] [`_mm_maskz_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sd&expand=5236) - * [ ] [`_mm_maskz_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_ss&expand=5236) - * [ ] [`_mm_maskz_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sd&expand=5236) - * [ ] [`_mm_maskz_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ss&expand=5236) - * [ ] [`_mm_maskz_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sd&expand=5236) - * [ ] [`_mm_maskz_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_ss&expand=5236) - * [ ] [`_mm_maskz_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sd&expand=5236) - * [ ] [`_mm_maskz_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ss&expand=5236) + * [x] [`_mm_maskz_getexp_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sd&expand=5236) + * [x] [`_mm_maskz_getexp_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_ss&expand=5236) + * [x] [`_mm_maskz_getexp_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sd&expand=5236) + * [x] [`_mm_maskz_getexp_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ss&expand=5236) + * [x] [`_mm_maskz_getmant_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sd&expand=5236) + * [x] [`_mm_maskz_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_ss&expand=5236) + * [x] [`_mm_maskz_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sd&expand=5236) + * [x] [`_mm_maskz_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ss&expand=5236) * [ ] 
[`_mm_maskz_load_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sd&expand=5236) * [ ] [`_mm_maskz_load_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ss&expand=5236) * [x] [`_mm_maskz_max_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 8c93b318dc..24b6245e17 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -19049,6 +19049,342 @@ pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d )) } +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_ss&expand=2862) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + _MM_FROUND_NO_EXC, + )) +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_ss&expand=2863) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) +} + +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_ss&expand=2864) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpss))] +pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vgetexpss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_NO_EXC, + )) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_sd&expand=2859) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + _MM_FROUND_NO_EXC, + )) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_sd&expand=2860) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) +} + +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_sd&expand=2861) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetexpsd))] +pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vgetexpsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_NO_EXC, + )) +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_ss&expand=2898) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))] +#[rustc_args_required_const(2, 3)] +pub unsafe fn _mm_getmant_ss( + a: __m128, + b: __m128, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128 { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantss( + a.as_f32x4(), + b.as_f32x4(), + $imm2 << 2 | $imm4_1, + _mm_setzero_ps().as_f32x4(), + 0b1, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_ss&expand=2899) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm_mask_getmant_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128 { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantss( + a.as_f32x4(), + b.as_f32x4(), + $imm2 << 2 | $imm4_1, + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_ss&expand=2900) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm_maskz_getmant_ss( + k: __mmask8, + a: __m128, + b: __m128, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128 { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantss( + a.as_f32x4(), + b.as_f32x4(), + $imm2 << 2 | $imm4_1, + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_sd&expand=2895) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))] +#[rustc_args_required_const(2, 3)] +pub unsafe fn _mm_getmant_sd( + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + _mm_setzero_pd().as_f64x2(), + 0b1, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_sd&expand=2896) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm_mask_getmant_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_sd&expand=2897) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm_maskz_getmant_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm4_mantissas!(norm, sign, call); + transmute(r) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -19058,15 +19394,373 @@ pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! 
call { + ($imm4:expr) => { + vaddss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_add_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vaddss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vaddsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_add_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_add_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vaddsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sub_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vsubss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_sub_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_sub_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { + macro_rules! call { + ($imm4:expr) => { + vsubsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm4, + ) + }; + } + transmute(constify_imm4_round!(rounding, call)) +} + +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vaddss( + vmulss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19078,7 +19772,7 @@ pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { transmute(constify_imm4_round!(rounding, call)) } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19087,12 +19781,12 @@ pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_add_round_ss( +pub unsafe fn _mm_mask_mul_round_ss( src: __m128, k: __mmask8, a: __m128, @@ -19101,13 +19795,13 @@ pub unsafe fn _mm_mask_add_round_ss( ) -> __m128 { macro_rules! call { ($imm4:expr) => { - vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } transmute(constify_imm4_round!(rounding, call)) } -/// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19116,15 +19810,15 @@ pub unsafe fn _mm_mask_add_round_ss( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddss, rounding = 8))] +#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vaddss( + vmulss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19136,7 +19830,7 @@ pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding transmute(constify_imm4_round!(rounding, call)) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19145,15 +19839,15 @@ pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vaddsd( + vmulsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19165,7 +19859,7 @@ pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d transmute(constify_imm4_round!(rounding, call)) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19174,12 +19868,12 @@ pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_add_round_sd( +pub unsafe fn _mm_mask_mul_round_sd( src: __m128d, k: __mmask8, a: __m128d, @@ -19188,13 +19882,13 @@ pub unsafe fn _mm_mask_add_round_sd( ) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } transmute(constify_imm4_round!(rounding, call)) } -/// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19203,12 +19897,12 @@ pub unsafe fn _mm_mask_add_round_sd( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vaddsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_add_round_sd( +pub unsafe fn _mm_maskz_mul_round_sd( k: __mmask8, a: __m128d, b: __m128d, @@ -19216,7 +19910,7 @@ pub unsafe fn _mm_maskz_add_round_sd( ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vaddsd( + vmulsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19228,7 +19922,7 @@ pub unsafe fn _mm_maskz_add_round_sd( transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19237,15 +19931,15 @@ pub unsafe fn _mm_maskz_add_round_sd( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vsubss( + vdivss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19257,7 +19951,7 @@ pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19266,12 +19960,12 @@ pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sub_round_ss( +pub unsafe fn _mm_mask_div_round_ss( src: __m128, k: __mmask8, a: __m128, @@ -19280,13 +19974,13 @@ pub unsafe fn _mm_mask_sub_round_ss( ) -> __m128 { macro_rules! 
call { ($imm4:expr) => { - vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19295,15 +19989,15 @@ pub unsafe fn _mm_mask_sub_round_ss( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubss, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vsubss( + vdivss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19315,7 +20009,7 @@ pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
/// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19324,15 +20018,15 @@ pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vsubsd( + vdivsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19344,7 +20038,7 @@ pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19353,12 +20047,12 @@ pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sub_round_sd( +pub unsafe fn _mm_mask_div_round_sd( src: __m128d, k: __mmask8, a: __m128d, @@ -19367,13 +20061,13 @@ pub unsafe fn _mm_mask_sub_round_sd( ) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } transmute(constify_imm4_round!(rounding, call)) } -/// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions @@ -19382,12 +20076,12 @@ pub unsafe fn _mm_mask_sub_round_sd( /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsubsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sub_round_sd( +pub unsafe fn _mm_maskz_div_round_sd( k: __mmask8, a: __m128d, b: __m128d, @@ -19395,7 +20089,7 @@ pub unsafe fn _mm_maskz_sub_round_sd( ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vsubsd( + vdivsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19407,24 +20101,18 @@ pub unsafe fn _mm_maskz_sub_round_sd( transmute(constify_imm4_round!(rounding, call)) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
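A minimal usage sketch for the _mm_div_round_ss / _mm_maskz_div_round_ss wrappers introduced above, assuming a nightly toolchain that exposes these AVX512F intrinsics and an avx512f-capable CPU; the helper name and input values are illustrative only:

#[target_feature(enable = "avx512f")]
unsafe fn div_round_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(3.0);
    // Low lane: 1.0 / 3.0 truncated toward zero; the upper three lanes are copied from `a`.
    let r = _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    // Zero-masking: mask bit 0 is clear, so the low lane of the result is zeroed out.
    let z = _mm_maskz_div_round_ss(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    let _ = (r, z);
}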
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vmulss( + vmaxss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19433,56 +20121,44 @@ pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_mul_round_ss( +pub unsafe fn _mm_mask_max_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, + sae: i32, ) -> __m128 { macro_rules! call { ($imm4:expr) => { - vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulss, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vmulss( + vmaxss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19491,27 +20167,21 @@ pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vmulsd( + vmaxsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19519,62 +20189,45 @@ pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d $imm4, ) }; - } - transmute(constify_imm4_round!(rounding, call)) -} - -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE + } + transmute(constify_imm4_sae!(sae, call)) +} + +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_mul_round_sd( +pub unsafe fn _mm_mask_max_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, + sae: i32, ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmulsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_mul_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, - rounding: i32, -) -> __m128d { +pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vmulsd( + vmaxsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19583,27 +20236,21 @@ pub unsafe fn _mm_maskz_mul_round_sd( ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[cfg_attr(test, assert_instr(vminss, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vdivss( + vminss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19612,56 +20259,44 @@ pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
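With _mm_maskz_max_round_sd the max_round family above is complete; unlike the arithmetic wrappers it goes through constify_imm4_sae!, so the immediate only controls exception suppression. A usage sketch under the same assumptions as the earlier one (values illustrative):

#[target_feature(enable = "avx512f")]
unsafe fn max_round_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set_sd(2.0);
    let b = _mm_set_sd(7.5);
    // Low lane: max(2.0, 7.5) = 7.5; the upper lane is copied from `a`.
    let r = _mm_max_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
    // Merge-masking: mask bit 0 is clear, so the low lane is taken from `src` instead.
    let src = _mm_set_sd(-1.0);
    let m = _mm_mask_max_round_sd(src, 0, a, b, _MM_FROUND_CUR_DIRECTION);
    let _ = (r, m);
}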
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[cfg_attr(test, assert_instr(vminss, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_div_round_ss( +pub unsafe fn _mm_mask_min_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, + sae: i32, ) -> __m128 { macro_rules! call { ($imm4:expr) => { - vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivss, rounding = 8))] +#[cfg_attr(test, assert_instr(vminss, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vdivss( + vminss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19670,27 +20305,21 @@ pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vminsd, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vdivsd( + vminsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19699,61 +20328,44 @@ pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vminsd, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_div_round_sd( +pub unsafe fn _mm_mask_min_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, + sae: i32, ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vdivsd, rounding = 8))] +#[cfg_attr(test, assert_instr(vminsd, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_div_round_sd( - k: __mmask8, - a: __m128d, - b: __m128d, - rounding: i32, -) -> __m128d { +pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vdivsd( + vminsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19762,21 +20374,27 @@ pub unsafe fn _mm_maskz_div_round_sd( ) }; } - transmute(constify_imm4_round!(rounding, call)) + transmute(constify_imm4_sae!(sae, call)) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { +pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vmaxss( + vsqrtss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19785,44 +20403,56 @@ pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { ) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. 
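The min_round wrappers just above mirror the max ones exactly, before the conversion to the sqrt intrinsics begins below. A brief zero-masking sketch, under the same assumptions, with _MM_FROUND_NO_EXC passed as the sae value to suppress exceptions:

#[target_feature(enable = "avx512f")]
unsafe fn min_round_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(4.0);
    let b = _mm_set_ss(-0.5);
    // Mask bit 0 is set, so the low lane holds min(4.0, -0.5) = -0.5.
    let r = _mm_maskz_min_round_ss(0b1, a, b, _MM_FROUND_NO_EXC);
    let _ = r;
}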
+/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_max_round_ss( +pub unsafe fn _mm_mask_sqrt_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - sae: i32, + rounding: i32, ) -> __m128 { macro_rules! call { ($imm4:expr) => { - vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxss, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { +pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! 
call { ($imm4:expr) => { - vmaxss( + vsqrtss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19831,21 +20461,27 @@ pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32 ) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { +pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { - vmaxsd( + vsqrtsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19854,44 +20490,61 @@ pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { ) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
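The sqrt_round wrappers take a full rounding-mode immediate again (constify_imm4_round!), and the value being rooted comes from b while the untouched lanes come from a. A short sketch, same assumptions, with illustrative values:

#[target_feature(enable = "avx512f")]
unsafe fn sqrt_round_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(0.0);
    let b = _mm_set_ss(2.0);
    // Low lane: sqrt(2.0) rounded toward +infinity; the upper three lanes are copied from `a`.
    let r = _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    let _ = r;
}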
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_max_round_sd( +pub unsafe fn _mm_mask_sqrt_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - sae: i32, + rounding: i32, ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vmaxsd, sae = 8))] +#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { +pub unsafe fn _mm_maskz_sqrt_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + rounding: i32, +) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vmaxsd( + vsqrtsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19900,21 +20553,21 @@ pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i ) }; } - transmute(constify_imm4_sae!(sae, call)) + transmute(constify_imm4_round!(rounding, call)) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { +pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vminss( + vgetexpss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19923,18 +20576,19 @@ pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 { ) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_Ss&expand=3780) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_min_round_ss( +pub unsafe fn _mm_mask_getexp_round_ss( src: __m128, k: __mmask8, a: __m128, @@ -19943,24 +20597,25 @@ pub unsafe fn _mm_mask_min_round_ss( ) -> __m128 { macro_rules! 
call { ($imm4:expr) => { - vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + vgetexpss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminss, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpss, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { +pub unsafe fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { - vminss( + vgetexpss( a.as_f32x4(), b.as_f32x4(), _mm_setzero_ps().as_f32x4(), @@ -19969,21 +20624,22 @@ pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32 ) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst , and copy the upper element from a to the upper element of dst. +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] #[rustc_args_required_const(2)] -pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { +pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vminsd( + vgetexpsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -19992,18 +20648,19 @@ pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d { ) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] #[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_min_round_sd( +pub unsafe fn _mm_mask_getexp_round_sd( src: __m128d, k: __mmask8, a: __m128d, @@ -20012,24 +20669,25 @@ pub unsafe fn _mm_mask_min_round_sd( ) -> __m128d { macro_rules! call { ($imm4:expr) => { - vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + vgetexpsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element. /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_Sd&expand=3778) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vminsd, sae = 8))] +#[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))] #[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { +pub unsafe fn _mm_maskz_getexp_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d { macro_rules! 
call { ($imm4:expr) => { - vminsd( + vgetexpsd( a.as_f64x2(), b.as_f64x2(), _mm_setzero_pd().as_f64x2(), @@ -20038,186 +20696,254 @@ pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i ) }; } - transmute(constify_imm4_sae!(sae, call)) + let r = constify_imm4_sae!(sae, call); + transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. -/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_ss&expand=2892) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(2, 3, 4)] +pub unsafe fn _mm_getmant_round_ss( + a: __m128, + b: __m128, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128 { macro_rules! call { - ($imm4:expr) => { - vsqrtss( + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantss( a.as_f32x4(), b.as_f32x4(), + $imm2 << 2 | $imm4_1, _mm_setzero_ps().as_f32x4(), 0b1, - $imm4, + $imm4_2, ) }; } - transmute(constify_imm4_round!(rounding, call)) + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
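As the documentation above notes, getexp effectively computes floor(log2(x)) of the low element of b and returns it as a floating-point value. A sketch under the same assumptions (values illustrative):

#[target_feature(enable = "avx512f")]
unsafe fn getexp_round_sketch() {
    use core::arch::x86_64::*;
    let a = _mm_set_ss(0.0);
    let b = _mm_set_ss(10.0);
    // floor(log2(10.0)) = 3.0 lands in the low lane; the upper lanes come from `a`.
    let r = _mm_getexp_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
    let _ = r;
}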
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_ss&expand=2893) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sqrt_round_ss( +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(4, 5, 6)] +pub unsafe fn _mm_mask_getmant_round_ss( src: __m128, k: __mmask8, a: __m128, b: __m128, - rounding: i32, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, ) -> __m128 { macro_rules! call { - ($imm4:expr) => { - vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantss( + a.as_f32x4(), + b.as_f32x4(), + $imm2 << 2 | $imm4_1, + src.as_f32x4(), + k, + $imm4_2, + ) }; } - transmute(constify_imm4_round!(rounding, call)) + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) } -/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_ss&expand=2894) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 { +#[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(3, 4, 5)] +pub unsafe fn _mm_maskz_getmant_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128 { macro_rules! call { - ($imm4:expr) => { - vsqrtss( + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantss( a.as_f32x4(), b.as_f32x4(), + $imm2 << 2 | $imm4_1, _mm_setzero_ps().as_f32x4(), k, - $imm4, + $imm4_2, ) }; } - transmute(constify_imm4_round!(rounding, call)) + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
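The getmant wrappers fold the norm and sign selectors into a single immediate via the $imm2 << 2 | $imm4_1 packing in the call! macros above. A plain-Rust sketch of that packing follows; the numeric selector values are assumptions based on Intel's documented VGETMANT encoding (norm in imm8[1:0], sign in imm8[3:2]) and are not taken from this patch:

// Assumed selector encodings (hypothetical constants for illustration).
const MANT_NORM_P75_1P5: u8 = 0b11; // interval [0.75, 1.5)
const MANT_SIGN_ZERO: u8 = 0b01; // force the sign to 0

// Mirrors the `$imm2 << 2 | $imm4_1` packing used by the macros above.
fn getmant_imm8(norm: u8, sign: u8) -> u8 {
    (sign << 2) | norm
}
// e.g. getmant_imm8(MANT_NORM_P75_1P5, MANT_SIGN_ZERO) == 0b0111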
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(2)] -pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(2, 3, 4)] +pub unsafe fn _mm_getmant_round_sd( + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128d { macro_rules! call { - ($imm4:expr) => { - vsqrtsd( + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( a.as_f64x2(), b.as_f64x2(), + $imm2 << 2 | $imm4_1, _mm_setzero_pd().as_f64x2(), 0b1, - $imm4, + $imm4_2, ) - }; - } - transmute(constify_imm4_round!(rounding, call)) -} - -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE + }; + } + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(4)] -pub unsafe fn _mm_mask_sqrt_round_sd( +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(4, 5, 6)] +pub unsafe fn _mm_mask_getmant_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, ) -> __m128d { macro_rules! call { - ($imm4:expr) => { - vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + src.as_f64x2(), + k, + $imm4_2, + ) }; } - transmute(constify_imm4_round!(rounding, call)) + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) } -/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
-/// -/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: -/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions -/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions -/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions -/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions -/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))] -#[rustc_args_required_const(3)] -pub unsafe fn _mm_maskz_sqrt_round_sd( +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(3, 4, 5)] +pub unsafe fn _mm_maskz_getmant_round_sd( k: __mmask8, a: __m128d, b: __m128d, - rounding: i32, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, ) -> __m128d { macro_rules! 
call { - ($imm4:expr) => { - vsqrtsd( + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( a.as_f64x2(), b.as_f64x2(), + $imm2 << 2 | $imm4_1, _mm_setzero_pd().as_f64x2(), k, - $imm4, + $imm4_2, ) }; } - transmute(constify_imm4_round!(rounding, call)) + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) } /// Equal @@ -20851,6 +21577,14 @@ extern "C" { fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.sqrt.sd"] fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.getexp.ss"] + fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.getexp.sd"] + fn vgetexpsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.getmant.ss"] + fn vgetmantss(a: f32x4, b: f32x4, mantissas: i32, src: f32x4, m: u8, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.getmant.sd"] + fn vgetmantsd(a: f64x2, b: f64x2, mantissas: i32, src: f64x2, m: u8, sae: i32) -> f64x2; #[link_name = "llvm.x86.avx512.rsqrt14.ss"] fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; @@ -30326,6 +31060,138 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_getexp_ss(a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_getexp_ss(a, 0, a, b); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + let r = _mm_mask_getexp_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_getexp_ss(0, a, b); + let e = _mm_set_ps(2., 2., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getexp_ss(0b11111111, a, b); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_getexp_sd(a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_getexp_sd(a, 0, a, b); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + let r = _mm_mask_getexp_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_getexp_sd(0, a, b); + let e = _mm_set_pd(2., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getexp_sd(0b11111111, a, b); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_getmant_ss(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_mask_getmant_ss(a, 0, a, b, 
_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_ps(20., 20., 20., 20.); + assert_eq_m128(r, e); + let r = _mm_mask_getmant_ss(a, 0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_maskz_getmant_ss(0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_ps(20., 20., 20., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getmant_ss(0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_getmant_sd(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_mask_getmant_sd(a, 0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_pd(20., 20.); + assert_eq_m128d(r, e); + let r = _mm_mask_getmant_sd(a, 0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_maskz_getmant_sd(0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_pd(20., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getmant_sd(0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); @@ -30857,4 +31723,208 @@ mod tests { let e = _mm_set_pd(1., 2.); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_getexp_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_getexp_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2., 2., 2., 2.); + assert_eq_m128(r, e); + let r = _mm_mask_getexp_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_round_ss() { + let a = _mm_set1_ps(2.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_getexp_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2., 2., 2., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getexp_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2., 2., 2., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_getexp_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_getexp_round_sd(a, 0, a, b, 
_MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2., 2.); + assert_eq_m128d(r, e); + let r = _mm_mask_getexp_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getexp_round_sd() { + let a = _mm_set1_pd(2.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_getexp_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getexp_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2., 1.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_getmant_round_ss( + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_mask_getmant_round_ss( + a, + 0, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_ps(20., 20., 20., 20.); + assert_eq_m128(r, e); + let r = _mm_mask_getmant_round_ss( + a, + 0b11111111, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_round_ss() { + let a = _mm_set1_ps(20.); + let b = _mm_set1_ps(10.); + let r = _mm_maskz_getmant_round_ss( + 0, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_ps(20., 20., 20., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_getmant_round_ss( + 0b11111111, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_ps(20., 20., 20., 1.25); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_getmant_round_sd( + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_mask_getmant_round_sd( + a, + 0, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_pd(20., 20.); + assert_eq_m128d(r, e); + let r = _mm_mask_getmant_round_sd( + a, + 0b11111111, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_getmant_round_sd() { + let a = _mm_set1_pd(20.); + let b = _mm_set1_pd(10.); + let r = _mm_maskz_getmant_round_sd( + 0, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_pd(20., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_getmant_round_sd( + 0b11111111, + a, + b, + _MM_MANT_NORM_1_2, + _MM_MANT_SIGN_SRC, + _MM_FROUND_CUR_DIRECTION, + ); + let e = _mm_set_pd(20., 1.25); + assert_eq_m128d(r, e); + } } From 16d6dd60abb7f542b5f5d00a6541b47ee2268d10 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 20:24:22 +0000 Subject: [PATCH 
22/25] roundscale: ss,sd; roundscale_round: ss,sd; scalef: ss,sd; scalef_round: ss,sd; --- crates/core_arch/avx512f.md | 48 +- crates/core_arch/src/x86/avx512f.rs | 1107 +++++++++++++++++++++++++-- 2 files changed, 1064 insertions(+), 91 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 5b2aaa1c21..052395b275 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1300,16 +1300,16 @@ * [x] [`_mm_mask_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ss&expand=5236) * [x] [`_mm_mask_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_sd&expand=5236) * [x] [`_mm_mask_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ss&expand=5236) - * [ ] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236) - * [ ] [`_mm_mask_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_ss&expand=5236) - * [ ] [`_mm_mask_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sd&expand=5236) - * [ ] [`_mm_mask_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ss&expand=5236) + * [x] [`_mm_mask_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sd&expand=5236) + * [x] [`_mm_mask_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_ss&expand=5236) + * [x] [`_mm_mask_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sd&expand=5236) + * [x] [`_mm_mask_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ss&expand=5236) * [x] [`_mm_mask_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_sd&expand=5236) * [x] [`_mm_mask_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ss&expand=5236) - * [ ] [`_mm_mask_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sd&expand=5236) - * [ ] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236) - * [ ] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236) - * [ ] [`_mm_mask_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ss&expand=5236) + * [x] [`_mm_mask_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sd&expand=5236) + * [x] [`_mm_mask_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_ss&expand=5236) + * [x] [`_mm_mask_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sd&expand=5236) + * [x] [`_mm_mask_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ss&expand=5236) * [x] [`_mm_mask_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sd&expand=5236) * [x] 
[`_mm_mask_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_ss&expand=5236) * [x] [`_mm_mask_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sd&expand=5236) @@ -1378,16 +1378,16 @@ * [x] [`_mm_maskz_mul_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ss&expand=5236) * [x] [`_mm_maskz_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_sd&expand=5236) * [x] [`_mm_maskz_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ss&expand=5236) - * [ ] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236) - * [ ] [`_mm_maskz_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_ss&expand=5236) - * [ ] [`_mm_maskz_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sd&expand=5236) - * [ ] [`_mm_maskz_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ss&expand=5236) + * [x] [`_mm_maskz_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sd&expand=5236) + * [x] [`_mm_maskz_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_ss&expand=5236) + * [x] [`_mm_maskz_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sd&expand=5236) + * [x] [`_mm_maskz_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ss&expand=5236) * [x] [`_mm_maskz_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_sd&expand=5236) * [x] [`_mm_maskz_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ss&expand=5236) - * [ ] [`_mm_maskz_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sd&expand=5236) - * [ ] [`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236) - * [ ] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236) - * [ ] [`_mm_maskz_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ss&expand=5236) + * [x] [`_mm_maskz_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sd&expand=5236) + * [x] [`_mm_maskz_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_ss&expand=5236) + * [x] [`_mm_maskz_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sd&expand=5236) + * [x] [`_mm_maskz_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ss&expand=5236) * [x] [`_mm_maskz_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sd&expand=5236) * [x] [`_mm_maskz_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_ss&expand=5236) * [x] 
[`_mm_maskz_sqrt_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sd&expand=5236) @@ -1404,16 +1404,16 @@ * [x] [`_mm_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_ss&expand=5236) * [x] [`_mm_rcp14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_sd&expand=5236) * [x] [`_mm_rcp14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ss&expand=5236) - * [ ] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236) - * [ ] [`_mm_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_ss&expand=5236) - * [ ] [`_mm_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sd&expand=5236) - * [ ] [`_mm_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ss&expand=5236) + * [x] [`_mm_roundscale_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sd&expand=5236) + * [x] [`_mm_roundscale_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_ss&expand=5236) + * [x] [`_mm_roundscale_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sd&expand=5236) + * [x] [`_mm_roundscale_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ss&expand=5236) * [x] [`_mm_rsqrt14_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_sd&expand=5236) * [x] [`_mm_rsqrt14_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt14_ss&expand=5236) - * [ ] [`_mm_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sd&expand=5236) - * [ ] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236) - * [ ] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236) - * [ ] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236) + * [x] [`_mm_scalef_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sd&expand=5236) + * [x] [`_mm_scalef_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_ss&expand=5236) + * [x] [`_mm_scalef_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sd&expand=5236) + * [x] [`_mm_scalef_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ss&expand=5236) * [x] [`_mm_sqrt_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sd&expand=5236) * [x] [`_mm_sqrt_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_ss&expand=5236) * [x] [`_mm_sub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 24b6245e17..a7e298c550 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -19385,6 +19385,294 @@ pub unsafe fn _mm_maskz_getmant_sd( transmute(r) } +/// Round the lower single-precision (32-bit) floating-point element in b 
to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_ss&expand=4802) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 255))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_roundscale_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { + macro_rules! call { + ($imm8:expr) => { + vrndscaless( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b11111111, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_ss&expand=4800) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_roundscale_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + imm8: i32, +) -> __m128 { + macro_rules! call { + ($imm8:expr) => { + vrndscaless( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_ss&expand=4801) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_roundscale_ss(k: __mmask8, a: __m128, b: __m128, imm8: i32) -> __m128 { + macro_rules! 
call { + ($imm8:expr) => { + vrndscaless( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_sd&expand=4799) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 255))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_roundscale_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { + macro_rules! call { + ($imm8:expr) => { + vrndscalesd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b11111111, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_sd&expand=4797) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_roundscale_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + imm8: i32, +) -> __m128d { + macro_rules! call { + ($imm8:expr) => { + vrndscalesd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
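A brief usage sketch of the scalar roundscale intrinsics added above (not part of the patch; the helper name is made up and an AVX-512F CPU is assumed). Per the VRNDSCALE encoding, imm8[2:0] selects one of the rounding modes listed in the doc comments and imm8[7:4] the number of fraction bits M to keep, so the low lane of b is rounded to a multiple of 2^-M:

#[target_feature(enable = "avx512f")]
unsafe fn roundscale_sd_sketch() {
    // Hypothetical example; the upper lane of each result is copied from a.
    use core::arch::x86_64::*;
    let a = _mm_set1_pd(10.);
    let b = _mm_set1_pd(2.7);
    // M = 0, round to nearest integer: 2.7 -> 3.0.
    assert_eq!(_mm_cvtsd_f64(_mm_roundscale_sd(a, b, 0)), 3.0);
    // M = 1 (imm8 = 0x10), round to the nearest multiple of 0.5: 2.7 -> 2.5.
    assert_eq!(_mm_cvtsd_f64(_mm_roundscale_sd(a, b, 0x10)), 2.5);
    // M = 0 with _MM_FROUND_TO_ZERO (imm8 = 3), truncate: 2.7 -> 2.0.
    assert_eq!(_mm_cvtsd_f64(_mm_roundscale_sd(a, b, 3)), 2.0);
}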
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_sd&expand=4798) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_roundscale_sd(k: __mmask8, a: __m128d, b: __m128d, imm8: i32) -> __m128d { + macro_rules! call { + ($imm8:expr) => { + vrndscalesd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm8, + _MM_FROUND_CUR_DIRECTION, + ) + }; + } + let r = constify_imm8_sae!(imm8, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_ss&expand=4901) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_ss&expand=4899) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + src.as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_ss&expand=4900) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss))] +pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + transmute(vscalefss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
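A short sketch of the scalar scalef intrinsics added above (not part of the patch; the helper name is made up and an AVX-512F CPU is assumed). The low lane becomes a[0] * 2^floor(b[0]) while the upper lanes are taken from a:

#[target_feature(enable = "avx512f")]
unsafe fn scalef_ss_sketch() {
    // Hypothetical example.
    use core::arch::x86_64::*;
    let a = _mm_set_ps(7., 7., 7., 3.);
    let b = _mm_set1_ps(2.5);
    // Low lane: 3.0 * 2^floor(2.5) = 3.0 * 4.0 = 12.0; upper lanes stay 7.0.
    assert_eq!(_mm_cvtss_f32(_mm_scalef_ss(a, b)), 12.0);
    // Zero-masked variant: with mask bit 0 clear the low lane is zeroed instead.
    assert_eq!(_mm_cvtss_f32(_mm_maskz_scalef_ss(0, a, b)), 0.0);
}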
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_sd&expand=4898) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b11111111, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_sd&expand=4896) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + src.as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_sd&expand=4897) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefsd))] +pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + transmute(vscalefsd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + _MM_FROUND_CUR_DIRECTION, + )) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -20835,114 +21123,504 @@ pub unsafe fn _mm_maskz_getmant_round_ss( /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. /// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889) +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(2, 3, 4)] +pub unsafe fn _mm_getmant_round_sd( + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + _mm_setzero_pd().as_f64x2(), + 0b1, + $imm4_2, + ) + }; + } + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(4, 5, 6)] +pub unsafe fn _mm_mask_getmant_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + src.as_f64x2(), + k, + $imm4_2, + ) + }; + } + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) +} + +/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. +/// The mantissa is normalized to the interval specified by interv, which can take the following values: +/// _MM_MANT_NORM_1_2 // interval [1, 2) +/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) +/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) +/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) +/// The sign is determined by sc which can take the following values: +/// _MM_MANT_SIGN_src // sign = sign(src) +/// _MM_MANT_SIGN_zero // sign = 0 +/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] +#[rustc_args_required_const(3, 4, 5)] +pub unsafe fn _mm_maskz_getmant_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + norm: _MM_MANTISSA_NORM_ENUM, + sign: _MM_MANTISSA_SIGN_ENUM, + sae: i32, +) -> __m128d { + macro_rules! call { + ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { + vgetmantsd( + a.as_f64x2(), + b.as_f64x2(), + $imm2 << 2 | $imm4_1, + _mm_setzero_pd().as_f64x2(), + k, + $imm4_2, + ) + }; + } + let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + transmute(r) +} + +/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_ss&expand=4796)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(2, 3)]
+pub unsafe fn _mm_roundscale_round_ss(a: __m128, b: __m128, imm8: i32, sae: i32) -> __m128 {
+    macro_rules! call {
+        ($imm8:expr, $imm4:expr) => {
+            vrndscaless(
+                a.as_f32x4(),
+                b.as_f32x4(),
+                _mm_setzero_ps().as_f32x4(),
+                0b11111111,
+                $imm8,
+                $imm4,
+            )
+        };
+    }
+    let r = constify_imm8_roundscale!(imm8, sae, call);
+    transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_ss&expand=4794)
+#[inline]
+#[target_feature(enable = "avx512f")]
+#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))]
+#[rustc_args_required_const(4, 5)]
+pub unsafe fn _mm_mask_roundscale_round_ss(
+    src: __m128,
+    k: __mmask8,
+    a: __m128,
+    b: __m128,
+    imm8: i32,
+    sae: i32,
+) -> __m128 {
+    macro_rules! call {
+        ($imm8:expr, $imm4:expr) => {
+            vrndscaless(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm8, $imm4)
+        };
+    }
+    let r = constify_imm8_roundscale!(imm8, sae, call);
+    transmute(r)
+}
+
+/// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
+/// Rounding is done according to the imm8[2:0] parameter, which can be one of:
+/// _MM_FROUND_TO_NEAREST_INT // round to nearest
+/// _MM_FROUND_TO_NEG_INF // round down
+/// _MM_FROUND_TO_POS_INF // round up
+/// _MM_FROUND_TO_ZERO // truncate
+/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
+///
+/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
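A short sketch of the *_round_* roundscale form added here (not part of the patch; the helper name is made up and an AVX-512F CPU is assumed). It takes the same imm8 plus an sae argument, where _MM_FROUND_NO_EXC suppresses floating-point exceptions and _MM_FROUND_CUR_DIRECTION leaves the MXCSR behaviour unchanged:

#[target_feature(enable = "avx512f")]
unsafe fn roundscale_round_ss_sketch() {
    // Hypothetical example.
    use core::arch::x86_64::*;
    let a = _mm_set1_ps(8.);
    let b = _mm_set1_ps(1.1);
    // Round the low lane of b to the nearest integer (imm8 = 0) with exceptions
    // suppressed; the upper lanes are copied from a.
    let r = _mm_roundscale_round_ss(a, b, 0, _MM_FROUND_NO_EXC);
    assert_eq!(_mm_cvtss_f32(r), 1.0);
}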
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_ss&expand=4795) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm_maskz_roundscale_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + imm8: i32, + sae: i32, +) -> __m128 { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscaless( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm8, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_sd&expand=4793) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(2, 3)] +pub unsafe fn _mm_roundscale_round_sd(a: __m128d, b: __m128d, imm8: i32, sae: i32) -> __m128d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscalesd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + 0b11111111, + $imm8, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_sd&expand=4791) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(4, 5)] +pub unsafe fn _mm_mask_roundscale_round_sd( + src: __m128d, + k: __mmask8, + a: __m128d, + b: __m128d, + imm8: i32, + sae: i32, +) -> __m128d { + macro_rules! 
call { + ($imm8:expr, $imm4:expr) => { + vrndscalesd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm8, $imm4) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// Rounding is done according to the imm8[2:0] parameter, which can be one of: +/// _MM_FROUND_TO_NEAREST_INT // round to nearest +/// _MM_FROUND_TO_NEG_INF // round down +/// _MM_FROUND_TO_POS_INF // round up +/// _MM_FROUND_TO_ZERO // truncate +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_sd&expand=4792) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))] +#[rustc_args_required_const(3, 4)] +pub unsafe fn _mm_maskz_roundscale_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + imm8: i32, + sae: i32, +) -> __m128d { + macro_rules! call { + ($imm8:expr, $imm4:expr) => { + vrndscalesd( + a.as_f64x2(), + b.as_f64x2(), + _mm_setzero_pd().as_f64x2(), + k, + $imm8, + $imm4, + ) + }; + } + let r = constify_imm8_roundscale!(imm8, sae, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_ss&expand=4895) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vscalefss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + 0b11111111, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_ss&expand=4893) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_scalef_round_ss( + src: __m128, + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vscalefss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_ss&expand=4894) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vscalefss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_scalef_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + rounding: i32, +) -> __m128 { + macro_rules! call { + ($imm4:expr) => { + vscalefss( + a.as_f32x4(), + b.as_f32x4(), + _mm_setzero_ps().as_f32x4(), + k, + $imm4, + ) + }; + } + let r = constify_imm4_round!(rounding, call); + transmute(r) +} + +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
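For completeness, a sketch of the explicit-rounding scalef form (not part of the patch; the helper name is made up and an AVX-512F CPU is assumed). The product below is exact, so the rounding argument only selects how an inexact low-lane result would be rounded:

#[target_feature(enable = "avx512f")]
unsafe fn scalef_round_ss_sketch() {
    // Hypothetical example.
    use core::arch::x86_64::*;
    let a = _mm_set1_ps(1.5);
    let b = _mm_set1_ps(3.);
    // 1.5 * 2^3 = 12.0 in the low lane, rounded to nearest with exceptions suppressed.
    let r = _mm_scalef_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    assert_eq!(_mm_cvtss_f32(r), 12.0);
    // Write-masked variant: with mask bit 0 clear the low lane keeps src (99.0).
    let src = _mm_set1_ps(99.);
    let r = _mm_mask_scalef_round_ss(src, 0, a, b, _MM_FROUND_CUR_DIRECTION);
    assert_eq!(_mm_cvtss_f32(r), 99.0);
}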
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_sd&expand=4892) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] -#[rustc_args_required_const(2, 3, 4)] -pub unsafe fn _mm_getmant_round_sd( - a: __m128d, - b: __m128d, - norm: _MM_MANTISSA_NORM_ENUM, - sign: _MM_MANTISSA_SIGN_ENUM, - sae: i32, -) -> __m128d { +#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] +#[rustc_args_required_const(2)] +pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { macro_rules! call { - ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { - vgetmantsd( + ($imm4:expr) => { + vscalefsd( a.as_f64x2(), b.as_f64x2(), - $imm2 << 2 | $imm4_1, _mm_setzero_pd().as_f64x2(), - 0b1, - $imm4_2, + 0b11111111, + $imm4, ) }; } - let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + let r = constify_imm4_round!(rounding, call); transmute(r) } -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// The sign is determined by sc which can take the following values: -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_sd&expand=4890) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] -#[rustc_args_required_const(4, 5, 6)] -pub unsafe fn _mm_mask_getmant_round_sd( +#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_scalef_round_sd( src: __m128d, k: __mmask8, a: __m128d, b: __m128d, - norm: _MM_MANTISSA_NORM_ENUM, - sign: _MM_MANTISSA_SIGN_ENUM, - sae: i32, + rounding: i32, ) -> __m128d { macro_rules! call { - ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { - vgetmantsd( - a.as_f64x2(), - b.as_f64x2(), - $imm2 << 2 | $imm4_1, - src.as_f64x2(), - k, - $imm4_2, - ) + ($imm4:expr) => { + vscalefsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4) }; } - let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + let r = constify_imm4_round!(rounding, call); transmute(r) } -/// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. -/// The mantissa is normalized to the interval specified by interv, which can take the following values: -/// _MM_MANT_NORM_1_2 // interval [1, 2) -/// _MM_MANT_NORM_p5_2 // interval [0.5, 2) -/// _MM_MANT_NORM_p5_1 // interval [0.5, 1) -/// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5) -/// The sign is determined by sc which can take the following values: -/// _MM_MANT_SIGN_src // sign = sign(src) -/// _MM_MANT_SIGN_zero // sign = 0 -/// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1 -/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. +/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
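The scalef family computes a * 2^floor(b) on the low lane and copies the upper lane(s) from a. A minimal sketch of the unmasked double-precision variant (not part of the patch; demo name hypothetical, nightly toolchain with avx512f support assumed):

    use core::arch::x86_64::*;

    // Hypothetical demo, not part of the patch: vscalefsd computes a * 2^floor(b)
    // on the low lane and copies the upper lane from a.
    #[target_feature(enable = "avx512f")]
    unsafe fn scalef_round_sd_demo() {
        let a = _mm_set_pd(10.0, 1.5);
        let b = _mm_set_pd(99.0, 3.0); // only the low lane of b participates
        let r = _mm_scalef_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        assert_eq!(_mm_cvtsd_f64(r), 12.0); // 1.5 * 2^3; upper lane stays 10.0
    }
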
/// -/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891) +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_sd&expand=4891) #[inline] #[target_feature(enable = "avx512f")] -#[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))] -#[rustc_args_required_const(3, 4, 5)] -pub unsafe fn _mm_maskz_getmant_round_sd( +#[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_maskz_scalef_round_sd( k: __mmask8, a: __m128d, b: __m128d, - norm: _MM_MANTISSA_NORM_ENUM, - sign: _MM_MANTISSA_SIGN_ENUM, - sae: i32, + rounding: i32, ) -> __m128d { macro_rules! call { - ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => { - vgetmantsd( + ($imm4:expr) => { + vscalefsd( a.as_f64x2(), b.as_f64x2(), - $imm2 << 2 | $imm4_1, _mm_setzero_pd().as_f64x2(), k, - $imm4_2, + $imm4, ) }; } - let r = constify_imm4_mantissas_sae!(norm, sign, sae, call); + let r = constify_imm4_round!(rounding, call); transmute(r) } @@ -21594,6 +22272,15 @@ extern "C" { fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4; #[link_name = "llvm.x86.avx512.rcp14.sd"] fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2; + + #[link_name = "llvm.x86.avx512.mask.rndscale.ss"] + fn vrndscaless(a: f32x4, b: f32x4, src: f32x4, mask: u8, imm8: i32, sae: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.rndscale.sd"] + fn vrndscalesd(a: f64x2, b: f64x2, src: f64x2, mask: u8, imm8: i32, sae: i32) -> f64x2; + #[link_name = "llvm.x86.avx512.mask.scalef.ss"] + fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; + #[link_name = "llvm.x86.avx512.mask.scalef.sd"] + fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; } #[cfg(test)] @@ -31192,6 +31879,138 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_roundscale_ss(a, b, 0); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_mask_roundscale_ss(a, 0, a, b, 0); + let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2); + assert_eq_m128(r, e); + let r = _mm_mask_roundscale_ss(a, 0b11111111, a, b, 0); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_maskz_roundscale_ss(0, a, b, 0); + let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_roundscale_ss(0b11111111, a, b, 0); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_sd() { + let a = _mm_set1_pd(2.2); 
+ let b = _mm_set1_pd(1.1); + let r = _mm_roundscale_sd(a, b, 0); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_mask_roundscale_sd(a, 0, a, b, 0); + let e = _mm_set_pd(2.2, 2.2); + assert_eq_m128d(r, e); + let r = _mm_mask_roundscale_sd(a, 0b11111111, a, b, 0); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_maskz_roundscale_sd(0, a, b, 0); + let e = _mm_set_pd(2.2, 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_roundscale_sd(0b11111111, a, b, 0); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_scalef_ss(a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_scalef_ss(a, 0, a, b); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + let r = _mm_mask_scalef_ss(a, 0b11111111, a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_scalef_ss(0, a, b); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_scalef_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_scalef_sd(a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_scalef_sd(a, 0, a, b); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_scalef_sd(a, 0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_scalef_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_scalef_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); @@ -31927,4 +32746,158 @@ mod tests { let e = _mm_set_pd(20., 1.25); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_roundscale_round_ss(a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_mask_roundscale_round_ss(a, 0, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2); + assert_eq_m128(r, e); + let r = _mm_mask_roundscale_round_ss(a, 0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + 
assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_round_ss() { + let a = _mm_set1_ps(2.2); + let b = _mm_set1_ps(1.1); + let r = _mm_maskz_roundscale_round_ss(0, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0); + assert_eq_m128(r, e); + let r = _mm_maskz_roundscale_round_ss(0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_roundscale_round_sd(a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_mask_roundscale_round_sd(a, 0, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2.2, 2.2); + assert_eq_m128d(r, e); + let r = _mm_mask_roundscale_round_sd(a, 0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_roundscale_round_sd() { + let a = _mm_set1_pd(2.2); + let b = _mm_set1_pd(1.1); + let r = _mm_maskz_roundscale_round_sd(0, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2.2, 0.0); + assert_eq_m128d(r, e); + let r = _mm_maskz_roundscale_round_sd(0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION); + let e = _mm_set_pd(2.2, 1.0); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_scalef_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_mask_scalef_round_ss(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + let r = _mm_mask_scalef_round_ss( + a, + 0b11111111, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(3.); + let r = _mm_maskz_scalef_round_ss(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_scalef_round_ss( + 0b11111111, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 1., 1., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_scalef_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_mask_scalef_round_sd(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 1.); + assert_eq_m128d(r, e); + let r = _mm_mask_scalef_round_sd( + a, + 0b11111111, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 8.); + 
assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_scalef_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(3.); + let r = _mm_maskz_scalef_round_sd(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_scalef_round_sd( + 0b11111111, + a, + b, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 8.); + assert_eq_m128d(r, e); + } } From 8052b6430474af564f9ee40e25f58ca9384e8614 Mon Sep 17 00:00:00 2001 From: jironglin Date: Thu, 15 Oct 2020 23:58:17 +0000 Subject: [PATCH 23/25] mask_move: ss,sd --- crates/core_arch/avx512f.md | 8 +- crates/core_arch/src/x86/avx512f.rs | 112 ++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 4 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index 052395b275..ae452723b0 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1292,8 +1292,8 @@ * [x] [`_mm_mask_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_ss&expand=5236) * [x] [`_mm_mask_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sd&expand=5236) * [x] [`_mm_mask_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ss&expand=5236) - * [ ] [`_mm_mask_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sd&expand=5236) - * [ ] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236) + * [x] [`_mm_mask_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sd&expand=5236) + * [x] [`_mm_mask_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_ss&expand=5236) * [x] [`_mm_mask_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sd&expand=5236) * [x] [`_mm_mask_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_ss&expand=5236) * [x] [`_mm_mask_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sd&expand=5236) @@ -1370,8 +1370,8 @@ * [x] [`_mm_maskz_min_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_ss&expand=5236) * [x] [`_mm_maskz_min_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sd&expand=5236) * [x] [`_mm_maskz_min_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ss&expand=5236) - * [ ] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236) - * [ ] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236) + * [x] [`_mm_maskz_move_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sd&expand=5236) + * [x] [`_mm_maskz_move_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_ss&expand=5236) * [x] [`_mm_maskz_mul_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sd&expand=5236) * [x] [`_mm_maskz_mul_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_ss&expand=5236) * [x] 
[`_mm_maskz_mul_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index a7e298c550..270d8c5859 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -18417,6 +18417,68 @@ pub unsafe fn _mm512_set_pd( _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) } +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_ss&expand=3832) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovss))] +pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { + let extractsrc: f32 = simd_extract(src, 0); + let mut mov: f32 = extractsrc; + if (k & 0b00000001) != 0 { + mov = simd_extract(b, 0); + } + let r = simd_insert(a, 0, mov); + transmute(r) +} + +/// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_ss&expand=3833) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovss))] +pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { + let mut mov: f32 = 0.; + if (k & 0b00000001) != 0 { + mov = simd_extract(b, 0); + } + let r = simd_insert(a, 0, mov); + transmute(r) +} + +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_sd&expand=3829) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovsd))] +pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let extractsrc: f64 = simd_extract(src, 0); + let mut mov: f64 = extractsrc; + if (k & 0b00000001) != 0 { + mov = simd_extract(b, 0); + } + let r = simd_insert(a, 0, mov); + transmute(r) +} + +/// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
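The mask_move/maskz_move intrinsics added here only consult mask bit 0: it selects whether the low lane comes from b, from src (writemask) or from zero (zeromask), and the upper lane is always copied from a. A hedged sketch mirroring the new tests (not part of the patch; demo name hypothetical):

    use core::arch::x86_64::*;

    // Hypothetical demo, not part of the patch: only mask bit 0 matters.
    #[target_feature(enable = "avx512f")]
    unsafe fn mask_move_sd_demo() {
        let src = _mm_set_pd(10.0, 11.0);
        let a = _mm_set_pd(1.0, 2.0);
        let b = _mm_set_pd(3.0, 4.0);
        assert_eq!(_mm_cvtsd_f64(_mm_mask_move_sd(src, 0, a, b)), 11.0); // low lane from src
        assert_eq!(_mm_cvtsd_f64(_mm_mask_move_sd(src, 1, a, b)), 4.0);  // low lane from b
        assert_eq!(_mm_cvtsd_f64(_mm_maskz_move_sd(0, a, b)), 0.0);      // low lane zeroed
    }
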
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_sd&expand=3830) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vmovsd))] +pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { + let mut mov: f64 = 0.; + if (k & 0b00000001) != 0 { + mov = simd_extract(b, 0); + } + let r = simd_insert(a, 0, mov); + transmute(r) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. /// /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159) @@ -31265,6 +31327,56 @@ mod tests { assert_eq_m512i(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_move_ss() { + let src = _mm_set_ps(10., 11., 100., 110.); + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_mask_move_ss(src, 0, a, b); + let e = _mm_set_ps(1., 2., 10., 110.); + assert_eq_m128(r, e); + let r = _mm_mask_move_ss(src, 0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 40.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_move_ss() { + let a = _mm_set_ps(1., 2., 10., 20.); + let b = _mm_set_ps(3., 4., 30., 40.); + let r = _mm_maskz_move_ss(0, a, b); + let e = _mm_set_ps(1., 2., 10., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_move_ss(0b11111111, a, b); + let e = _mm_set_ps(1., 2., 10., 40.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_move_sd() { + let src = _mm_set_pd(10., 11.); + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_mask_move_sd(src, 0, a, b); + let e = _mm_set_pd(1., 11.); + assert_eq_m128d(r, e); + let r = _mm_mask_move_sd(src, 0b11111111, a, b); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_move_sd() { + let a = _mm_set_pd(1., 2.); + let b = _mm_set_pd(3., 4.); + let r = _mm_maskz_move_sd(0, a, b); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_move_sd(0b11111111, a, b); + let e = _mm_set_pd(1., 4.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_mask_add_ss() { let src = _mm_set_ps(10., 11., 100., 110.); From 577cbc4169fd5d136a05095f7efed0022d2a2d1e Mon Sep 17 00:00:00 2001 From: jironglin Date: Fri, 16 Oct 2020 17:13:37 +0000 Subject: [PATCH 24/25] mask_fmadd: ss,sd; fmadd_round: ss,sd; --- crates/core_arch/avx512f.md | 28 +- crates/core_arch/src/x86/avx512f.rs | 587 ++++++++++++++++++++++++++++ 2 files changed, 601 insertions(+), 14 deletions(-) diff --git a/crates/core_arch/avx512f.md b/crates/core_arch/avx512f.md index ae452723b0..75b8c725df 100644 --- a/crates/core_arch/avx512f.md +++ b/crates/core_arch/avx512f.md @@ -1206,8 +1206,8 @@ * [ ] [`_mm_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_round_ss&expand=5236) * [ ] [`_mm_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_sd&expand=5236) * [ ] [`_mm_fixupimm_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_ss&expand=5236) - * [ ] 
[`_mm_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sd&expand=5236) - * [ ] [`_mm_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_ss&expand=5236) + * [x] [`_mm_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sd&expand=5236) + * [x] [`_mm_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_ss&expand=5236) * [ ] [`_mm_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sd&expand=5236) * [ ] [`_mm_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_ss&expand=5236) * [ ] [`_mm_fnmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sd&expand=5236) @@ -1222,10 +1222,10 @@ * [x] [`_mm_getmant_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_ss&expand=5236) * [x] [`_mm_getmant_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sd&expand=5236) * [x] [`_mm_getmant_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ss&expand=5236) - * [ ] [`_mm_mask3_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sd&expand=5236) - * [ ] [`_mm_mask3_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_ss&expand=5236) - * [ ] [`_mm_mask3_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sd&expand=5236) - * [ ] [`_mm_mask3_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ss&expand=5236) + * [x] [`_mm_mask3_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sd&expand=5236) + * [x] [`_mm_mask3_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_ss&expand=5236) + * [x] [`_mm_mask3_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sd&expand=5236) + * [x] [`_mm_mask3_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ss&expand=5236) * [ ] [`_mm_mask3_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sd&expand=5236) * [ ] [`_mm_mask3_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_ss&expand=5236) * [ ] [`_mm_mask3_fmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sd&expand=5236) @@ -1258,10 +1258,10 @@ * [ ] [`_mm_mask_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_round_ss&expand=5236) * [ ] [`_mm_mask_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_sd&expand=5236) * [ ] [`_mm_mask_fixupimm_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_ss&expand=5236) - * [ ] [`_mm_mask_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sd&expand=5236) - * [ ] [`_mm_mask_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_ss&expand=5236) - * [ ] 
[`_mm_mask_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sd&expand=5236) - * [ ] [`_mm_mask_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ss&expand=5236) + * [x] [`_mm_mask_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sd&expand=5236) + * [x] [`_mm_mask_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_ss&expand=5236) + * [x] [`_mm_mask_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sd&expand=5236) + * [x] [`_mm_mask_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ss&expand=5236) * [ ] [`_mm_mask_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sd&expand=5236) * [ ] [`_mm_mask_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_ss&expand=5236) * [ ] [`_mm_mask_fmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sd&expand=5236) @@ -1336,10 +1336,10 @@ * [ ] [`_mm_maskz_fixupimm_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_round_ss&expand=5236) * [ ] [`_mm_maskz_fixupimm_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_sd&expand=5236) * [ ] [`_mm_maskz_fixupimm_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_ss&expand=5236) - * [ ] [`_mm_maskz_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sd&expand=5236) - * [ ] [`_mm_maskz_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_ss&expand=5236) - * [ ] [`_mm_maskz_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sd&expand=5236) - * [ ] [`_mm_maskz_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ss&expand=5236) + * [x] [`_mm_maskz_fmadd_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sd&expand=5236) + * [x] [`_mm_maskz_fmadd_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_ss&expand=5236) + * [x] [`_mm_maskz_fmadd_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sd&expand=5236) + * [x] [`_mm_maskz_fmadd_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ss&expand=5236) * [ ] [`_mm_maskz_fmsub_round_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sd&expand=5236) * [ ] [`_mm_maskz_fmsub_round_ss`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_ss&expand=5236) * [ ] [`_mm_maskz_fmsub_sd`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sd&expand=5236) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index 270d8c5859..d231c42b19 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -19735,6 +19735,110 @@ pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128 )) } +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate 
result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_ss&expand=2582) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { + let mut fmadd: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_ss&expand=2584) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_ss&expand=2583) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss))] +pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { + let mut fmadd: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
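For the masked scalar fused multiply-add variants, the low lane becomes a*b + c when mask bit 0 is set; otherwise it is taken from a (mask), zeroed (maskz), or taken from c (mask3). A hedged sketch mirroring the new tests (not part of the patch; demo name hypothetical):

    use core::arch::x86_64::*;

    // Hypothetical demo, not part of the patch.
    #[target_feature(enable = "avx512f")]
    unsafe fn mask_fmadd_sd_demo() {
        let a = _mm_set1_pd(1.0);
        let b = _mm_set1_pd(2.0);
        let c = _mm_set1_pd(3.0);
        assert_eq!(_mm_cvtsd_f64(_mm_mask_fmadd_sd(a, 1, b, c)), 5.0);  // 1.0 * 2.0 + 3.0
        assert_eq!(_mm_cvtsd_f64(_mm_mask_fmadd_sd(a, 0, b, c)), 1.0);  // passthrough from a
        assert_eq!(_mm_cvtsd_f64(_mm_maskz_fmadd_sd(0, a, b, c)), 0.0); // zeroed
        assert_eq!(_mm_cvtsd_f64(_mm_mask3_fmadd_sd(a, b, c, 0)), 3.0); // passthrough from c
    }
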
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_sd&expand=2578) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { + let mut fmadd: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_sd&expand=2580) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { + let mut fmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_sd&expand=2579) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd))] +pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { + let mut fmadd: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. /// /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: @@ -21686,6 +21790,280 @@ pub unsafe fn _mm_maskz_scalef_round_sd( transmute(r) } +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_ss&expand=2573) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132ss(extracta, extractb, extractc, $imm4) + }; + } + let fmadd = constify_imm4_round!(rounding, call); + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_ss&expand=2574) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_fmadd_round_ss( + a: __m128, + k: __mmask8, + b: __m128, + c: __m128, + rounding: i32, +) -> __m128 { + let mut fmadd: f32 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132ss(fmadd, extractb, extractc, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_ss&expand=2576) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_maskz_fmadd_round_ss( + k: __mmask8, + a: __m128, + b: __m128, + c: __m128, + rounding: i32, +) -> __m128 { + let mut fmadd: f32 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + let extractc: f32 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132ss(extracta, extractb, extractc, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_ss&expand=2575) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask3_fmadd_round_ss( + a: __m128, + b: __m128, + c: __m128, + k: __mmask8, + rounding: i32, +) -> __m128 { + let mut fmadd: f32 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f32 = simd_extract(a, 0); + let extractb: f32 = simd_extract(b, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132ss(extracta, extractb, fmadd, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_sd&expand=2569) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] +#[rustc_args_required_const(3)] +pub unsafe fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132sd(extracta, extractb, extractc, $imm4) + }; + } + let fmadd = constify_imm4_round!(rounding, call); + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_sd&expand=2570) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask_fmadd_round_sd( + a: __m128d, + k: __mmask8, + b: __m128d, + c: __m128d, + rounding: i32, +) -> __m128d { + let mut fmadd: f64 = simd_extract(a, 0); + if (k & 0b00000001) != 0 { + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132sd(fmadd, extractb, extractc, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
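The *_round variants take an extra rounding argument that must be a compile-time constant, either one of the _MM_FROUND_TO_* modes combined with _MM_FROUND_NO_EXC or _MM_FROUND_CUR_DIRECTION to use MXCSR.RC. A hedged sketch (not part of the patch; demo name hypothetical):

    use core::arch::x86_64::*;

    // Hypothetical demo, not part of the patch.
    #[target_feature(enable = "avx512f")]
    unsafe fn fmadd_round_sd_demo() {
        let a = _mm_set1_pd(1.0);
        let b = _mm_set1_pd(2.0);
        let c = _mm_set1_pd(3.0);
        let r = _mm_fmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        assert_eq!(_mm_cvtsd_f64(r), 5.0); // low lane: 1.0 * 2.0 + 3.0
    }
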
+/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_sd&expand=2572) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_maskz_fmadd_round_sd( + k: __mmask8, + a: __m128d, + b: __m128d, + c: __m128d, + rounding: i32, +) -> __m128d { + let mut fmadd: f64 = 0.; + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + let extractc: f64 = simd_extract(c, 0); + macro_rules! call { + ($imm4:expr) => { + vfmadd132sd(extracta, extractb, extractc, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(a, 0, fmadd); + transmute(r) +} + +/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. +/// +/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of: +/// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions +/// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions +/// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions +/// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions +/// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_Sd&expand=2571) +#[inline] +#[target_feature(enable = "avx512f")] +#[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))] +#[rustc_args_required_const(4)] +pub unsafe fn _mm_mask3_fmadd_round_sd( + a: __m128d, + b: __m128d, + c: __m128d, + k: __mmask8, + rounding: i32, +) -> __m128d { + let mut fmadd: f64 = simd_extract(c, 0); + if (k & 0b00000001) != 0 { + let extracta: f64 = simd_extract(a, 0); + let extractb: f64 = simd_extract(b, 0); + macro_rules! 
call { + ($imm4:expr) => { + vfmadd132sd(extracta, extractb, fmadd, $imm4) + }; + } + fmadd = constify_imm4_round!(rounding, call); + } + let r = simd_insert(c, 0, fmadd); + transmute(r) +} + /// Equal pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00; /// Less-than @@ -22343,6 +22721,11 @@ extern "C" { fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4; #[link_name = "llvm.x86.avx512.mask.scalef.sd"] fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2; + + #[link_name = "llvm.x86.avx512.vfmadd.f32"] + fn vfmadd132ss(a: f32, b: f32, c: f32, rounding: i32) -> f32; + #[link_name = "llvm.x86.avx512.vfmadd.f64"] + fn vfmadd132sd(a: f64, b: f64, c: f64, rounding: i32) -> f64; } #[cfg(test)] @@ -32123,6 +32506,80 @@ mod tests { assert_eq_m128d(r, e); } + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask_fmadd_ss(a, 0, b, c); + assert_eq_m128(r, a); + let r = _mm_mask_fmadd_ss(a, 0b11111111, b, c); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmadd_ss(0, a, b, c); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmadd_ss(0b11111111, a, b, c); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmadd_ss(a, b, c, 0); + assert_eq_m128(r, c); + let r = _mm_mask3_fmadd_ss(a, b, c, 0b11111111); + let e = _mm_set_ps(3., 3., 3., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmadd_sd(a, 0, b, c); + assert_eq_m128d(r, a); + let r = _mm_mask_fmadd_sd(a, 0b11111111, b, c); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmadd_sd(0, a, b, c); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmadd_sd(0b11111111, a, b, c); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmadd_sd(a, b, c, 0); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmadd_sd(a, b, c, 0b11111111); + let e = _mm_set_pd(3., 5.); + assert_eq_m128d(r, e); + } + #[simd_test(enable = "avx512f")] unsafe fn test_mm_add_round_ss() { let a = _mm_set_ps(1., 2., 10., 20.); @@ -33012,4 +33469,134 @@ mod tests { let e = _mm_set_pd(1., 8.); assert_eq_m128d(r, e); } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_fmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = 
_mm_set1_ps(3.); + let r = _mm_mask_fmadd_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m128(r, a); + let r = _mm_mask_fmadd_round_ss( + a, + 0b11111111, + b, + c, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_maskz_fmadd_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_ps(1., 1., 1., 0.); + assert_eq_m128(r, e); + let r = _mm_maskz_fmadd_round_ss( + 0b11111111, + a, + b, + c, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(1., 1., 1., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_round_ss() { + let a = _mm_set1_ps(1.); + let b = _mm_set1_ps(2.); + let c = _mm_set1_ps(3.); + let r = _mm_mask3_fmadd_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m128(r, c); + let r = _mm_mask3_fmadd_round_ss( + a, + b, + c, + 0b11111111, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_ps(3., 3., 3., 5.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_fmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask_fmadd_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m128d(r, a); + let r = _mm_mask_fmadd_round_sd( + a, + 0b11111111, + b, + c, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_maskz_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_maskz_fmadd_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + let e = _mm_set_pd(1., 0.); + assert_eq_m128d(r, e); + let r = _mm_maskz_fmadd_round_sd( + 0b11111111, + a, + b, + c, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(1., 5.); + assert_eq_m128d(r, e); + } + + #[simd_test(enable = "avx512f")] + unsafe fn test_mm_mask3_fmadd_round_sd() { + let a = _mm_set1_pd(1.); + let b = _mm_set1_pd(2.); + let c = _mm_set1_pd(3.); + let r = _mm_mask3_fmadd_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + assert_eq_m128d(r, c); + let r = _mm_mask3_fmadd_round_sd( + a, + b, + c, + 0b11111111, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, + ); + let e = _mm_set_pd(3., 5.); + assert_eq_m128d(r, e); + } } From 17659b26a6ea8debfec02536b42d05f4440d52e4 Mon Sep 17 00:00:00 2001 From: jironglin Date: Fri, 16 Oct 2020 17:42:48 +0000 Subject: [PATCH 25/25] fix duplicated comment, remove assert int2mask --- crates/core_arch/src/x86/avx512f.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index d231c42b19..7cea13c48c 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -2632,8 +2632,6 @@ pub unsafe fn _mm512_maskz_ternarylogic_epi64( 
transmute(simd_select_bitmask(k, ternarylogic, zero)) } -/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. - /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign. /// The mantissa is normalized to the interval specified by interv, which can take the following values: /// _MM_MANT_NORM_1_2 // interval [1, 2) @@ -15910,7 +15908,6 @@ pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 { #[inline] #[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 { - assert!(mask >= 0); let r: u16 = mask as u16; transmute(r) }
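
With the assert!(mask >= 0) removed, _mm512_int2mask is a plain truncation of the i32 to its low 16 mask bits, so negative inputs no longer panic. A hedged sketch of the resulting behaviour (not part of the patch; demo name hypothetical):

    use core::arch::x86_64::*;

    // Hypothetical demo, not part of the patch: -1 truncates to all 16 mask bits set.
    #[target_feature(enable = "avx512f")]
    unsafe fn int2mask_demo() {
        let m: __mmask16 = _mm512_int2mask(-1);
        assert_eq!(m, 0xFFFF);
    }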