diff --git a/.travis.yml b/.travis.yml index c7a7bbc469..c91e4f34de 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,9 +21,13 @@ matrix: script: | cargo install rustfmt-nightly cargo fmt --all -- --write-mode=diff + - env: CLIPPY=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1 + script: | + cargo install clippy + cargo clippy --all -- -D clippy-pedantic allow_failures: - env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1 - + - env: CLIPPY=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1 install: - if [ "$NO_ADD" == "" ]; then rustup target add $TARGET; fi diff --git a/examples/nbody.rs b/examples/nbody.rs index a9baa74ff7..0e5ea69437 100644 --- a/examples/nbody.rs +++ b/examples/nbody.rs @@ -1,12 +1,20 @@ +//! n-body benchmark from the [benchmarks game][bg]. +//! +//! [bg]: https://benchmarksgame.alioth.debian.org/u64q/nbody-description. +//! html#nbody + #![cfg_attr(feature = "strict", deny(warnings))] #![feature(cfg_target_feature)] #![feature(target_feature)] +#![cfg_attr(feature = "cargo-clippy", + allow(similar_names, missing_docs_in_private_items, + shadow_reuse, print_stdout))] extern crate stdsimd; use self::stdsimd::simd; use simd::f64x2; -const PI: f64 = 3.141592653589793; +const PI: f64 = std::f64::consts::PI; const SOLAR_MASS: f64 = 4.0 * PI * PI; const DAYS_PER_YEAR: f64 = 365.24; @@ -29,7 +37,7 @@ impl Frsqrt for f64x2 { f32x4::new(t.extract(0), t.extract(1), 0., 0.), ).as_f64x4() }; - f64x2::new(u.extract(0), u.extract(1)) + Self::new(u.extract(0), u.extract(1)) } #[cfg(all(any(target_arch = "arm", target_arch = "aarch64"), target_feature = "neon"))] @@ -61,8 +69,8 @@ struct Body { impl Body { fn new( x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64 - ) -> Body { - Body { + ) -> Self { + Self { x: [x0, x1, x2], _fill: 0.0, v: [v0, v1, v2], @@ -103,8 +111,8 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) { i = 0; while i < N { - for m in 0..3 { - dx[m] = f64x2::new(r[i][m], r[i + 1][m]); + for (m, dx) in dx.iter_mut().enumerate() { + *dx = f64x2::new(r[i][m], r[i + 1][m]); } dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; @@ -144,11 +152,10 @@ fn energy(bodies: &[Body; N_BODIES]) -> f64 { e += bi.mass * (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2]) / 2.0; - for j in i + 1..N_BODIES { - let bj = &bodies[j]; + for bj in bodies.iter().take(N_BODIES).skip(i + 1) { let mut dx = [0.0; 3]; - for k in 0..3 { - dx[k] = bi.x[k] - bj.x[k]; + for (k, dx) in dx.iter_mut().enumerate() { + *dx = bi.x[k] - bj.x[k]; } let mut distance = 0.0; for &d in &dx { @@ -210,7 +217,7 @@ fn main() { .nth(1) .expect("need one arg") .parse() - .unwrap(); + .expect("argument should be a usize"); offset_momentum(&mut bodies); println!("{:.9}", energy(&bodies)); diff --git a/examples/play.rs b/examples/play.rs index 26ce5dd812..3957ae2116 100644 --- a/examples/play.rs +++ b/examples/play.rs @@ -1,5 +1,10 @@ #![cfg_attr(feature = "strict", deny(warnings))] #![feature(target_feature)] +#![cfg_attr(feature = "cargo-clippy", + allow(similar_names, missing_docs_in_private_items, + cast_sign_loss, cast_possible_truncation, + cast_possible_wrap, option_unwrap_used, use_debug, + print_stdout))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod example { diff --git a/examples/types.rs b/examples/types.rs index dc2a74e4fa..8bc0bc4e17 100644 --- a/examples/types.rs +++ b/examples/types.rs @@ -1,5 +1,8 @@ #![cfg_attr(feature = "strict", deny(warnings))] #![feature(target_feature)] +#![cfg_attr(feature = "cargo-clippy", + allow(missing_docs_in_private_items, result_unwrap_used, + option_unwrap_used, print_stdout, use_debug))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod example { diff --git a/examples/wat.rs b/examples/wat.rs index 076be129a1..5a70eed85f 100644 --- a/examples/wat.rs +++ b/examples/wat.rs @@ -1,5 +1,8 @@ #![cfg_attr(feature = "strict", deny(warnings))] #![feature(target_feature)] +#![cfg_attr(feature = "cargo-clippy", + allow(missing_docs_in_private_items, result_unwrap_used, + option_unwrap_used, print_stdout, use_debug))] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod example { diff --git a/src/lib.rs b/src/lib.rs index 5c1333dc2d..7d6cc0916a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,7 +26,7 @@ //! others at: //! //! * [i686](https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/) -//! * [x86_64](https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/) +//! * [`x86_64`](https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/) //! * [arm](https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/) //! * [aarch64](https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/) //! @@ -122,6 +122,12 @@ simd_ffi, target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new, stmt_expr_attributes)] #![cfg_attr(test, feature(proc_macro, test))] +#![cfg_attr(feature = "cargo-clippy", + allow(inline_always, too_many_arguments, cast_sign_loss, + cast_lossless, cast_possible_wrap, + cast_possible_truncation, cast_precision_loss, + shadow_reuse, cyclomatic_complexity, similar_names, + doc_markdown, many_single_char_names))] #[cfg(test)] extern crate stdsimd_test; diff --git a/src/macros.rs b/src/macros.rs index ab287a8000..3d4dbd798b 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -1,3 +1,5 @@ +//! Utility macros + macro_rules! define_ty { ($name:ident, $($elty:ident),+) => { #[repr(simd)] diff --git a/src/simd_llvm.rs b/src/simd_llvm.rs index ec59fa7877..66a1cc8768 100644 --- a/src/simd_llvm.rs +++ b/src/simd_llvm.rs @@ -1,3 +1,7 @@ +//! LLVM's simd platform intrinsics +//! +//! TODO: should use `link_llvm_intrinsic` instead: issue #112 + extern "platform-intrinsic" { pub fn simd_eq(x: T, y: T) -> U; pub fn simd_ne(x: T, y: T) -> U; diff --git a/src/v128.rs b/src/v128.rs index f5c425bc74..9677cf9ab9 100644 --- a/src/v128.rs +++ b/src/v128.rs @@ -1,3 +1,5 @@ +//! 128-bit wide vector types + use simd_llvm::*; define_ty! { f64x2, f64, f64 } diff --git a/src/v256.rs b/src/v256.rs index 33d2584f71..2687e18325 100644 --- a/src/v256.rs +++ b/src/v256.rs @@ -1,3 +1,5 @@ +//! 256-bit wide vector types + use simd_llvm::*; define_ty! { f64x4, f64, f64, f64, f64 } diff --git a/src/v512.rs b/src/v512.rs index 4973a7001e..e763ef8ffa 100644 --- a/src/v512.rs +++ b/src/v512.rs @@ -1,3 +1,5 @@ +//! 512-bit wide vector types + use simd_llvm::*; define_ty! { f64x8, f64, f64, f64, f64, f64, f64, f64, f64 } diff --git a/src/v64.rs b/src/v64.rs index fe7f59c2fd..9802fcf764 100644 --- a/src/v64.rs +++ b/src/v64.rs @@ -1,3 +1,5 @@ +//! 64-bit wide vector types + use simd_llvm::*; define_ty_doc! { diff --git a/src/x86/avx.rs b/src/x86/avx.rs index dcf055179e..905e01449d 100644 --- a/src/x86/avx.rs +++ b/src/x86/avx.rs @@ -1,3 +1,18 @@ +//! Advanced Vector Extensions (AVX) +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture +//! Programmer's Manual, Volume 3: General-Purpose and System +//! Instructions][amd64_ref]. +//! +//! [Wikipedia][wiki] provides a quick overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions + use std::mem; use std::ptr; @@ -113,7 +128,7 @@ pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 { } } } - match (imm8 >> 0) & 0x1 { + match imm8 & 0x1 { 0 => shuffle1!(0), _ => shuffle1!(1), } @@ -161,7 +176,7 @@ pub unsafe fn _mm256_shuffle_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 { } } } - match (imm8 >> 0) & 0x3 { + match imm8 & 0x3 { 0 => shuffle1!(0, 4), 1 => shuffle1!(1, 5), 2 => shuffle1!(2, 6), @@ -594,69 +609,69 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 { mem::transmute(a ^ b) } -// Equal (ordered, non-signaling) +/// Equal (ordered, non-signaling) pub const _CMP_EQ_OQ: u8 = 0x00; -// Less-than (ordered, signaling) +/// Less-than (ordered, signaling) pub const _CMP_LT_OS: u8 = 0x01; -// Less-than-or-equal (ordered, signaling) +/// Less-than-or-equal (ordered, signaling) pub const _CMP_LE_OS: u8 = 0x02; -// Unordered (non-signaling) +/// Unordered (non-signaling) pub const _CMP_UNORD_Q: u8 = 0x03; -// Not-equal (unordered, non-signaling) +/// Not-equal (unordered, non-signaling) pub const _CMP_NEQ_UQ: u8 = 0x04; -// Not-less-than (unordered, signaling) +/// Not-less-than (unordered, signaling) pub const _CMP_NLT_US: u8 = 0x05; -// Not-less-than-or-equal (unordered, signaling) +/// Not-less-than-or-equal (unordered, signaling) pub const _CMP_NLE_US: u8 = 0x06; -// Ordered (non-signaling) +/// Ordered (non-signaling) pub const _CMP_ORD_Q: u8 = 0x07; -// Equal (unordered, non-signaling) +/// Equal (unordered, non-signaling) pub const _CMP_EQ_UQ: u8 = 0x08; -// Not-greater-than-or-equal (unordered, signaling) +/// Not-greater-than-or-equal (unordered, signaling) pub const _CMP_NGE_US: u8 = 0x09; -// Not-greater-than (unordered, signaling) +/// Not-greater-than (unordered, signaling) pub const _CMP_NGT_US: u8 = 0x0a; -// False (ordered, non-signaling) +/// False (ordered, non-signaling) pub const _CMP_FALSE_OQ: u8 = 0x0b; -// Not-equal (ordered, non-signaling) +/// Not-equal (ordered, non-signaling) pub const _CMP_NEQ_OQ: u8 = 0x0c; -// Greater-than-or-equal (ordered, signaling) +/// Greater-than-or-equal (ordered, signaling) pub const _CMP_GE_OS: u8 = 0x0d; -// Greater-than (ordered, signaling) +/// Greater-than (ordered, signaling) pub const _CMP_GT_OS: u8 = 0x0e; -// True (unordered, non-signaling) +/// True (unordered, non-signaling) pub const _CMP_TRUE_UQ: u8 = 0x0f; -// Equal (ordered, signaling) +/// Equal (ordered, signaling) pub const _CMP_EQ_OS: u8 = 0x10; -// Less-than (ordered, non-signaling) +/// Less-than (ordered, non-signaling) pub const _CMP_LT_OQ: u8 = 0x11; -// Less-than-or-equal (ordered, non-signaling) +/// Less-than-or-equal (ordered, non-signaling) pub const _CMP_LE_OQ: u8 = 0x12; -// Unordered (signaling) +/// Unordered (signaling) pub const _CMP_UNORD_S: u8 = 0x13; -// Not-equal (unordered, signaling) +/// Not-equal (unordered, signaling) pub const _CMP_NEQ_US: u8 = 0x14; -// Not-less-than (unordered, non-signaling) +/// Not-less-than (unordered, non-signaling) pub const _CMP_NLT_UQ: u8 = 0x15; -// Not-less-than-or-equal (unordered, non-signaling) +/// Not-less-than-or-equal (unordered, non-signaling) pub const _CMP_NLE_UQ: u8 = 0x16; -// Ordered (signaling) +/// Ordered (signaling) pub const _CMP_ORD_S: u8 = 0x17; -// Equal (unordered, signaling) +/// Equal (unordered, signaling) pub const _CMP_EQ_US: u8 = 0x18; -// Not-greater-than-or-equal (unordered, non-signaling) +/// Not-greater-than-or-equal (unordered, non-signaling) pub const _CMP_NGE_UQ: u8 = 0x19; -// Not-greater-than (unordered, non-signaling) +/// Not-greater-than (unordered, non-signaling) pub const _CMP_NGT_UQ: u8 = 0x1a; -// False (ordered, signaling) +/// False (ordered, signaling) pub const _CMP_FALSE_OS: u8 = 0x1b; -// Not-equal (ordered, signaling) +/// Not-equal (ordered, signaling) pub const _CMP_NEQ_OS: u8 = 0x1c; -// Greater-than-or-equal (ordered, non-signaling) +/// Greater-than-or-equal (ordered, non-signaling) pub const _CMP_GE_OQ: u8 = 0x1d; -// Greater-than (ordered, non-signaling) +/// Greater-than (ordered, non-signaling) pub const _CMP_GT_OQ: u8 = 0x1e; -// True (unordered, signaling) +/// True (unordered, signaling) pub const _CMP_TRUE_US: u8 = 0x1f; /// Compare packed double-precision (64-bit) floating-point @@ -920,13 +935,10 @@ pub unsafe fn _mm_permutevar_ps(a: f32x4, b: i32x4) -> f32x4 { #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 { let imm8 = (imm8 & 0xFF) as u8; - const fn add4(x: u32) -> u32 { - x + 4 - } macro_rules! shuffle4 { ($a:expr, $b:expr, $c:expr, $d:expr) => { simd_shuffle8(a, _mm256_undefined_ps(), [ - $a, $b, $c, $d, add4($a), add4($b), add4($c), add4($d) + $a, $b, $c, $d, $a + 4, $b + 4, $c + 4, $d + 4 ]) } } @@ -960,7 +972,7 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 { } } } - match (imm8 >> 0) & 0b11 { + match imm8 & 0b11 { 0b00 => shuffle1!(0), 0b01 => shuffle1!(1), 0b10 => shuffle1!(2), @@ -1014,7 +1026,7 @@ pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 { } } } - match (imm8 >> 0) & 0b11 { + match imm8 & 0b11 { 0b00 => shuffle1!(0), 0b01 => shuffle1!(1), 0b10 => shuffle1!(2), @@ -1022,6 +1034,8 @@ pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 { } } +/// Shuffle double-precision (64-bit) floating-point elements in `a` +/// within 256-bit lanes using the control in `b`. #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vpermilpd))] @@ -1074,7 +1088,7 @@ pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 { } } } - match (imm8 >> 0) & 0x1 { + match imm8 & 0x1 { 0 => shuffle1!(0), _ => shuffle1!(1), } @@ -1102,7 +1116,7 @@ pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 { } } } - match (imm8 >> 0) & 0x1 { + match imm8 & 0x1 { 0 => shuffle1!(0), _ => shuffle1!(1), } @@ -2750,8 +2764,7 @@ mod tests { let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.); let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.); let r = avx::_mm256_dp_ps(a, b, 0xFF); - let e = - f32x8::new(200., 200., 200., 200., 2387., 2387., 2387., 2387.); + let e = f32x8::new(200., 200., 200., 200., 2387., 2387., 2387., 2387.); assert_eq!(r, e); } diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index d8671b3f57..58b04eeb45 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -1,6 +1,27 @@ +//! Advanced Vector Extensions 2 (AVX) +//! +//! AVX2 expands most AVX commands to 256-bit wide vector registers and +//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate). +//! +//! The references are: +//! +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: +//! Instruction Set Reference, A-Z][intel64_ref]. +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and +//! System Instructions][amd64_ref]. +//! +//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick +//! overview of the instructions available. +//! +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf +//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions +//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate + use simd_llvm::simd_cast; use simd_llvm::{simd_shuffle2, simd_shuffle4, simd_shuffle8}; use simd_llvm::{simd_shuffle16, simd_shuffle32}; + use v256::*; use v128::*; use x86::__m256i; @@ -116,28 +137,25 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 { (a, b, n) }; - const fn add(a: u32, b: u32) -> u32 { - a + b - } macro_rules! shuffle { ($shift:expr) => { simd_shuffle32(b, a, [ - add(0, $shift), add(1, $shift), - add(2, $shift), add(3, $shift), - add(4, $shift), add(5, $shift), - add(6, $shift), add(7, $shift), - add(8, $shift), add(9, $shift), - add(10, $shift), add(11, $shift), - add(12, $shift), add(13, $shift), - add(14, $shift), add(15, $shift), - add(16, $shift), add(17, $shift), - add(18, $shift), add(19, $shift), - add(20, $shift), add(21, $shift), - add(22, $shift), add(23, $shift), - add(24, $shift), add(25, $shift), - add(26, $shift), add(27, $shift), - add(28, $shift), add(29, $shift), - add(30, $shift), add(31, $shift), + 0 + $shift, 1 + $shift, + 2 + $shift, 3 + $shift, + 4 + $shift, 5 + $shift, + 6 + $shift, 7 + $shift, + 8 + $shift, 9 + $shift, + 10 + $shift, 11 + $shift, + 12 + $shift, 13 + $shift, + 14 + $shift, 15 + $shift, + 16 + $shift, 17 + $shift, + 18 + $shift, 19 + $shift, + 20 + $shift, 21 + $shift, + 22 + $shift, 23 + $shift, + 24 + $shift, 25 + $shift, + 26 + $shift, 27 + $shift, + 28 + $shift, 29 + $shift, + 30 + $shift, 31 + $shift, ]) } } @@ -340,7 +358,7 @@ pub unsafe fn _mm256_blendv_epi8(a: i8x32, b: i8x32, mask: __m256i) -> i8x32 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpbroadcastb))] pub unsafe fn _mm_broadcastb_epi8(a: i8x16) -> i8x16 { - simd_shuffle16(a, i8x16::splat(0i8), [0u32; 16]) + simd_shuffle16(a, i8x16::splat(0_i8), [0_u32; 16]) } /// Broadcast the low packed 8-bit integer from `a` to all elements of @@ -349,7 +367,7 @@ pub unsafe fn _mm_broadcastb_epi8(a: i8x16) -> i8x16 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpbroadcastb))] pub unsafe fn _mm256_broadcastb_epi8(a: i8x16) -> i8x32 { - simd_shuffle32(a, i8x16::splat(0i8), [0u32; 32]) + simd_shuffle32(a, i8x16::splat(0_i8), [0_u32; 32]) } // NB: simd_shuffle4 with integer data types for `a` and `b` is @@ -360,7 +378,7 @@ pub unsafe fn _mm256_broadcastb_epi8(a: i8x16) -> i8x32 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcastd_epi32(a: i32x4) -> i32x4 { - simd_shuffle4(a, i32x4::splat(0i32), [0u32; 4]) + simd_shuffle4(a, i32x4::splat(0_i32), [0_u32; 4]) } // NB: simd_shuffle4 with integer data types for `a` and `b` is @@ -371,7 +389,7 @@ pub unsafe fn _mm_broadcastd_epi32(a: i32x4) -> i32x4 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcastd_epi32(a: i32x4) -> i32x8 { - simd_shuffle8(a, i32x4::splat(0i32), [0u32; 8]) + simd_shuffle8(a, i32x4::splat(0_i32), [0_u32; 8]) } /// Broadcast the low packed 64-bit integer from `a` to all elements of @@ -380,7 +398,7 @@ pub unsafe fn _mm256_broadcastd_epi32(a: i32x4) -> i32x8 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpbroadcastq))] pub unsafe fn _mm_broadcastq_epi64(a: i64x2) -> i64x2 { - simd_shuffle2(a, i64x2::splat(0i64), [0u32; 2]) + simd_shuffle2(a, i64x2::splat(0_i64), [0_u32; 2]) } // NB: simd_shuffle4 with integer data types for `a` and `b` is @@ -391,7 +409,7 @@ pub unsafe fn _mm_broadcastq_epi64(a: i64x2) -> i64x2 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcastq_epi64(a: i64x2) -> i64x4 { - simd_shuffle4(a, i64x2::splat(0i64), [0u32; 4]) + simd_shuffle4(a, i64x2::splat(0_i64), [0_u32; 4]) } /// Broadcast the low double-precision (64-bit) floating-point element @@ -400,7 +418,7 @@ pub unsafe fn _mm256_broadcastq_epi64(a: i64x2) -> i64x4 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vmovddup))] pub unsafe fn _mm_broadcastsd_pd(a: f64x2) -> f64x2 { - simd_shuffle2(a, f64x2::splat(0f64), [0u32; 2]) + simd_shuffle2(a, f64x2::splat(0_f64), [0_u32; 2]) } /// Broadcast the low double-precision (64-bit) floating-point element @@ -409,7 +427,7 @@ pub unsafe fn _mm_broadcastsd_pd(a: f64x2) -> f64x2 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcastsd_pd(a: f64x2) -> f64x4 { - simd_shuffle4(a, f64x2::splat(0f64), [0u32; 4]) + simd_shuffle4(a, f64x2::splat(0_f64), [0_u32; 4]) } // NB: broadcastsi128_si256 is often compiled to vinsertf128 or @@ -419,7 +437,7 @@ pub unsafe fn _mm256_broadcastsd_pd(a: f64x2) -> f64x4 { #[inline(always)] #[target_feature = "+avx2"] pub unsafe fn _mm256_broadcastsi128_si256(a: i64x2) -> i64x4 { - simd_shuffle4(a, i64x2::splat(0i64), [0, 1, 0, 1]) + simd_shuffle4(a, i64x2::splat(0_i64), [0, 1, 0, 1]) } /// Broadcast the low single-precision (32-bit) floating-point element @@ -428,7 +446,7 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: i64x2) -> i64x4 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcastss_ps(a: f32x4) -> f32x4 { - simd_shuffle4(a, f32x4::splat(0f32), [0u32; 4]) + simd_shuffle4(a, f32x4::splat(0_f32), [0_u32; 4]) } /// Broadcast the low single-precision (32-bit) floating-point element @@ -437,7 +455,7 @@ pub unsafe fn _mm_broadcastss_ps(a: f32x4) -> f32x4 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcastss_ps(a: f32x4) -> f32x8 { - simd_shuffle8(a, f32x4::splat(0f32), [0u32; 8]) + simd_shuffle8(a, f32x4::splat(0_f32), [0_u32; 8]) } /// Broadcast the low packed 16-bit integer from a to all elements of @@ -446,7 +464,7 @@ pub unsafe fn _mm256_broadcastss_ps(a: f32x4) -> f32x8 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpbroadcastw))] pub unsafe fn _mm_broadcastw_epi16(a: i16x8) -> i16x8 { - simd_shuffle8(a, i16x8::splat(0i16), [0u32; 8]) + simd_shuffle8(a, i16x8::splat(0_i16), [0_u32; 8]) } /// Broadcast the low packed 16-bit integer from a to all elements of @@ -455,7 +473,7 @@ pub unsafe fn _mm_broadcastw_epi16(a: i16x8) -> i16x8 { #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpbroadcastw))] pub unsafe fn _mm256_broadcastw_epi16(a: i16x8) -> i16x16 { - simd_shuffle16(a, i16x8::splat(0i16), [0u32; 16]) + simd_shuffle16(a, i16x8::splat(0_i16), [0_u32; 16]) } // TODO _mm256_bslli_epi128 @@ -565,8 +583,8 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: i8x16) -> i32x8 { simd_cast::<::v64::i8x8, _>(simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])) } -// An i8x4 type is pretty useless, but we need it as an intermediate type in -// _mm256_cvtepi8_epi64. +/// An i8x4 type is pretty useless, but we need it as an intermediate type in +/// _mm256_cvtepi8_epi64. #[repr(simd)] #[allow(non_camel_case_types)] struct i8x4(i8, i8, i8, i8); diff --git a/src/x86/bmi.rs b/src/x86/bmi.rs index 2cc86d8009..8809f63854 100644 --- a/src/x86/bmi.rs +++ b/src/x86/bmi.rs @@ -21,7 +21,7 @@ use stdsimd_test::assert_instr; #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(bextr))] pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { - _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32)) + _bextr2_u32(a, (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32)) } /// Extracts bits in range [`start`, `start` + `length`) from `a` into @@ -31,7 +31,7 @@ pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { - _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64)) + _bextr2_u64(a, (start & 0xff_u64) | ((len & 0xff_u64) << 8_u64)) } /// Extracts bits of `a` specified by `control` into @@ -97,7 +97,7 @@ pub unsafe fn _blsi_u64(x: u64) -> u64 { #[target_feature = "+bmi"] #[cfg_attr(test, assert_instr(blsmsk))] pub unsafe fn _blsmsk_u32(x: u32) -> u32 { - x ^ (x.wrapping_sub(1u32)) + x ^ (x.wrapping_sub(1_u32)) } /// Get mask up to lowest set bit. @@ -106,7 +106,7 @@ pub unsafe fn _blsmsk_u32(x: u32) -> u32 { #[cfg_attr(test, assert_instr(blsmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions pub unsafe fn _blsmsk_u64(x: u64) -> u64 { - x ^ (x.wrapping_sub(1u64)) + x ^ (x.wrapping_sub(1_u64)) } /// Resets the lowest set bit of `x`. diff --git a/src/x86/macros.rs b/src/x86/macros.rs index e835fe2107..f268a3499b 100644 --- a/src/x86/macros.rs +++ b/src/x86/macros.rs @@ -1,3 +1,5 @@ +//! Utility macros. + macro_rules! constify_imm8 { ($imm8:expr, $expand:ident) => { #[allow(overflowing_literals)] diff --git a/src/x86/mod.rs b/src/x86/mod.rs index a046c453c7..ba84f9d890 100644 --- a/src/x86/mod.rs +++ b/src/x86/mod.rs @@ -1,3 +1,5 @@ +//! `x86` and `x86_64` intrinsics. + pub use self::sse::*; pub use self::sse2::*; pub use self::sse3::*; @@ -14,8 +16,10 @@ pub use self::tbm::*; pub use self::runtime::{__unstable_detect_feature, __Feature}; +/// 128-bit wide signed integer vector type #[allow(non_camel_case_types)] pub type __m128i = ::v128::i8x16; +/// 256-bit wide signed integer vector type #[allow(non_camel_case_types)] pub type __m256i = ::v256::i8x32; diff --git a/src/x86/runtime.rs b/src/x86/runtime.rs index ef78d0a317..4b7e3aa56e 100644 --- a/src/x86/runtime.rs +++ b/src/x86/runtime.rs @@ -131,21 +131,18 @@ pub enum __Feature { #[doc(hidden)] __NonExhaustive, } +/// Sets the `bit`-th bit of `x`. fn set_bit(x: usize, bit: u32) -> usize { debug_assert!(32 > bit); x | 1 << bit } +/// Tests the `bit`-th bit of `x`. fn test_bit(x: usize, bit: u32) -> bool { debug_assert!(32 > bit); x & (1 << bit) != 0 } -fn inv_test_bit(v: usize, idx: u32) -> bool { - debug_assert!(32 > idx); - ((v >> idx) & 1) != 0 -} - /// Run-time feature detection on x86 works by using the CPUID instruction. /// /// The [CPUID Wikipedia page][wiki_cpuid] contains @@ -174,7 +171,7 @@ fn detect_features() -> usize { /// below). asm!("cpuid" : "={ecx}"(proc_info_ecx), "={edx}"(proc_info_edx) - : "{eax}"(0x00000001u32), "{ecx}"(0 as u32) + : "{eax}"(0x0000_0001_u32), "{ecx}"(0 as u32) : :); /// 2. EAX=7, ECX=0: Queries "Extended Features" @@ -182,48 +179,48 @@ fn detect_features() -> usize { /// (see below); the result in ECX is not currently needed. asm!("cpuid" : "={ebx}"(extended_features_ebx) - : "{eax}"(0x00000007u32), "{ecx}"(0 as u32) + : "{eax}"(0x0000_0007_u32), "{ecx}"(0 as u32) : :); } let mut value: usize = 0; - if inv_test_bit(extended_features_ebx, 3) { + if test_bit(extended_features_ebx, 3) { value = set_bit(value, __Feature::bmi as u32); } - if inv_test_bit(extended_features_ebx, 8) { + if test_bit(extended_features_ebx, 8) { value = set_bit(value, __Feature::bmi2 as u32); } - if inv_test_bit(proc_info_ecx, 0) { + if test_bit(proc_info_ecx, 0) { value = set_bit(value, __Feature::sse3 as u32); } - if inv_test_bit(proc_info_ecx, 5) { + if test_bit(proc_info_ecx, 5) { value = set_bit(value, __Feature::abm as u32); } - if inv_test_bit(proc_info_ecx, 9) { + if test_bit(proc_info_ecx, 9) { value = set_bit(value, __Feature::ssse3 as u32); } - if inv_test_bit(proc_info_ecx, 12) { + if test_bit(proc_info_ecx, 12) { value = set_bit(value, __Feature::fma as u32); } - if inv_test_bit(proc_info_ecx, 19) { + if test_bit(proc_info_ecx, 19) { value = set_bit(value, __Feature::sse4_1 as u32); } - if inv_test_bit(proc_info_ecx, 20) { + if test_bit(proc_info_ecx, 20) { value = set_bit(value, __Feature::sse4_2 as u32); } - if inv_test_bit(proc_info_ecx, 21) { + if test_bit(proc_info_ecx, 21) { value = set_bit(value, __Feature::tbm as u32); } - if inv_test_bit(proc_info_ecx, 23) { + if test_bit(proc_info_ecx, 23) { value = set_bit(value, __Feature::popcnt as u32); } - if inv_test_bit(proc_info_edx, 25) { + if test_bit(proc_info_edx, 25) { value = set_bit(value, __Feature::sse as u32); } - if inv_test_bit(proc_info_edx, 26) { + if test_bit(proc_info_edx, 26) { value = set_bit(value, __Feature::sse2 as u32); } @@ -235,7 +232,9 @@ fn detect_features() -> usize { // - https://hg.mozilla. // org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190 // - if inv_test_bit(proc_info_ecx, 26) && inv_test_bit(proc_info_ecx, 27) { + if test_bit(proc_info_ecx, 26) && test_bit(proc_info_ecx, 27) { + /// XGETBV: reads the contents of the extended control + /// register (XCR). unsafe fn xgetbv(xcr_no: u32) -> u64 { let eax: u32; let edx: u32; @@ -249,10 +248,10 @@ fn detect_features() -> usize { // This is safe because on x86 `xgetbv` is always available. if unsafe { xgetbv(0) } & 6 == 6 { - if inv_test_bit(proc_info_ecx, 28) { + if test_bit(proc_info_ecx, 28) { value = set_bit(value, __Feature::avx as u32); } - if inv_test_bit(extended_features_ebx, 5) { + if test_bit(extended_features_ebx, 5) { value = set_bit(value, __Feature::avx2 as u32); } } diff --git a/src/x86/sse.rs b/src/x86/sse.rs index 5633d39c0e..49e10efdd1 100644 --- a/src/x86/sse.rs +++ b/src/x86/sse.rs @@ -1,3 +1,5 @@ +//! Streaming SIMD Extensions (SSE) + use simd_llvm::simd_shuffle4; use v128::*; use v64::f32x2; @@ -1136,7 +1138,7 @@ pub unsafe fn _mm_storeh_pi(p: *mut u64, a: f32x4) { // is just silly let a64: u64x2 = mem::transmute(a); let a_hi = a64.extract(1); - *p = mem::transmute(a_hi); + *p = a_hi; } else { // target_arch = "x86_64" // If this is a `u64x2` LLVM generates a pshufd + movq, but we really @@ -1167,7 +1169,7 @@ pub unsafe fn _mm_storel_pi(p: *mut u64, a: f32x4) { // stack. let a64: u64x2 = mem::transmute(a); let a_hi = a64.extract(0); - *p = mem::transmute(a_hi); + *p = a_hi; } else { // target_arch = "x86_64" let a64: f64x2 = mem::transmute(a); @@ -1306,7 +1308,7 @@ pub unsafe fn _mm_sfence() { #[target_feature = "+sse"] #[cfg_attr(test, assert_instr(stmxcsr))] pub unsafe fn _mm_getcsr() -> u32 { - let mut result = 0i32; + let mut result = 0_i32; stmxcsr((&mut result) as *mut _ as *mut i8); result as u32 } @@ -1455,6 +1457,7 @@ pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008; pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010; /// See [`_mm_setcsr`](fn._mm_setcsr.html) pub const _MM_EXCEPT_INEXACT: u32 = 0x0020; +/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html) pub const _MM_EXCEPT_MASK: u32 = 0x003f; /// See [`_mm_setcsr`](fn._mm_setcsr.html) @@ -1469,6 +1472,7 @@ pub const _MM_MASK_OVERFLOW: u32 = 0x0400; pub const _MM_MASK_UNDERFLOW: u32 = 0x0800; /// See [`_mm_setcsr`](fn._mm_setcsr.html) pub const _MM_MASK_INEXACT: u32 = 0x1000; +/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html) pub const _MM_MASK_MASK: u32 = 0x1f80; /// See [`_mm_setcsr`](fn._mm_setcsr.html) @@ -1479,14 +1483,18 @@ pub const _MM_ROUND_DOWN: u32 = 0x2000; pub const _MM_ROUND_UP: u32 = 0x4000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000; + +/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html) pub const _MM_ROUND_MASK: u32 = 0x6000; +/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html) pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; +/// See [`_mm_setcsr`](fn._mm_setcsr.html) #[inline(always)] #[allow(non_snake_case)] #[target_feature = "+sse"] @@ -1494,6 +1502,7 @@ pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { _mm_getcsr() & _MM_MASK_MASK } +/// See [`_mm_setcsr`](fn._mm_setcsr.html) #[inline(always)] #[allow(non_snake_case)] #[target_feature = "+sse"] @@ -1501,6 +1510,7 @@ pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { _mm_getcsr() & _MM_EXCEPT_MASK } +/// See [`_mm_setcsr`](fn._mm_setcsr.html) #[inline(always)] #[allow(non_snake_case)] #[target_feature = "+sse"] @@ -1508,6 +1518,7 @@ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { _mm_getcsr() & _MM_FLUSH_ZERO_MASK } +/// See [`_mm_setcsr`](fn._mm_setcsr.html) #[inline(always)] #[allow(non_snake_case)] #[target_feature = "+sse"] @@ -1515,6 +1526,7 @@ pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { _mm_getcsr() & _MM_ROUND_MASK } +/// See [`_mm_setcsr`](fn._mm_setcsr.html) #[inline(always)] #[allow(non_snake_case)] #[target_feature = "+sse"] @@ -1522,6 +1534,7 @@ pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x) } +/// See [`_mm_setcsr`](fn._mm_setcsr.html) #[inline(always)] #[allow(non_snake_case)] #[target_feature = "+sse"] @@ -1529,6 +1542,7 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x) } +/// See [`_mm_setcsr`](fn._mm_setcsr.html) #[inline(always)] #[allow(non_snake_case)] #[target_feature = "+sse"] @@ -1538,6 +1552,7 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { _mm_setcsr(val) } +/// See [`_mm_setcsr`](fn._mm_setcsr.html) #[inline(always)] #[allow(non_snake_case)] #[target_feature = "+sse"] diff --git a/src/x86/sse2.rs b/src/x86/sse2.rs index c08fab33b6..264075ff43 100644 --- a/src/x86/sse2.rs +++ b/src/x86/sse2.rs @@ -1,3 +1,5 @@ +//! Streaming SIMD Extensions 2 (SSE2) + #[cfg(test)] use stdsimd_test::assert_instr; @@ -316,20 +318,17 @@ pub unsafe fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 { #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))] pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { let (zero, imm8) = (__m128i::splat(0), imm8 as u32); - const fn sub(a: u32, b: u32) -> u32 { - a - b - } macro_rules! shuffle { ($shift:expr) => { simd_shuffle16::<__m128i, __m128i>(zero, a, [ - sub(16, $shift), sub(17, $shift), - sub(18, $shift), sub(19, $shift), - sub(20, $shift), sub(21, $shift), - sub(22, $shift), sub(23, $shift), - sub(24, $shift), sub(25, $shift), - sub(26, $shift), sub(27, $shift), - sub(28, $shift), sub(29, $shift), - sub(30, $shift), sub(31, $shift), + 16 - $shift, 17 - $shift, + 18 - $shift, 19 - $shift, + 20 - $shift, 21 - $shift, + 22 - $shift, 23 - $shift, + 24 - $shift, 25 - $shift, + 26 - $shift, 27 - $shift, + 28 - $shift, 29 - $shift, + 30 - $shift, 31 - $shift, ]) } } @@ -463,20 +462,17 @@ pub unsafe fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 { #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))] pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { let (zero, imm8) = (__m128i::splat(0), imm8 as u32); - const fn add(a: u32, b: u32) -> u32 { - a + b - } macro_rules! shuffle { ($shift:expr) => { simd_shuffle16::<__m128i, __m128i>(a, zero, [ - add(0, $shift), add(1, $shift), - add(2, $shift), add(3, $shift), - add(4, $shift), add(5, $shift), - add(6, $shift), add(7, $shift), - add(8, $shift), add(9, $shift), - add(10, $shift), add(11, $shift), - add(12, $shift), add(13, $shift), - add(14, $shift), add(15, $shift), + 0 + $shift, 1 + $shift, + 2 + $shift, 3 + $shift, + 4 + $shift, 5 + $shift, + 6 + $shift, 7 + $shift, + 8 + $shift, 9 + $shift, + 10 + $shift, 11 + $shift, + 12 + $shift, 13 + $shift, + 14 + $shift, 15 + $shift, ]) } } @@ -1102,14 +1098,10 @@ pub unsafe fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 { pub unsafe fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 { // See _mm_shuffle_epi32. let imm8 = (imm8 & 0xFF) as u8; - const fn add4(x: u32) -> u32 { - x + 4 - } - macro_rules! shuffle_done { ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { simd_shuffle8(a, a, [ - 0, 1, 2, 3, add4($x01), add4($x23), add4($x45), add4($x67), + 0, 1, 2, 3, $x01 + 4, $x23 + 4, $x45 + 4, $x67 + 4, ]) } } @@ -1657,7 +1649,7 @@ pub unsafe fn _mm_cmpnge_pd(a: f64x2, b: f64x2) -> f64x2 { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comieq_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(comieqsd(a, b) as u8) + comieqsd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for less-than. @@ -1665,7 +1657,7 @@ pub unsafe fn _mm_comieq_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comilt_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(comiltsd(a, b) as u8) + comiltsd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for less-than-or-equal. @@ -1673,7 +1665,7 @@ pub unsafe fn _mm_comilt_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comile_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(comilesd(a, b) as u8) + comilesd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for greater-than. @@ -1681,7 +1673,7 @@ pub unsafe fn _mm_comile_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comigt_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(comigtsd(a, b) as u8) + comigtsd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for greater-than-or-equal. @@ -1689,7 +1681,7 @@ pub unsafe fn _mm_comigt_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comige_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(comigesd(a, b) as u8) + comigesd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for not-equal. @@ -1697,7 +1689,7 @@ pub unsafe fn _mm_comige_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comineq_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(comineqsd(a, b) as u8) + comineqsd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for equality. @@ -1705,7 +1697,7 @@ pub unsafe fn _mm_comineq_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomieq_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(ucomieqsd(a, b) as u8) + ucomieqsd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for less-than. @@ -1713,7 +1705,7 @@ pub unsafe fn _mm_ucomieq_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomilt_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(ucomiltsd(a, b) as u8) + ucomiltsd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for less-than-or-equal. @@ -1721,7 +1713,7 @@ pub unsafe fn _mm_ucomilt_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomile_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(ucomilesd(a, b) as u8) + ucomilesd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for greater-than. @@ -1729,7 +1721,7 @@ pub unsafe fn _mm_ucomile_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomigt_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(ucomigtsd(a, b) as u8) + ucomigtsd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for greater-than-or-equal. @@ -1737,7 +1729,7 @@ pub unsafe fn _mm_ucomigt_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomige_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(ucomigesd(a, b) as u8) + ucomigesd(a, b) as u8 != 0 } /// Compare the lower element of `a` and `b` for not-equal. @@ -1745,7 +1737,7 @@ pub unsafe fn _mm_ucomige_sd(a: f64x2, b: f64x2) -> bool { #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool { - mem::transmute(ucomineqsd(a, b) as u8) + ucomineqsd(a, b) as u8 != 0 } /// Convert packed double-precision (64-bit) floating-point elements in "a" to @@ -1894,7 +1886,7 @@ pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> f64x2 { f64x2::new(a, b) } -/// returns packed double-precision (64-bit) floating-point elements with all +/// Returns packed double-precision (64-bit) floating-point elements with all /// zeros. #[inline(always)] #[target_feature = "+sse2"] @@ -1913,8 +1905,6 @@ pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 { movmskpd(a) } - - /// Load 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from memory into the returned vector. /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection diff --git a/src/x86/sse3.rs b/src/x86/sse3.rs index cba3129d1b..29cd98194b 100644 --- a/src/x86/sse3.rs +++ b/src/x86/sse3.rs @@ -1,3 +1,5 @@ +//! Streaming SIMD Extensions 3 (SSE3) + use x86::__m128i; use simd_llvm::{simd_shuffle2, simd_shuffle4}; use v128::*; diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs index 90be986f8e..c0604b7a3e 100644 --- a/src/x86/sse41.rs +++ b/src/x86/sse41.rs @@ -1,3 +1,4 @@ +//! Streaming SIMD Extensions 4.1 (SSE4.1) use std::mem; @@ -22,18 +23,28 @@ pub const _MM_FROUND_RAISE_EXC: i32 = 0x00; /// suppress exceptions pub const _MM_FROUND_NO_EXC: i32 = 0x08; /// round to nearest and do not suppress exceptions -pub const _MM_FROUND_NINT: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT); +pub const _MM_FROUND_NINT: i32 = 0x00; /// round down and do not suppress exceptions -pub const _MM_FROUND_FLOOR: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); +pub const _MM_FROUND_FLOOR: i32 = + (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); /// round up and do not suppress exceptions -pub const _MM_FROUND_CEIL: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); +pub const _MM_FROUND_CEIL: i32 = + (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); /// truncate and do not suppress exceptions pub const _MM_FROUND_TRUNC: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO); -/// use MXCSR.RC and do not suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE` -pub const _MM_FROUND_RINT: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); +/// use MXCSR.RC and do not suppress exceptions; see +/// `vendor::_MM_SET_ROUNDING_MODE` +pub const _MM_FROUND_RINT: i32 = + (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); /// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE` -pub const _MM_FROUND_NEARBYINT: i32 = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); +pub const _MM_FROUND_NEARBYINT: i32 = + (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); +/// Blend packed 8-bit integers from `a` and `b` using `mask` +/// +/// The high bit of each corresponding mask byte determines the selection. +/// If the high bit is set the element of `a` is selected. The element +/// of `b` is selected otherwise. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pblendvb))] @@ -41,6 +52,11 @@ pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 { pblendvb(a, b, mask) } +/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`. +/// +/// The mask bits determine the selection. A clear bit selects the +/// corresponding element of `a`, and a set bit the corresponding +/// element of `b`. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))] @@ -209,8 +225,8 @@ pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 { pmaxuw(a, b) } -// Compare packed 32-bit integers in `a` and `b`, and return packed maximum -// values. +/// Compare packed 32-bit integers in `a` and `b`, and return packed maximum +/// values. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmaxsd, imm8 = 0))] @@ -218,8 +234,8 @@ pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 { pmaxsd(a, b) } -// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed -// maximum values. +/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed +/// maximum values. #[inline(always)] #[target_feature = "+sse4.1"] #[cfg_attr(test, assert_instr(pmaxud, imm8 = 0))] @@ -356,11 +372,17 @@ pub unsafe fn _mm_ceil_ss(a: f32x4, b: f32x4) -> f32x4 { /// /// ``` /// use stdsimd::vendor; -/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); // round to nearest, and suppress exceptions -/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); // round down, and suppress exceptions -/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); // round up, and suppress exceptions -/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); // truncate, and suppress exceptions -/// vendor::_MM_FROUND_CUR_DIRECTION; // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE` +/// +/// // round to nearest, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); +/// // round down, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); +/// // round up, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); +/// // truncate, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); +/// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: +/// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` #[inline(always)] #[target_feature = "+sse4.1"] @@ -379,11 +401,17 @@ pub unsafe fn _mm_round_pd(a: f64x2, rounding: i32) -> f64x2 { /// /// ``` /// use stdsimd::vendor; -/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); // round to nearest, and suppress exceptions -/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); // round down, and suppress exceptions -/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); // round up, and suppress exceptions -/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); // truncate, and suppress exceptions -/// vendor::_MM_FROUND_CUR_DIRECTION; // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE` +/// +/// // round to nearest, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); +/// // round down, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); +/// // round up, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); +/// // truncate, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); +/// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: +/// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` #[inline(always)] #[target_feature = "+sse4.1"] @@ -404,11 +432,17 @@ pub unsafe fn _mm_round_ps(a: f32x4, rounding: i32) -> f32x4 { /// /// ``` /// use stdsimd::vendor; -/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); // round to nearest, and suppress exceptions -/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); // round down, and suppress exceptions -/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); // round up, and suppress exceptions -/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); // truncate, and suppress exceptions -/// vendor::_MM_FROUND_CUR_DIRECTION; // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE` +/// +/// // round to nearest, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); +/// // round down, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); +/// // round up, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); +/// // truncate, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); +/// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: +/// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` #[inline(always)] #[target_feature = "+sse4.1"] @@ -429,11 +463,17 @@ pub unsafe fn _mm_round_sd(a: f64x2, b: f64x2, rounding: i32) -> f64x2 { /// /// ``` /// use stdsimd::vendor; -/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); // round to nearest, and suppress exceptions -/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); // round down, and suppress exceptions -/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); // round up, and suppress exceptions -/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); // truncate, and suppress exceptions -/// vendor::_MM_FROUND_CUR_DIRECTION; // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE` +/// +/// // round to nearest, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); +/// // round down, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC); +/// // round up, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC); +/// // truncate, and suppress exceptions: +/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC); +/// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: +/// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` #[inline(always)] #[target_feature = "+sse4.1"] diff --git a/src/x86/sse42.rs b/src/x86/sse42.rs index eb551684c9..0a1be2e947 100644 --- a/src/x86/sse42.rs +++ b/src/x86/sse42.rs @@ -1,3 +1,7 @@ +//! Streaming SIMD Extensions 4.2 (SSE4.2) +//! +//! Extends SSE4.1 with STTNI (String and Text New Instructions). + #[cfg(test)] use stdsimd_test::assert_instr; @@ -5,42 +9,42 @@ use v128::*; use x86::__m128i; /// String contains unsigned 8-bit characters *(Default)* -pub const _SIDD_UBYTE_OPS: i8 = 0b00000000; +pub const _SIDD_UBYTE_OPS: i8 = 0b0000_0000; /// String contains unsigned 16-bit characters -pub const _SIDD_UWORD_OPS: i8 = 0b00000001; +pub const _SIDD_UWORD_OPS: i8 = 0b0000_0001; /// String contains signed 8-bit characters -pub const _SIDD_SBYTE_OPS: i8 = 0b00000010; +pub const _SIDD_SBYTE_OPS: i8 = 0b0000_0010; /// String contains unsigned 16-bit characters -pub const _SIDD_SWORD_OPS: i8 = 0b00000011; +pub const _SIDD_SWORD_OPS: i8 = 0b0000_0011; /// For each character in `a`, find if it is in `b` *(Default)* -pub const _SIDD_CMP_EQUAL_ANY: i8 = 0b00000000; -/// For each character in `a`, determine if `b[0] <= c <= b[1] or b[1] <= c <= -/// b[2]...` -pub const _SIDD_CMP_RANGES: i8 = 0b00000100; +pub const _SIDD_CMP_EQUAL_ANY: i8 = 0b0000_0000; +/// For each character in `a`, determine if +/// `b[0] <= c <= b[1] or b[1] <= c <= b[2]...` +pub const _SIDD_CMP_RANGES: i8 = 0b0000_0100; /// The strings defined by `a` and `b` are equal -pub const _SIDD_CMP_EQUAL_EACH: i8 = 0b00001000; +pub const _SIDD_CMP_EQUAL_EACH: i8 = 0b0000_1000; /// Search for the defined substring in the target -pub const _SIDD_CMP_EQUAL_ORDERED: i8 = 0b00001100; +pub const _SIDD_CMP_EQUAL_ORDERED: i8 = 0b0000_1100; /// Do not negate results *(Default)* -pub const _SIDD_POSITIVE_POLARITY: i8 = 0b00000000; +pub const _SIDD_POSITIVE_POLARITY: i8 = 0b0000_0000; /// Negate results -pub const _SIDD_NEGATIVE_POLARITY: i8 = 0b00010000; +pub const _SIDD_NEGATIVE_POLARITY: i8 = 0b0001_0000; /// Do not negate results before the end of the string -pub const _SIDD_MASKED_POSITIVE_POLARITY: i8 = 0b00100000; +pub const _SIDD_MASKED_POSITIVE_POLARITY: i8 = 0b0010_0000; /// Negate results only before the end of the string -pub const _SIDD_MASKED_NEGATIVE_POLARITY: i8 = 0b00110000; +pub const _SIDD_MASKED_NEGATIVE_POLARITY: i8 = 0b0011_0000; /// **Index only**: return the least significant bit *(Default)* -pub const _SIDD_LEAST_SIGNIFICANT: i8 = 0b00000000; +pub const _SIDD_LEAST_SIGNIFICANT: i8 = 0b0000_0000; /// **Index only**: return the most significant bit -pub const _SIDD_MOST_SIGNIFICANT: i8 = 0b01000000; +pub const _SIDD_MOST_SIGNIFICANT: i8 = 0b0100_0000; /// **Mask only**: return the bit mask -pub const _SIDD_BIT_MASK: i8 = 0b00000000; +pub const _SIDD_BIT_MASK: i8 = 0b0000_0000; /// **Mask only**: return the byte mask -pub const _SIDD_UNIT_MASK: i8 = 0b01000000; +pub const _SIDD_UNIT_MASK: i8 = 0b0100_0000; /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return the generated mask. diff --git a/src/x86/ssse3.rs b/src/x86/ssse3.rs index b5c9d3ae9e..a5b6ab852a 100644 --- a/src/x86/ssse3.rs +++ b/src/x86/ssse3.rs @@ -1,3 +1,5 @@ +//! Supplemental Streaming SIMD Extensions 3 (SSSE3) + #[cfg(test)] use stdsimd_test::assert_instr; @@ -84,20 +86,17 @@ pub unsafe fn _mm_alignr_epi8(a: i8x16, b: i8x16, n: i32) -> i8x16 { (a, b, n) }; - const fn add(a: u32, b: u32) -> u32 { - a + b - } macro_rules! shuffle { ($shift:expr) => { simd_shuffle16(b, a, [ - add(0, $shift), add(1, $shift), - add(2, $shift), add(3, $shift), - add(4, $shift), add(5, $shift), - add(6, $shift), add(7, $shift), - add(8, $shift), add(9, $shift), - add(10, $shift), add(11, $shift), - add(12, $shift), add(13, $shift), - add(14, $shift), add(15, $shift), + 0 + $shift, 1 + $shift, + 2 + $shift, 3 + $shift, + 4 + $shift, 5 + $shift, + 6 + $shift, 7 + $shift, + 8 + $shift, 9 + $shift, + 10 + $shift, 11 + $shift, + 12 + $shift, 13 + $shift, + 14 + $shift, 15 + $shift, ]) } } diff --git a/tests/cpu-detection.rs b/tests/cpu-detection.rs index 36cf7c97aa..294fd8ca7c 100644 --- a/tests/cpu-detection.rs +++ b/tests/cpu-detection.rs @@ -1,7 +1,9 @@ -#![cfg_attr(feature = "strict", deny(warnings))] #![feature(cfg_target_feature)] +#![cfg_attr(feature = "strict", deny(warnings))] +#![cfg_attr(feature = "cargo-clippy", allow(option_unwrap_used))] extern crate cupid; + #[macro_use] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] extern crate stdsimd;