diff --git a/.travis.yml b/.travis.yml
index c7a7bbc469..c91e4f34de 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,9 +21,13 @@ matrix:
       script: |
         cargo install rustfmt-nightly
         cargo fmt --all -- --write-mode=diff
+    - env: CLIPPY=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
+      script: |
+        cargo install clippy
+        cargo clippy --all -- -D clippy-pedantic
   allow_failures:
     - env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
-
+    - env: CLIPPY=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1
 install:
   - if [ "$NO_ADD" == "" ]; then rustup target add $TARGET; fi
 
diff --git a/examples/nbody.rs b/examples/nbody.rs
index a9baa74ff7..0e5ea69437 100644
--- a/examples/nbody.rs
+++ b/examples/nbody.rs
@@ -1,12 +1,20 @@
+//! n-body benchmark from the [benchmarks game][bg].
+//!
+//! [bg]: https://benchmarksgame.alioth.debian.org/u64q/nbody-description.
+//! html#nbody
+
 #![cfg_attr(feature = "strict", deny(warnings))]
 #![feature(cfg_target_feature)]
 #![feature(target_feature)]
+#![cfg_attr(feature = "cargo-clippy",
+            allow(similar_names, missing_docs_in_private_items,
+                  shadow_reuse, print_stdout))]
 
 extern crate stdsimd;
 use self::stdsimd::simd;
 use simd::f64x2;
 
-const PI: f64 = 3.141592653589793;
+const PI: f64 = std::f64::consts::PI;
 const SOLAR_MASS: f64 = 4.0 * PI * PI;
 const DAYS_PER_YEAR: f64 = 365.24;
 
@@ -29,7 +37,7 @@ impl Frsqrt for f64x2 {
                     f32x4::new(t.extract(0), t.extract(1), 0., 0.),
                 ).as_f64x4()
             };
-            f64x2::new(u.extract(0), u.extract(1))
+            Self::new(u.extract(0), u.extract(1))
         }
         #[cfg(all(any(target_arch = "arm", target_arch = "aarch64"),
                   target_feature = "neon"))]
@@ -61,8 +69,8 @@ struct Body {
 impl Body {
     fn new(
         x0: f64, x1: f64, x2: f64, v0: f64, v1: f64, v2: f64, mass: f64
-    ) -> Body {
-        Body {
+    ) -> Self {
+        Self {
             x: [x0, x1, x2],
             _fill: 0.0,
             v: [v0, v1, v2],
@@ -103,8 +111,8 @@ fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
 
     i = 0;
     while i < N {
-        for m in 0..3 {
-            dx[m] = f64x2::new(r[i][m], r[i + 1][m]);
+        for (m, dx) in dx.iter_mut().enumerate() {
+            *dx = f64x2::new(r[i][m], r[i + 1][m]);
         }
 
         dsquared = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2];
@@ -144,11 +152,10 @@ fn energy(bodies: &[Body; N_BODIES]) -> f64 {
         e += bi.mass
             * (bi.v[0] * bi.v[0] + bi.v[1] * bi.v[1] + bi.v[2] * bi.v[2])
             / 2.0;
-        for j in i + 1..N_BODIES {
-            let bj = &bodies[j];
+        for bj in bodies.iter().take(N_BODIES).skip(i + 1) {
             let mut dx = [0.0; 3];
-            for k in 0..3 {
-                dx[k] = bi.x[k] - bj.x[k];
+            for (k, dx) in dx.iter_mut().enumerate() {
+                *dx = bi.x[k] - bj.x[k];
             }
             let mut distance = 0.0;
             for &d in &dx {
@@ -210,7 +217,7 @@ fn main() {
         .nth(1)
         .expect("need one arg")
         .parse()
-        .unwrap();
+        .expect("argument should be a usize");
 
     offset_momentum(&mut bodies);
     println!("{:.9}", energy(&bodies));
diff --git a/examples/play.rs b/examples/play.rs
index 26ce5dd812..3957ae2116 100644
--- a/examples/play.rs
+++ b/examples/play.rs
@@ -1,5 +1,10 @@
 #![cfg_attr(feature = "strict", deny(warnings))]
 #![feature(target_feature)]
+#![cfg_attr(feature = "cargo-clippy",
+            allow(similar_names, missing_docs_in_private_items,
+                  cast_sign_loss, cast_possible_truncation,
+                  cast_possible_wrap, option_unwrap_used, use_debug,
+                  print_stdout))]
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod example {
diff --git a/examples/types.rs b/examples/types.rs
index dc2a74e4fa..8bc0bc4e17 100644
--- a/examples/types.rs
+++ b/examples/types.rs
@@ -1,5 +1,8 @@
 #![cfg_attr(feature = "strict", deny(warnings))]
 #![feature(target_feature)]
+#![cfg_attr(feature = "cargo-clippy",
+            allow(missing_docs_in_private_items, result_unwrap_used,
+                  option_unwrap_used, print_stdout, use_debug))]
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod example {
diff --git a/examples/wat.rs b/examples/wat.rs
index 076be129a1..5a70eed85f 100644
--- a/examples/wat.rs
+++ b/examples/wat.rs
@@ -1,5 +1,8 @@
 #![cfg_attr(feature = "strict", deny(warnings))]
 #![feature(target_feature)]
+#![cfg_attr(feature = "cargo-clippy",
+            allow(missing_docs_in_private_items, result_unwrap_used,
+                  option_unwrap_used, print_stdout, use_debug))]
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod example {
diff --git a/src/lib.rs b/src/lib.rs
index 5c1333dc2d..7d6cc0916a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -26,7 +26,7 @@
 //! others at:
 //!
 //! * [i686](https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/)
-//! * [x86_64](https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/)
+//! * [`x86_64`](https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/)
 //! * [arm](https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/)
 //! * [aarch64](https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/)
 //!
@@ -122,6 +122,12 @@
            simd_ffi, target_feature, cfg_target_feature, i128_type, asm,
            const_atomic_usize_new, stmt_expr_attributes)]
 #![cfg_attr(test, feature(proc_macro, test))]
+#![cfg_attr(feature = "cargo-clippy",
+            allow(inline_always, too_many_arguments, cast_sign_loss,
+                  cast_lossless, cast_possible_wrap,
+                  cast_possible_truncation, cast_precision_loss,
+                  shadow_reuse, cyclomatic_complexity, similar_names,
+                  doc_markdown, many_single_char_names))]
 
 #[cfg(test)]
 extern crate stdsimd_test;
diff --git a/src/macros.rs b/src/macros.rs
index ab287a8000..3d4dbd798b 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -1,3 +1,5 @@
+//! Utility macros
+
 macro_rules! define_ty {
     ($name:ident, $($elty:ident),+) => {
         #[repr(simd)]
diff --git a/src/simd_llvm.rs b/src/simd_llvm.rs
index ec59fa7877..66a1cc8768 100644
--- a/src/simd_llvm.rs
+++ b/src/simd_llvm.rs
@@ -1,3 +1,7 @@
+//! LLVM's simd platform intrinsics
+//!
+//! TODO: should use `link_llvm_intrinsic` instead: issue #112
+
 extern "platform-intrinsic" {
     pub fn simd_eq<T, U>(x: T, y: T) -> U;
     pub fn simd_ne<T, U>(x: T, y: T) -> U;
diff --git a/src/v128.rs b/src/v128.rs
index f5c425bc74..9677cf9ab9 100644
--- a/src/v128.rs
+++ b/src/v128.rs
@@ -1,3 +1,5 @@
+//! 128-bit wide vector types
+
 use simd_llvm::*;
 
 define_ty! { f64x2, f64, f64 }
diff --git a/src/v256.rs b/src/v256.rs
index 33d2584f71..2687e18325 100644
--- a/src/v256.rs
+++ b/src/v256.rs
@@ -1,3 +1,5 @@
+//! 256-bit wide vector types
+
 use simd_llvm::*;
 
 define_ty! { f64x4, f64, f64, f64, f64 }
diff --git a/src/v512.rs b/src/v512.rs
index 4973a7001e..e763ef8ffa 100644
--- a/src/v512.rs
+++ b/src/v512.rs
@@ -1,3 +1,5 @@
+//! 512-bit wide vector types
+
 use simd_llvm::*;
 
 define_ty! { f64x8, f64, f64, f64, f64, f64, f64, f64, f64 }
diff --git a/src/v64.rs b/src/v64.rs
index fe7f59c2fd..9802fcf764 100644
--- a/src/v64.rs
+++ b/src/v64.rs
@@ -1,3 +1,5 @@
+//! 64-bit wide vector types
+
 use simd_llvm::*;
 
 define_ty_doc! {
diff --git a/src/x86/avx.rs b/src/x86/avx.rs
index dcf055179e..905e01449d 100644
--- a/src/x86/avx.rs
+++ b/src/x86/avx.rs
@@ -1,3 +1,18 @@
+//! Advanced Vector Extensions (AVX)
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//! Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture
+//! Programmer's Manual, Volume 3: General-Purpose and System
+//! Instructions][amd64_ref].
+//!
+//! [Wikipedia][wiki] provides a quick overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+
 use std::mem;
 use std::ptr;
 
@@ -113,7 +128,7 @@ pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
             }
         }
     }
-    match (imm8 >> 0) & 0x1 {
+    match imm8 & 0x1 {
         0 => shuffle1!(0),
         _ => shuffle1!(1),
     }
@@ -161,7 +176,7 @@ pub unsafe fn _mm256_shuffle_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
             }
         }
     }
-    match (imm8 >> 0) & 0x3 {
+    match imm8 & 0x3 {
         0 => shuffle1!(0, 4),
         1 => shuffle1!(1, 5),
         2 => shuffle1!(2, 6),
@@ -594,69 +609,69 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
     mem::transmute(a ^ b)
 }
 
-// Equal (ordered, non-signaling)
+/// Equal (ordered, non-signaling)
 pub const _CMP_EQ_OQ: u8 = 0x00;
-// Less-than (ordered, signaling)
+/// Less-than (ordered, signaling)
 pub const _CMP_LT_OS: u8 = 0x01;
-// Less-than-or-equal (ordered, signaling)
+/// Less-than-or-equal (ordered, signaling)
 pub const _CMP_LE_OS: u8 = 0x02;
-// Unordered (non-signaling)
+/// Unordered (non-signaling)
 pub const _CMP_UNORD_Q: u8 = 0x03;
-// Not-equal (unordered, non-signaling)
+/// Not-equal (unordered, non-signaling)
 pub const _CMP_NEQ_UQ: u8 = 0x04;
-// Not-less-than (unordered, signaling)
+/// Not-less-than (unordered, signaling)
 pub const _CMP_NLT_US: u8 = 0x05;
-// Not-less-than-or-equal (unordered, signaling)
+/// Not-less-than-or-equal (unordered, signaling)
 pub const _CMP_NLE_US: u8 = 0x06;
-// Ordered (non-signaling)
+/// Ordered (non-signaling)
 pub const _CMP_ORD_Q: u8 = 0x07;
-// Equal (unordered, non-signaling)
+/// Equal (unordered, non-signaling)
 pub const _CMP_EQ_UQ: u8 = 0x08;
-// Not-greater-than-or-equal (unordered, signaling)
+/// Not-greater-than-or-equal (unordered, signaling)
 pub const _CMP_NGE_US: u8 = 0x09;
-// Not-greater-than (unordered, signaling)
+/// Not-greater-than (unordered, signaling)
 pub const _CMP_NGT_US: u8 = 0x0a;
-// False (ordered, non-signaling)
+/// False (ordered, non-signaling)
 pub const _CMP_FALSE_OQ: u8 = 0x0b;
-// Not-equal (ordered, non-signaling)
+/// Not-equal (ordered, non-signaling)
 pub const _CMP_NEQ_OQ: u8 = 0x0c;
-// Greater-than-or-equal (ordered, signaling)
+/// Greater-than-or-equal (ordered, signaling)
 pub const _CMP_GE_OS: u8 = 0x0d;
-// Greater-than (ordered, signaling)
+/// Greater-than (ordered, signaling)
 pub const _CMP_GT_OS: u8 = 0x0e;
-// True (unordered, non-signaling)
+/// True (unordered, non-signaling)
 pub const _CMP_TRUE_UQ: u8 = 0x0f;
-// Equal (ordered, signaling)
+/// Equal (ordered, signaling)
 pub const _CMP_EQ_OS: u8 = 0x10;
-// Less-than (ordered, non-signaling)
+/// Less-than (ordered, non-signaling)
 pub const _CMP_LT_OQ: u8 = 0x11;
-// Less-than-or-equal (ordered, non-signaling)
+/// Less-than-or-equal (ordered, non-signaling)
 pub const _CMP_LE_OQ: u8 = 0x12;
-// Unordered (signaling)
+/// Unordered (signaling)
 pub const _CMP_UNORD_S: u8 = 0x13;
-// Not-equal (unordered, signaling)
+/// Not-equal (unordered, signaling)
 pub const _CMP_NEQ_US: u8 = 0x14;
-// Not-less-than (unordered, non-signaling)
+/// Not-less-than (unordered, non-signaling)
 pub const _CMP_NLT_UQ: u8 = 0x15;
-// Not-less-than-or-equal (unordered, non-signaling)
+/// Not-less-than-or-equal (unordered, non-signaling)
 pub const _CMP_NLE_UQ: u8 = 0x16;
-// Ordered (signaling)
+/// Ordered (signaling)
 pub const _CMP_ORD_S: u8 = 0x17;
-// Equal (unordered, signaling)
+/// Equal (unordered, signaling)
 pub const _CMP_EQ_US: u8 = 0x18;
-// Not-greater-than-or-equal (unordered, non-signaling)
+/// Not-greater-than-or-equal (unordered, non-signaling)
 pub const _CMP_NGE_UQ: u8 = 0x19;
-// Not-greater-than (unordered, non-signaling)
+/// Not-greater-than (unordered, non-signaling)
 pub const _CMP_NGT_UQ: u8 = 0x1a;
-// False (ordered, signaling)
+/// False (ordered, signaling)
 pub const _CMP_FALSE_OS: u8 = 0x1b;
-// Not-equal (ordered, signaling)
+/// Not-equal (ordered, signaling)
 pub const _CMP_NEQ_OS: u8 = 0x1c;
-// Greater-than-or-equal (ordered, non-signaling)
+/// Greater-than-or-equal (ordered, non-signaling)
 pub const _CMP_GE_OQ: u8 = 0x1d;
-// Greater-than (ordered, non-signaling)
+/// Greater-than (ordered, non-signaling)
 pub const _CMP_GT_OQ: u8 = 0x1e;
-// True (unordered, signaling)
+/// True (unordered, signaling)
 pub const _CMP_TRUE_US: u8 = 0x1f;
 
 /// Compare packed double-precision (64-bit) floating-point
@@ -920,13 +935,10 @@ pub unsafe fn _mm_permutevar_ps(a: f32x4, b: i32x4) -> f32x4 {
 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
 pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
     let imm8 = (imm8 & 0xFF) as u8;
-    const fn add4(x: u32) -> u32 {
-        x + 4
-    }
     macro_rules! shuffle4 {
         ($a:expr, $b:expr, $c:expr, $d:expr) => {
             simd_shuffle8(a, _mm256_undefined_ps(), [
-                $a, $b, $c, $d, add4($a), add4($b), add4($c), add4($d)
+                $a, $b, $c, $d, $a + 4, $b + 4, $c + 4, $d + 4
             ])
         }
     }
@@ -960,7 +972,7 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
             }
         }
     }
-    match (imm8 >> 0) & 0b11 {
+    match imm8 & 0b11 {
         0b00 => shuffle1!(0),
         0b01 => shuffle1!(1),
         0b10 => shuffle1!(2),
@@ -1014,7 +1026,7 @@ pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 {
             }
         }
     }
-    match (imm8 >> 0) & 0b11 {
+    match imm8 & 0b11 {
         0b00 => shuffle1!(0),
         0b01 => shuffle1!(1),
         0b10 => shuffle1!(2),
@@ -1022,6 +1034,8 @@ pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 {
     }
 }
 
+/// Shuffle double-precision (64-bit) floating-point elements in `a`
+/// within 256-bit lanes using the control in `b`.
 #[inline(always)]
 #[target_feature = "+avx"]
 #[cfg_attr(test, assert_instr(vpermilpd))]
@@ -1074,7 +1088,7 @@ pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 {
             }
         }
     }
-    match (imm8 >> 0) & 0x1 {
+    match imm8 & 0x1 {
         0 => shuffle1!(0),
         _ => shuffle1!(1),
     }
@@ -1102,7 +1116,7 @@ pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 {
             }
         }
     }
-    match (imm8 >> 0) & 0x1 {
+    match imm8 & 0x1 {
         0 => shuffle1!(0),
         _ => shuffle1!(1),
     }
@@ -2750,8 +2764,7 @@ mod tests {
         let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
         let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
         let r = avx::_mm256_dp_ps(a, b, 0xFF);
-        let e =
-            f32x8::new(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
+        let e = f32x8::new(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
         assert_eq!(r, e);
     }
 
diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs
index d8671b3f57..58b04eeb45 100644
--- a/src/x86/avx2.rs
+++ b/src/x86/avx2.rs
@@ -1,6 +1,27 @@
+//! Advanced Vector Extensions 2 (AVX)
+//!
+//! AVX2 expands most AVX commands to 256-bit wide vector registers and
+//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
+//!
+//! The references are:
+//!
+//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
+//!   Instruction Set Reference, A-Z][intel64_ref].
+//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
+//!   System Instructions][amd64_ref].
+//!
+//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
+//! overview of the instructions available.
+//!
+//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
+//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
+//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
+//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
+
 use simd_llvm::simd_cast;
 use simd_llvm::{simd_shuffle2, simd_shuffle4, simd_shuffle8};
 use simd_llvm::{simd_shuffle16, simd_shuffle32};
+
 use v256::*;
 use v128::*;
 use x86::__m256i;
@@ -116,28 +137,25 @@ pub unsafe fn _mm256_alignr_epi8(a: i8x32, b: i8x32, n: i32) -> i8x32 {
         (a, b, n)
     };
 
-    const fn add(a: u32, b: u32) -> u32 {
-        a + b
-    }
     macro_rules! shuffle {
         ($shift:expr) => {
             simd_shuffle32(b, a, [
-                add(0, $shift), add(1, $shift),
-                add(2, $shift), add(3, $shift),
-                add(4, $shift), add(5, $shift),
-                add(6, $shift), add(7, $shift),
-                add(8, $shift), add(9, $shift),
-                add(10, $shift), add(11, $shift),
-                add(12, $shift), add(13, $shift),
-                add(14, $shift), add(15, $shift),
-                add(16, $shift), add(17, $shift),
-                add(18, $shift), add(19, $shift),
-                add(20, $shift), add(21, $shift),
-                add(22, $shift), add(23, $shift),
-                add(24, $shift), add(25, $shift),
-                add(26, $shift), add(27, $shift),
-                add(28, $shift), add(29, $shift),
-                add(30, $shift), add(31, $shift),
+                0 + $shift, 1 + $shift,
+                2 + $shift, 3 + $shift,
+                4 + $shift, 5 + $shift,
+                6 + $shift, 7 + $shift,
+                8 + $shift, 9 + $shift,
+                10 + $shift, 11 + $shift,
+                12 + $shift, 13 + $shift,
+                14 + $shift, 15 + $shift,
+                16 + $shift, 17 + $shift,
+                18 + $shift, 19 + $shift,
+                20 + $shift, 21 + $shift,
+                22 + $shift, 23 + $shift,
+                24 + $shift, 25 + $shift,
+                26 + $shift, 27 + $shift,
+                28 + $shift, 29 + $shift,
+                30 + $shift, 31 + $shift,
             ])
         }
     }
@@ -340,7 +358,7 @@ pub unsafe fn _mm256_blendv_epi8(a: i8x32, b: i8x32, mask: __m256i) -> i8x32 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpbroadcastb))]
 pub unsafe fn _mm_broadcastb_epi8(a: i8x16) -> i8x16 {
-    simd_shuffle16(a, i8x16::splat(0i8), [0u32; 16])
+    simd_shuffle16(a, i8x16::splat(0_i8), [0_u32; 16])
 }
 
 /// Broadcast the low packed 8-bit integer from `a` to all elements of
@@ -349,7 +367,7 @@ pub unsafe fn _mm_broadcastb_epi8(a: i8x16) -> i8x16 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpbroadcastb))]
 pub unsafe fn _mm256_broadcastb_epi8(a: i8x16) -> i8x32 {
-    simd_shuffle32(a, i8x16::splat(0i8), [0u32; 32])
+    simd_shuffle32(a, i8x16::splat(0_i8), [0_u32; 32])
 }
 
 // NB: simd_shuffle4 with integer data types for `a` and `b` is
@@ -360,7 +378,7 @@ pub unsafe fn _mm256_broadcastb_epi8(a: i8x16) -> i8x32 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 pub unsafe fn _mm_broadcastd_epi32(a: i32x4) -> i32x4 {
-    simd_shuffle4(a, i32x4::splat(0i32), [0u32; 4])
+    simd_shuffle4(a, i32x4::splat(0_i32), [0_u32; 4])
 }
 
 // NB: simd_shuffle4 with integer data types for `a` and `b` is
@@ -371,7 +389,7 @@ pub unsafe fn _mm_broadcastd_epi32(a: i32x4) -> i32x4 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 pub unsafe fn _mm256_broadcastd_epi32(a: i32x4) -> i32x8 {
-    simd_shuffle8(a, i32x4::splat(0i32), [0u32; 8])
+    simd_shuffle8(a, i32x4::splat(0_i32), [0_u32; 8])
 }
 
 /// Broadcast the low packed 64-bit integer from `a` to all elements of
@@ -380,7 +398,7 @@ pub unsafe fn _mm256_broadcastd_epi32(a: i32x4) -> i32x8 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpbroadcastq))]
 pub unsafe fn _mm_broadcastq_epi64(a: i64x2) -> i64x2 {
-    simd_shuffle2(a, i64x2::splat(0i64), [0u32; 2])
+    simd_shuffle2(a, i64x2::splat(0_i64), [0_u32; 2])
 }
 
 // NB: simd_shuffle4 with integer data types for `a` and `b` is
@@ -391,7 +409,7 @@ pub unsafe fn _mm_broadcastq_epi64(a: i64x2) -> i64x2 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vbroadcastsd))]
 pub unsafe fn _mm256_broadcastq_epi64(a: i64x2) -> i64x4 {
-    simd_shuffle4(a, i64x2::splat(0i64), [0u32; 4])
+    simd_shuffle4(a, i64x2::splat(0_i64), [0_u32; 4])
 }
 
 /// Broadcast the low double-precision (64-bit) floating-point element
@@ -400,7 +418,7 @@ pub unsafe fn _mm256_broadcastq_epi64(a: i64x2) -> i64x4 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vmovddup))]
 pub unsafe fn _mm_broadcastsd_pd(a: f64x2) -> f64x2 {
-    simd_shuffle2(a, f64x2::splat(0f64), [0u32; 2])
+    simd_shuffle2(a, f64x2::splat(0_f64), [0_u32; 2])
 }
 
 /// Broadcast the low double-precision (64-bit) floating-point element
@@ -409,7 +427,7 @@ pub unsafe fn _mm_broadcastsd_pd(a: f64x2) -> f64x2 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vbroadcastsd))]
 pub unsafe fn _mm256_broadcastsd_pd(a: f64x2) -> f64x4 {
-    simd_shuffle4(a, f64x2::splat(0f64), [0u32; 4])
+    simd_shuffle4(a, f64x2::splat(0_f64), [0_u32; 4])
 }
 
 // NB: broadcastsi128_si256 is often compiled to vinsertf128 or
@@ -419,7 +437,7 @@ pub unsafe fn _mm256_broadcastsd_pd(a: f64x2) -> f64x4 {
 #[inline(always)]
 #[target_feature = "+avx2"]
 pub unsafe fn _mm256_broadcastsi128_si256(a: i64x2) -> i64x4 {
-    simd_shuffle4(a, i64x2::splat(0i64), [0, 1, 0, 1])
+    simd_shuffle4(a, i64x2::splat(0_i64), [0, 1, 0, 1])
 }
 
 /// Broadcast the low single-precision (32-bit) floating-point element
@@ -428,7 +446,7 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: i64x2) -> i64x4 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 pub unsafe fn _mm_broadcastss_ps(a: f32x4) -> f32x4 {
-    simd_shuffle4(a, f32x4::splat(0f32), [0u32; 4])
+    simd_shuffle4(a, f32x4::splat(0_f32), [0_u32; 4])
 }
 
 /// Broadcast the low single-precision (32-bit) floating-point element
@@ -437,7 +455,7 @@ pub unsafe fn _mm_broadcastss_ps(a: f32x4) -> f32x4 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vbroadcastss))]
 pub unsafe fn _mm256_broadcastss_ps(a: f32x4) -> f32x8 {
-    simd_shuffle8(a, f32x4::splat(0f32), [0u32; 8])
+    simd_shuffle8(a, f32x4::splat(0_f32), [0_u32; 8])
 }
 
 /// Broadcast the low packed 16-bit integer from a to all elements of
@@ -446,7 +464,7 @@ pub unsafe fn _mm256_broadcastss_ps(a: f32x4) -> f32x8 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpbroadcastw))]
 pub unsafe fn _mm_broadcastw_epi16(a: i16x8) -> i16x8 {
-    simd_shuffle8(a, i16x8::splat(0i16), [0u32; 8])
+    simd_shuffle8(a, i16x8::splat(0_i16), [0_u32; 8])
 }
 
 /// Broadcast the low packed 16-bit integer from a to all elements of
@@ -455,7 +473,7 @@ pub unsafe fn _mm_broadcastw_epi16(a: i16x8) -> i16x8 {
 #[target_feature = "+avx2"]
 #[cfg_attr(test, assert_instr(vpbroadcastw))]
 pub unsafe fn _mm256_broadcastw_epi16(a: i16x8) -> i16x16 {
-    simd_shuffle16(a, i16x8::splat(0i16), [0u32; 16])
+    simd_shuffle16(a, i16x8::splat(0_i16), [0_u32; 16])
 }
 
 // TODO _mm256_bslli_epi128
@@ -565,8 +583,8 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: i8x16) -> i32x8 {
     simd_cast::<::v64::i8x8, _>(simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]))
 }
 
-// An i8x4 type is pretty useless, but we need it as an intermediate type in
-// _mm256_cvtepi8_epi64.
+/// An i8x4 type is pretty useless, but we need it as an intermediate type in
+/// _mm256_cvtepi8_epi64.
 #[repr(simd)]
 #[allow(non_camel_case_types)]
 struct i8x4(i8, i8, i8, i8);
diff --git a/src/x86/bmi.rs b/src/x86/bmi.rs
index 2cc86d8009..8809f63854 100644
--- a/src/x86/bmi.rs
+++ b/src/x86/bmi.rs
@@ -21,7 +21,7 @@ use stdsimd_test::assert_instr;
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(bextr))]
 pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
-    _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32))
+    _bextr2_u32(a, (start & 0xff_u32) | ((len & 0xff_u32) << 8_u32))
 }
 
 /// Extracts bits in range [`start`, `start` + `length`) from `a` into
@@ -31,7 +31,7 @@ pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 {
 #[cfg_attr(test, assert_instr(bextr))]
 #[cfg(not(target_arch = "x86"))]
 pub unsafe fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 {
-    _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64))
+    _bextr2_u64(a, (start & 0xff_u64) | ((len & 0xff_u64) << 8_u64))
 }
 
 /// Extracts bits of `a` specified by `control` into
@@ -97,7 +97,7 @@ pub unsafe fn _blsi_u64(x: u64) -> u64 {
 #[target_feature = "+bmi"]
 #[cfg_attr(test, assert_instr(blsmsk))]
 pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
-    x ^ (x.wrapping_sub(1u32))
+    x ^ (x.wrapping_sub(1_u32))
 }
 
 /// Get mask up to lowest set bit.
@@ -106,7 +106,7 @@ pub unsafe fn _blsmsk_u32(x: u32) -> u32 {
 #[cfg_attr(test, assert_instr(blsmsk))]
 #[cfg(not(target_arch = "x86"))] // generates lots of instructions
 pub unsafe fn _blsmsk_u64(x: u64) -> u64 {
-    x ^ (x.wrapping_sub(1u64))
+    x ^ (x.wrapping_sub(1_u64))
 }
 
 /// Resets the lowest set bit of `x`.
diff --git a/src/x86/macros.rs b/src/x86/macros.rs
index e835fe2107..f268a3499b 100644
--- a/src/x86/macros.rs
+++ b/src/x86/macros.rs
@@ -1,3 +1,5 @@
+//! Utility macros.
+
 macro_rules! constify_imm8 {
     ($imm8:expr, $expand:ident) => {
         #[allow(overflowing_literals)]
diff --git a/src/x86/mod.rs b/src/x86/mod.rs
index a046c453c7..ba84f9d890 100644
--- a/src/x86/mod.rs
+++ b/src/x86/mod.rs
@@ -1,3 +1,5 @@
+//! `x86` and `x86_64` intrinsics.
+
 pub use self::sse::*;
 pub use self::sse2::*;
 pub use self::sse3::*;
@@ -14,8 +16,10 @@ pub use self::tbm::*;
 
 pub use self::runtime::{__unstable_detect_feature, __Feature};
 
+/// 128-bit wide signed integer vector type
 #[allow(non_camel_case_types)]
 pub type __m128i = ::v128::i8x16;
+/// 256-bit wide signed integer vector type
 #[allow(non_camel_case_types)]
 pub type __m256i = ::v256::i8x32;
 
diff --git a/src/x86/runtime.rs b/src/x86/runtime.rs
index ef78d0a317..4b7e3aa56e 100644
--- a/src/x86/runtime.rs
+++ b/src/x86/runtime.rs
@@ -131,21 +131,18 @@ pub enum __Feature {
     #[doc(hidden)] __NonExhaustive,
 }
 
+/// Sets the `bit`-th bit of `x`.
 fn set_bit(x: usize, bit: u32) -> usize {
     debug_assert!(32 > bit);
     x | 1 << bit
 }
 
+/// Tests the `bit`-th bit of `x`.
 fn test_bit(x: usize, bit: u32) -> bool {
     debug_assert!(32 > bit);
     x & (1 << bit) != 0
 }
 
-fn inv_test_bit(v: usize, idx: u32) -> bool {
-    debug_assert!(32 > idx);
-    ((v >> idx) & 1) != 0
-}
-
 /// Run-time feature detection on x86 works by using the CPUID instruction.
 ///
 /// The [CPUID Wikipedia page][wiki_cpuid] contains
@@ -174,7 +171,7 @@ fn detect_features() -> usize {
         /// below).
         asm!("cpuid"
              : "={ecx}"(proc_info_ecx), "={edx}"(proc_info_edx)
-             : "{eax}"(0x00000001u32), "{ecx}"(0 as u32)
+             : "{eax}"(0x0000_0001_u32), "{ecx}"(0 as u32)
              : :);
 
         /// 2. EAX=7, ECX=0: Queries "Extended Features"
@@ -182,48 +179,48 @@ fn detect_features() -> usize {
         /// (see below); the result in ECX is not currently needed.
         asm!("cpuid"
              : "={ebx}"(extended_features_ebx)
-             : "{eax}"(0x00000007u32), "{ecx}"(0 as u32)
+             : "{eax}"(0x0000_0007_u32), "{ecx}"(0 as u32)
              : :);
     }
 
     let mut value: usize = 0;
 
-    if inv_test_bit(extended_features_ebx, 3) {
+    if test_bit(extended_features_ebx, 3) {
         value = set_bit(value, __Feature::bmi as u32);
     }
-    if inv_test_bit(extended_features_ebx, 8) {
+    if test_bit(extended_features_ebx, 8) {
         value = set_bit(value, __Feature::bmi2 as u32);
     }
 
-    if inv_test_bit(proc_info_ecx, 0) {
+    if test_bit(proc_info_ecx, 0) {
         value = set_bit(value, __Feature::sse3 as u32);
     }
-    if inv_test_bit(proc_info_ecx, 5) {
+    if test_bit(proc_info_ecx, 5) {
         value = set_bit(value, __Feature::abm as u32);
     }
-    if inv_test_bit(proc_info_ecx, 9) {
+    if test_bit(proc_info_ecx, 9) {
         value = set_bit(value, __Feature::ssse3 as u32);
     }
-    if inv_test_bit(proc_info_ecx, 12) {
+    if test_bit(proc_info_ecx, 12) {
         value = set_bit(value, __Feature::fma as u32);
     }
-    if inv_test_bit(proc_info_ecx, 19) {
+    if test_bit(proc_info_ecx, 19) {
         value = set_bit(value, __Feature::sse4_1 as u32);
     }
-    if inv_test_bit(proc_info_ecx, 20) {
+    if test_bit(proc_info_ecx, 20) {
         value = set_bit(value, __Feature::sse4_2 as u32);
     }
-    if inv_test_bit(proc_info_ecx, 21) {
+    if test_bit(proc_info_ecx, 21) {
         value = set_bit(value, __Feature::tbm as u32);
     }
-    if inv_test_bit(proc_info_ecx, 23) {
+    if test_bit(proc_info_ecx, 23) {
         value = set_bit(value, __Feature::popcnt as u32);
     }
 
-    if inv_test_bit(proc_info_edx, 25) {
+    if test_bit(proc_info_edx, 25) {
         value = set_bit(value, __Feature::sse as u32);
     }
-    if inv_test_bit(proc_info_edx, 26) {
+    if test_bit(proc_info_edx, 26) {
         value = set_bit(value, __Feature::sse2 as u32);
     }
 
@@ -235,7 +232,9 @@ fn detect_features() -> usize {
     // - https://hg.mozilla.
     // org/mozilla-central/file/64bab5cbb9b6/mozglue/build/SSE.cpp#l190
     //
-    if inv_test_bit(proc_info_ecx, 26) && inv_test_bit(proc_info_ecx, 27) {
+    if test_bit(proc_info_ecx, 26) && test_bit(proc_info_ecx, 27) {
+        /// XGETBV: reads the contents of the extended control
+        /// register (XCR).
         unsafe fn xgetbv(xcr_no: u32) -> u64 {
             let eax: u32;
             let edx: u32;
@@ -249,10 +248,10 @@ fn detect_features() -> usize {
 
         // This is safe because on x86 `xgetbv` is always available.
         if unsafe { xgetbv(0) } & 6 == 6 {
-            if inv_test_bit(proc_info_ecx, 28) {
+            if test_bit(proc_info_ecx, 28) {
                 value = set_bit(value, __Feature::avx as u32);
             }
-            if inv_test_bit(extended_features_ebx, 5) {
+            if test_bit(extended_features_ebx, 5) {
                 value = set_bit(value, __Feature::avx2 as u32);
             }
         }
diff --git a/src/x86/sse.rs b/src/x86/sse.rs
index 5633d39c0e..49e10efdd1 100644
--- a/src/x86/sse.rs
+++ b/src/x86/sse.rs
@@ -1,3 +1,5 @@
+//! Streaming SIMD Extensions (SSE)
+
 use simd_llvm::simd_shuffle4;
 use v128::*;
 use v64::f32x2;
@@ -1136,7 +1138,7 @@ pub unsafe fn _mm_storeh_pi(p: *mut u64, a: f32x4) {
         // is just silly
         let a64: u64x2 = mem::transmute(a);
         let a_hi = a64.extract(1);
-        *p = mem::transmute(a_hi);
+        *p = a_hi;
     } else {
         // target_arch = "x86_64"
         // If this is a `u64x2` LLVM generates a pshufd + movq, but we really
@@ -1167,7 +1169,7 @@ pub unsafe fn _mm_storel_pi(p: *mut u64, a: f32x4) {
         // stack.
         let a64: u64x2 = mem::transmute(a);
         let a_hi = a64.extract(0);
-        *p = mem::transmute(a_hi);
+        *p = a_hi;
     } else {
         // target_arch = "x86_64"
         let a64: f64x2 = mem::transmute(a);
@@ -1306,7 +1308,7 @@ pub unsafe fn _mm_sfence() {
 #[target_feature = "+sse"]
 #[cfg_attr(test, assert_instr(stmxcsr))]
 pub unsafe fn _mm_getcsr() -> u32 {
-    let mut result = 0i32;
+    let mut result = 0_i32;
     stmxcsr((&mut result) as *mut _ as *mut i8);
     result as u32
 }
@@ -1455,6 +1457,7 @@ pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
 pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
 pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
+/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
 pub const _MM_EXCEPT_MASK: u32 = 0x003f;
 
 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
@@ -1469,6 +1472,7 @@ pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
 pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
 pub const _MM_MASK_INEXACT: u32 = 0x1000;
+/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
 pub const _MM_MASK_MASK: u32 = 0x1f80;
 
 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
@@ -1479,14 +1483,18 @@ pub const _MM_ROUND_DOWN: u32 = 0x2000;
 pub const _MM_ROUND_UP: u32 = 0x4000;
 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
 pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;
+
+/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
 pub const _MM_ROUND_MASK: u32 = 0x6000;
 
+/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
 pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
 pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
 pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
 
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
 #[inline(always)]
 #[allow(non_snake_case)]
 #[target_feature = "+sse"]
@@ -1494,6 +1502,7 @@ pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
     _mm_getcsr() & _MM_MASK_MASK
 }
 
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
 #[inline(always)]
 #[allow(non_snake_case)]
 #[target_feature = "+sse"]
@@ -1501,6 +1510,7 @@ pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
     _mm_getcsr() & _MM_EXCEPT_MASK
 }
 
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
 #[inline(always)]
 #[allow(non_snake_case)]
 #[target_feature = "+sse"]
@@ -1508,6 +1518,7 @@ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
     _mm_getcsr() & _MM_FLUSH_ZERO_MASK
 }
 
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
 #[inline(always)]
 #[allow(non_snake_case)]
 #[target_feature = "+sse"]
@@ -1515,6 +1526,7 @@ pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
     _mm_getcsr() & _MM_ROUND_MASK
 }
 
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
 #[inline(always)]
 #[allow(non_snake_case)]
 #[target_feature = "+sse"]
@@ -1522,6 +1534,7 @@ pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
     _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
 }
 
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
 #[inline(always)]
 #[allow(non_snake_case)]
 #[target_feature = "+sse"]
@@ -1529,6 +1542,7 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
     _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
 }
 
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
 #[inline(always)]
 #[allow(non_snake_case)]
 #[target_feature = "+sse"]
@@ -1538,6 +1552,7 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
     _mm_setcsr(val)
 }
 
+/// See [`_mm_setcsr`](fn._mm_setcsr.html)
 #[inline(always)]
 #[allow(non_snake_case)]
 #[target_feature = "+sse"]
diff --git a/src/x86/sse2.rs b/src/x86/sse2.rs
index c08fab33b6..264075ff43 100644
--- a/src/x86/sse2.rs
+++ b/src/x86/sse2.rs
@@ -1,3 +1,5 @@
+//! Streaming SIMD Extensions 2 (SSE2)
+
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
@@ -316,20 +318,17 @@ pub unsafe fn _mm_subs_epu16(a: u16x8, b: u16x8) -> u16x8 {
 #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))]
 pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
     let (zero, imm8) = (__m128i::splat(0), imm8 as u32);
-    const fn sub(a: u32, b: u32) -> u32 {
-        a - b
-    }
     macro_rules! shuffle {
         ($shift:expr) => {
             simd_shuffle16::<__m128i, __m128i>(zero, a, [
-                sub(16, $shift), sub(17, $shift),
-                sub(18, $shift), sub(19, $shift),
-                sub(20, $shift), sub(21, $shift),
-                sub(22, $shift), sub(23, $shift),
-                sub(24, $shift), sub(25, $shift),
-                sub(26, $shift), sub(27, $shift),
-                sub(28, $shift), sub(29, $shift),
-                sub(30, $shift), sub(31, $shift),
+                16 - $shift, 17 - $shift,
+                18 - $shift, 19 - $shift,
+                20 - $shift, 21 - $shift,
+                22 - $shift, 23 - $shift,
+                24 - $shift, 25 - $shift,
+                26 - $shift, 27 - $shift,
+                28 - $shift, 29 - $shift,
+                30 - $shift, 31 - $shift,
             ])
         }
     }
@@ -463,20 +462,17 @@ pub unsafe fn _mm_sra_epi32(a: i32x4, count: i32x4) -> i32x4 {
 #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))]
 pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
     let (zero, imm8) = (__m128i::splat(0), imm8 as u32);
-    const fn add(a: u32, b: u32) -> u32 {
-        a + b
-    }
     macro_rules! shuffle {
         ($shift:expr) => {
             simd_shuffle16::<__m128i, __m128i>(a, zero, [
-                add(0, $shift), add(1, $shift),
-                add(2, $shift), add(3, $shift),
-                add(4, $shift), add(5, $shift),
-                add(6, $shift), add(7, $shift),
-                add(8, $shift), add(9, $shift),
-                add(10, $shift), add(11, $shift),
-                add(12, $shift), add(13, $shift),
-                add(14, $shift), add(15, $shift),
+                0 + $shift, 1 + $shift,
+                2 + $shift, 3 + $shift,
+                4 + $shift, 5 + $shift,
+                6 + $shift, 7 + $shift,
+                8 + $shift, 9 + $shift,
+                10 + $shift, 11 + $shift,
+                12 + $shift, 13 + $shift,
+                14 + $shift, 15 + $shift,
             ])
         }
     }
@@ -1102,14 +1098,10 @@ pub unsafe fn _mm_shuffle_epi32(a: i32x4, imm8: i32) -> i32x4 {
 pub unsafe fn _mm_shufflehi_epi16(a: i16x8, imm8: i32) -> i16x8 {
     // See _mm_shuffle_epi32.
     let imm8 = (imm8 & 0xFF) as u8;
-    const fn add4(x: u32) -> u32 {
-        x + 4
-    }
-
     macro_rules! shuffle_done {
         ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
             simd_shuffle8(a, a, [
-                0, 1, 2, 3, add4($x01), add4($x23), add4($x45), add4($x67),
+                0, 1, 2, 3, $x01 + 4, $x23 + 4, $x45 + 4, $x67 + 4,
             ])
         }
     }
@@ -1657,7 +1649,7 @@ pub unsafe fn _mm_cmpnge_pd(a: f64x2, b: f64x2) -> f64x2 {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(comisd))]
 pub unsafe fn _mm_comieq_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(comieqsd(a, b) as u8)
+    comieqsd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for less-than.
@@ -1665,7 +1657,7 @@ pub unsafe fn _mm_comieq_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(comisd))]
 pub unsafe fn _mm_comilt_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(comiltsd(a, b) as u8)
+    comiltsd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for less-than-or-equal.
@@ -1673,7 +1665,7 @@ pub unsafe fn _mm_comilt_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(comisd))]
 pub unsafe fn _mm_comile_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(comilesd(a, b) as u8)
+    comilesd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for greater-than.
@@ -1681,7 +1673,7 @@ pub unsafe fn _mm_comile_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(comisd))]
 pub unsafe fn _mm_comigt_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(comigtsd(a, b) as u8)
+    comigtsd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for greater-than-or-equal.
@@ -1689,7 +1681,7 @@ pub unsafe fn _mm_comigt_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(comisd))]
 pub unsafe fn _mm_comige_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(comigesd(a, b) as u8)
+    comigesd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for not-equal.
@@ -1697,7 +1689,7 @@ pub unsafe fn _mm_comige_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(comisd))]
 pub unsafe fn _mm_comineq_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(comineqsd(a, b) as u8)
+    comineqsd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for equality.
@@ -1705,7 +1697,7 @@ pub unsafe fn _mm_comineq_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(ucomisd))]
 pub unsafe fn _mm_ucomieq_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(ucomieqsd(a, b) as u8)
+    ucomieqsd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for less-than.
@@ -1713,7 +1705,7 @@ pub unsafe fn _mm_ucomieq_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(ucomisd))]
 pub unsafe fn _mm_ucomilt_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(ucomiltsd(a, b) as u8)
+    ucomiltsd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for less-than-or-equal.
@@ -1721,7 +1713,7 @@ pub unsafe fn _mm_ucomilt_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(ucomisd))]
 pub unsafe fn _mm_ucomile_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(ucomilesd(a, b) as u8)
+    ucomilesd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for greater-than.
@@ -1729,7 +1721,7 @@ pub unsafe fn _mm_ucomile_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(ucomisd))]
 pub unsafe fn _mm_ucomigt_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(ucomigtsd(a, b) as u8)
+    ucomigtsd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for greater-than-or-equal.
@@ -1737,7 +1729,7 @@ pub unsafe fn _mm_ucomigt_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(ucomisd))]
 pub unsafe fn _mm_ucomige_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(ucomigesd(a, b) as u8)
+    ucomigesd(a, b) as u8 != 0
 }
 
 /// Compare the lower element of `a` and `b` for not-equal.
@@ -1745,7 +1737,7 @@ pub unsafe fn _mm_ucomige_sd(a: f64x2, b: f64x2) -> bool {
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(ucomisd))]
 pub unsafe fn _mm_ucomineq_sd(a: f64x2, b: f64x2) -> bool {
-    mem::transmute(ucomineqsd(a, b) as u8)
+    ucomineqsd(a, b) as u8 != 0
 }
 
 /// Convert packed double-precision (64-bit) floating-point elements in "a" to
@@ -1894,7 +1886,7 @@ pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> f64x2 {
     f64x2::new(a, b)
 }
 
-/// returns packed double-precision (64-bit) floating-point elements with all
+/// Returns packed double-precision (64-bit) floating-point elements with all
 /// zeros.
 #[inline(always)]
 #[target_feature = "+sse2"]
@@ -1913,8 +1905,6 @@ pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 {
     movmskpd(a)
 }
 
-
-
 /// Load 128-bits (composed of 2 packed double-precision (64-bit)
 /// floating-point elements) from memory into the returned vector.
 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
diff --git a/src/x86/sse3.rs b/src/x86/sse3.rs
index cba3129d1b..29cd98194b 100644
--- a/src/x86/sse3.rs
+++ b/src/x86/sse3.rs
@@ -1,3 +1,5 @@
+//! Streaming SIMD Extensions 3 (SSE3)
+
 use x86::__m128i;
 use simd_llvm::{simd_shuffle2, simd_shuffle4};
 use v128::*;
diff --git a/src/x86/sse41.rs b/src/x86/sse41.rs
index 90be986f8e..c0604b7a3e 100644
--- a/src/x86/sse41.rs
+++ b/src/x86/sse41.rs
@@ -1,3 +1,4 @@
+//! Streaming SIMD Extensions 4.1 (SSE4.1)
 
 use std::mem;
 
@@ -22,18 +23,28 @@ pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
 /// suppress exceptions
 pub const _MM_FROUND_NO_EXC: i32 = 0x08;
 /// round to nearest and do not suppress exceptions
-pub const _MM_FROUND_NINT: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT);
+pub const _MM_FROUND_NINT: i32 = 0x00;
 /// round down and do not suppress exceptions
-pub const _MM_FROUND_FLOOR: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
+pub const _MM_FROUND_FLOOR: i32 =
+    (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
 /// round up and do not suppress exceptions
-pub const _MM_FROUND_CEIL: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
+pub const _MM_FROUND_CEIL: i32 =
+    (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
 /// truncate and do not suppress exceptions
 pub const _MM_FROUND_TRUNC: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
-/// use MXCSR.RC and do not suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
-pub const _MM_FROUND_RINT: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
+/// use MXCSR.RC and do not suppress exceptions; see
+/// `vendor::_MM_SET_ROUNDING_MODE`
+pub const _MM_FROUND_RINT: i32 =
+    (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
 /// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
-pub const _MM_FROUND_NEARBYINT: i32 = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION);
+pub const _MM_FROUND_NEARBYINT: i32 =
+    (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION);
 
+/// Blend packed 8-bit integers from `a` and `b` using `mask`
+///
+/// The high bit of each corresponding mask byte determines the selection.
+/// If the high bit is set the element of `a` is selected. The element
+/// of `b` is selected otherwise.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pblendvb))]
@@ -41,6 +52,11 @@ pub unsafe fn _mm_blendv_epi8(a: i8x16, b: i8x16, mask: i8x16) -> i8x16 {
     pblendvb(a, b, mask)
 }
 
+/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
+///
+/// The mask bits determine the selection. A clear bit selects the
+/// corresponding element of `a`, and a set bit the corresponding
+/// element of `b`.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
@@ -209,8 +225,8 @@ pub unsafe fn _mm_max_epu16(a: u16x8, b: u16x8) -> u16x8 {
     pmaxuw(a, b)
 }
 
-// Compare packed 32-bit integers in `a` and `b`, and return packed maximum
-// values.
+/// Compare packed 32-bit integers in `a` and `b`, and return packed maximum
+/// values.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmaxsd, imm8 = 0))]
@@ -218,8 +234,8 @@ pub unsafe fn _mm_max_epi32(a: i32x4, b: i32x4) -> i32x4 {
     pmaxsd(a, b)
 }
 
-// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed
-// maximum values.
+/// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed
+/// maximum values.
 #[inline(always)]
 #[target_feature = "+sse4.1"]
 #[cfg_attr(test, assert_instr(pmaxud, imm8 = 0))]
@@ -356,11 +372,17 @@ pub unsafe fn _mm_ceil_ss(a: f32x4, b: f32x4) -> f32x4 {
 ///
 /// ```
 /// use stdsimd::vendor;
-/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); // round to nearest, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC);     // round down, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC);     // round up, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC);        // truncate, and suppress exceptions
-/// vendor::_MM_FROUND_CUR_DIRECTION; // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
+///
+/// // round to nearest, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC);
+/// // round down, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC);
+/// // round up, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC);
+/// // truncate, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC);
+/// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`:
+/// vendor::_MM_FROUND_CUR_DIRECTION;
 /// ```
 #[inline(always)]
 #[target_feature = "+sse4.1"]
@@ -379,11 +401,17 @@ pub unsafe fn _mm_round_pd(a: f64x2, rounding: i32) -> f64x2 {
 ///
 /// ```
 /// use stdsimd::vendor;
-/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); // round to nearest, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC);     // round down, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC);     // round up, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC);        // truncate, and suppress exceptions
-/// vendor::_MM_FROUND_CUR_DIRECTION; // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
+///
+/// // round to nearest, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC);
+/// // round down, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC);
+/// // round up, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC);
+/// // truncate, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC);
+/// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`:
+/// vendor::_MM_FROUND_CUR_DIRECTION;
 /// ```
 #[inline(always)]
 #[target_feature = "+sse4.1"]
@@ -404,11 +432,17 @@ pub unsafe fn _mm_round_ps(a: f32x4, rounding: i32) -> f32x4 {
 ///
 /// ```
 /// use stdsimd::vendor;
-/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); // round to nearest, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC);     // round down, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC);     // round up, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC);        // truncate, and suppress exceptions
-/// vendor::_MM_FROUND_CUR_DIRECTION; // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
+///
+/// // round to nearest, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC);
+/// // round down, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC);
+/// // round up, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC);
+/// // truncate, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC);
+/// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`:
+/// vendor::_MM_FROUND_CUR_DIRECTION;
 /// ```
 #[inline(always)]
 #[target_feature = "+sse4.1"]
@@ -429,11 +463,17 @@ pub unsafe fn _mm_round_sd(a: f64x2, b: f64x2, rounding: i32) -> f64x2 {
 ///
 /// ```
 /// use stdsimd::vendor;
-/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); // round to nearest, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC);     // round down, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC);     // round up, and suppress exceptions
-/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC);        // truncate, and suppress exceptions
-/// vendor::_MM_FROUND_CUR_DIRECTION; // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
+///
+/// // round to nearest, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC);
+/// // round down, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_NEG_INF |vendor::_MM_FROUND_NO_EXC);
+/// // round up, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_POS_INF |vendor::_MM_FROUND_NO_EXC);
+/// // truncate, and suppress exceptions:
+/// (vendor::_MM_FROUND_TO_ZERO |vendor::_MM_FROUND_NO_EXC);
+/// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`:
+/// vendor::_MM_FROUND_CUR_DIRECTION;
 /// ```
 #[inline(always)]
 #[target_feature = "+sse4.1"]
diff --git a/src/x86/sse42.rs b/src/x86/sse42.rs
index eb551684c9..0a1be2e947 100644
--- a/src/x86/sse42.rs
+++ b/src/x86/sse42.rs
@@ -1,3 +1,7 @@
+//! Streaming SIMD Extensions 4.2 (SSE4.2)
+//!
+//! Extends SSE4.1 with STTNI (String and Text New Instructions).
+
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
@@ -5,42 +9,42 @@ use v128::*;
 use x86::__m128i;
 
 /// String contains unsigned 8-bit characters *(Default)*
-pub const _SIDD_UBYTE_OPS: i8 = 0b00000000;
+pub const _SIDD_UBYTE_OPS: i8 = 0b0000_0000;
 /// String contains unsigned 16-bit characters
-pub const _SIDD_UWORD_OPS: i8 = 0b00000001;
+pub const _SIDD_UWORD_OPS: i8 = 0b0000_0001;
 /// String contains signed 8-bit characters
-pub const _SIDD_SBYTE_OPS: i8 = 0b00000010;
+pub const _SIDD_SBYTE_OPS: i8 = 0b0000_0010;
 /// String contains unsigned 16-bit characters
-pub const _SIDD_SWORD_OPS: i8 = 0b00000011;
+pub const _SIDD_SWORD_OPS: i8 = 0b0000_0011;
 
 /// For each character in `a`, find if it is in `b` *(Default)*
-pub const _SIDD_CMP_EQUAL_ANY: i8 = 0b00000000;
-/// For each character in `a`, determine if `b[0] <= c <= b[1] or b[1] <= c <=
-/// b[2]...`
-pub const _SIDD_CMP_RANGES: i8 = 0b00000100;
+pub const _SIDD_CMP_EQUAL_ANY: i8 = 0b0000_0000;
+/// For each character in `a`, determine if
+/// `b[0] <= c <= b[1] or b[1] <= c <= b[2]...`
+pub const _SIDD_CMP_RANGES: i8 = 0b0000_0100;
 /// The strings defined by `a` and `b` are equal
-pub const _SIDD_CMP_EQUAL_EACH: i8 = 0b00001000;
+pub const _SIDD_CMP_EQUAL_EACH: i8 = 0b0000_1000;
 /// Search for the defined substring in the target
-pub const _SIDD_CMP_EQUAL_ORDERED: i8 = 0b00001100;
+pub const _SIDD_CMP_EQUAL_ORDERED: i8 = 0b0000_1100;
 
 /// Do not negate results *(Default)*
-pub const _SIDD_POSITIVE_POLARITY: i8 = 0b00000000;
+pub const _SIDD_POSITIVE_POLARITY: i8 = 0b0000_0000;
 /// Negate results
-pub const _SIDD_NEGATIVE_POLARITY: i8 = 0b00010000;
+pub const _SIDD_NEGATIVE_POLARITY: i8 = 0b0001_0000;
 /// Do not negate results before the end of the string
-pub const _SIDD_MASKED_POSITIVE_POLARITY: i8 = 0b00100000;
+pub const _SIDD_MASKED_POSITIVE_POLARITY: i8 = 0b0010_0000;
 /// Negate results only before the end of the string
-pub const _SIDD_MASKED_NEGATIVE_POLARITY: i8 = 0b00110000;
+pub const _SIDD_MASKED_NEGATIVE_POLARITY: i8 = 0b0011_0000;
 
 /// **Index only**: return the least significant bit *(Default)*
-pub const _SIDD_LEAST_SIGNIFICANT: i8 = 0b00000000;
+pub const _SIDD_LEAST_SIGNIFICANT: i8 = 0b0000_0000;
 /// **Index only**: return the most significant bit
-pub const _SIDD_MOST_SIGNIFICANT: i8 = 0b01000000;
+pub const _SIDD_MOST_SIGNIFICANT: i8 = 0b0100_0000;
 
 /// **Mask only**: return the bit mask
-pub const _SIDD_BIT_MASK: i8 = 0b00000000;
+pub const _SIDD_BIT_MASK: i8 = 0b0000_0000;
 /// **Mask only**: return the byte mask
-pub const _SIDD_UNIT_MASK: i8 = 0b01000000;
+pub const _SIDD_UNIT_MASK: i8 = 0b0100_0000;
 
 /// Compare packed strings with implicit lengths in `a` and `b` using the
 /// control in `imm8`, and return the generated mask.
diff --git a/src/x86/ssse3.rs b/src/x86/ssse3.rs
index b5c9d3ae9e..a5b6ab852a 100644
--- a/src/x86/ssse3.rs
+++ b/src/x86/ssse3.rs
@@ -1,3 +1,5 @@
+//! Supplemental Streaming SIMD Extensions 3 (SSSE3)
+
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
@@ -84,20 +86,17 @@ pub unsafe fn _mm_alignr_epi8(a: i8x16, b: i8x16, n: i32) -> i8x16 {
         (a, b, n)
     };
 
-    const fn add(a: u32, b: u32) -> u32 {
-        a + b
-    }
     macro_rules! shuffle {
         ($shift:expr) => {
             simd_shuffle16(b, a, [
-                add(0, $shift), add(1, $shift),
-                add(2, $shift), add(3, $shift),
-                add(4, $shift), add(5, $shift),
-                add(6, $shift), add(7, $shift),
-                add(8, $shift), add(9, $shift),
-                add(10, $shift), add(11, $shift),
-                add(12, $shift), add(13, $shift),
-                add(14, $shift), add(15, $shift),
+                0 + $shift, 1 + $shift,
+                2 + $shift, 3 + $shift,
+                4 + $shift, 5 + $shift,
+                6 + $shift, 7 + $shift,
+                8 + $shift, 9 + $shift,
+                10 + $shift, 11 + $shift,
+                12 + $shift, 13 + $shift,
+                14 + $shift, 15 + $shift,
             ])
         }
     }
diff --git a/tests/cpu-detection.rs b/tests/cpu-detection.rs
index 36cf7c97aa..294fd8ca7c 100644
--- a/tests/cpu-detection.rs
+++ b/tests/cpu-detection.rs
@@ -1,7 +1,9 @@
-#![cfg_attr(feature = "strict", deny(warnings))]
 #![feature(cfg_target_feature)]
+#![cfg_attr(feature = "strict", deny(warnings))]
+#![cfg_attr(feature = "cargo-clippy", allow(option_unwrap_used))]
 
 extern crate cupid;
+
 #[macro_use]
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 extern crate stdsimd;