diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index 0643e240fd..227d227f4e 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -8,7 +8,7 @@ pub use self::generated::*; // FIXME: replace neon with asimd use crate::{ - core_arch::{arm::*, simd_llvm::*}, + core_arch::{arm::*, simd::*, simd_llvm::*}, mem::{transmute, zeroed}, }; #[cfg(test)] @@ -19,14 +19,6 @@ types! { pub struct float64x1_t(f64); // FIXME: check this! /// ARM-specific 128-bit wide vector of two packed `f64`. pub struct float64x2_t(f64, f64); - /// ARM-specific 64-bit wide vector of one packed `p64`. - pub struct poly64x1_t(i64); // FIXME: check this! - /// ARM-specific 64-bit wide vector of one packed `p64`. - pub struct poly64_t(i64); // FIXME: check this! - /// ARM-specific 64-bit wide vector of two packed `p64`. - pub struct poly64x2_t(i64, i64); // FIXME: check this! - /// ARM-specific 128-bit wide vector of one packed `p64`. - pub struct poly128_t(i128); // FIXME: check this! } /// ARM-specific type containing two `int8x16_t` vectors. @@ -360,6 +352,333 @@ extern "C" { fn vsriq_n_s64_(a: int64x2_t, b: int64x2_t, n: i32) -> int64x2_t; } +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t { + transmute(i8x8::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + *ptr.offset(4), + *ptr.offset(5), + *ptr.offset(6), + *ptr.offset(7), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t { + transmute(i8x16::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + *ptr.offset(4), + *ptr.offset(5), + *ptr.offset(6), + *ptr.offset(7), + *ptr.offset(8), + *ptr.offset(9), + *ptr.offset(10), + *ptr.offset(11), + *ptr.offset(12), + *ptr.offset(13), + *ptr.offset(14), + *ptr.offset(15), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t { + transmute(i16x4::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t { + transmute(i16x8::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + *ptr.offset(4), + *ptr.offset(5), + *ptr.offset(6), + *ptr.offset(7), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t { + transmute(i32x2::new(*ptr, *ptr.offset(1))) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t { + transmute(i32x4::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t { + transmute(i64x1::new(*ptr)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t { + transmute(i64x2::new(*ptr, *ptr.offset(1))) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t { + transmute(u8x8::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + *ptr.offset(4), + *ptr.offset(5), + *ptr.offset(6), + *ptr.offset(7), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t { + transmute(u8x16::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + *ptr.offset(4), + *ptr.offset(5), + *ptr.offset(6), + *ptr.offset(7), + *ptr.offset(8), + *ptr.offset(9), + *ptr.offset(10), + *ptr.offset(11), + *ptr.offset(12), + *ptr.offset(13), + *ptr.offset(14), + *ptr.offset(15), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t { + transmute(u16x4::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t { + transmute(u16x8::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + *ptr.offset(4), + *ptr.offset(5), + *ptr.offset(6), + *ptr.offset(7), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t { + transmute(u32x2::new(*ptr, *ptr.offset(1))) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t { + transmute(u32x4::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t { + transmute(u64x1::new(*ptr)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t { + transmute(u64x2::new(*ptr, *ptr.offset(1))) +} + +/// Load multiple single-element structures to one, two, three, or four registers. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t { + transmute(u8x8::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + *ptr.offset(4), + *ptr.offset(5), + *ptr.offset(6), + *ptr.offset(7), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t { + transmute(u8x16::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + *ptr.offset(4), + *ptr.offset(5), + *ptr.offset(6), + *ptr.offset(7), + *ptr.offset(8), + *ptr.offset(9), + *ptr.offset(10), + *ptr.offset(11), + *ptr.offset(12), + *ptr.offset(13), + *ptr.offset(14), + *ptr.offset(15), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t { + transmute(u16x4::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t { + transmute(u16x8::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + *ptr.offset(4), + *ptr.offset(5), + *ptr.offset(6), + *ptr.offset(7), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t { + transmute(f32x2::new(*ptr, *ptr.offset(1))) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t { + transmute(f32x4::new( + *ptr, + *ptr.offset(1), + *ptr.offset(2), + *ptr.offset(3), + )) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1_f64(ptr: *const f64) -> float64x1_t { + transmute(f64x1::new(*ptr)) +} + +/// Load multiple single-element structures to one, two, three, or four registers. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_f64(ptr: *const f64) -> float64x2_t { + transmute(f64x2::new(*ptr, *ptr.offset(1))) +} + /// Absolute Value (wrapping). 
#[inline] #[target_feature(enable = "neon")] @@ -656,7 +975,7 @@ pub unsafe fn vaddvq_u64(a: uint64x2_t) -> u64 { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(pmull))] -pub unsafe fn vmull_p64(a: poly64_t, b: poly64_t) -> poly128_t { +pub unsafe fn vmull_p64(a: p64, b: p64) -> p128 { transmute(vmull_p64_(transmute(a), transmute(b))) } @@ -1338,7 +1657,6 @@ pub unsafe fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(tbx))] pub unsafe fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { - use crate::core_arch::simd::i8x8; let r = vqtbx1_s8(a, vcombine_s8(b, zeroed()), transmute(c)); let m: int8x8_t = simd_lt(c, transmute(i8x8::splat(8))); simd_select(m, r, a) @@ -1350,7 +1668,6 @@ pub unsafe fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(tbx))] pub unsafe fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { - use crate::core_arch::simd::u8x8; let r = vqtbx1_u8(a, vcombine_u8(b, zeroed()), c); let m: int8x8_t = simd_lt(c, transmute(u8x8::splat(8))); simd_select(m, r, a) @@ -1362,7 +1679,6 @@ pub unsafe fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(tbx))] pub unsafe fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t { - use crate::core_arch::simd::u8x8; let r = vqtbx1_p8(a, vcombine_p8(b, zeroed()), c); let m: int8x8_t = simd_lt(c, transmute(u8x8::splat(8))); simd_select(m, r, a) @@ -1401,7 +1717,6 @@ pub unsafe fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(tbx))] pub unsafe fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t { - use crate::core_arch::simd::i8x8; let r = vqtbx2_s8( a, int8x16x2_t(vcombine_s8(b.0, b.1), vcombine_s8(b.2, zeroed())), @@ -1417,7 +1732,6 @@ pub unsafe fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t { #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(tbx))] pub unsafe fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t { - use crate::core_arch::simd::u8x8; let r = vqtbx2_u8( a, uint8x16x2_t(vcombine_u8(b.0, b.1), vcombine_u8(b.2, zeroed())), @@ -1433,7 +1747,6 @@ pub unsafe fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(tbx))] pub unsafe fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t { - use crate::core_arch::simd::u8x8; let r = vqtbx2_p8( a, poly8x16x2_t(vcombine_p8(b.0, b.1), vcombine_p8(b.2, zeroed())), @@ -1986,45 +2299,6 @@ pub unsafe fn vqtbx4q_p8(a: poly8x16_t, t: poly8x16x4_t, idx: uint8x16_t) -> pol )) } -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ldr))] -pub unsafe fn vld1q_f32(addr: *const f32) -> float32x4_t { - use crate::core_arch::simd::f32x4; - transmute(f32x4::new( - *addr, - *addr.offset(1), - *addr.offset(2), - *addr.offset(3), - )) -} - -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ldr))] -pub unsafe fn vld1q_s32(addr: *const i32) -> int32x4_t { - use crate::core_arch::simd::i32x4; - transmute(i32x4::new( - *addr, - *addr.offset(1), - *addr.offset(2), - *addr.offset(3), - )) -} - -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ldr))] -pub unsafe fn vld1q_u32(addr: *const u32) -> 
uint32x4_t { - use crate::core_arch::simd::u32x4; - transmute(u32x4::new( - *addr, - *addr.offset(1), - *addr.offset(2), - *addr.offset(3), - )) -} - #[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fcvtzs))] @@ -2446,36 +2720,6 @@ mod tests { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vld1q_f32() { - let e = f32x4::new(1., 2., 3., 4.); - let f = [0., 1., 2., 3., 4.]; - // do a load that has 4 byte alignment to make sure we're not - // over aligning it - let r: f32x4 = transmute(vld1q_f32(f[1..].as_ptr())); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s32() { - let e = i32x4::new(1, 2, 3, 4); - let f = [0, 1, 2, 3, 4]; - // do a load that has 4 byte alignment to make sure we're not - // over aligning it - let r: i32x4 = transmute(vld1q_s32(f[1..].as_ptr())); - assert_eq!(r, e); - } - - #[simd_test(enable = "neon")] - unsafe fn test_vld1q_u32() { - let e = u32x4::new(1, 2, 3, 4); - let f = [0, 1, 2, 3, 4]; - // do a load that has 4 byte alignment to make sure we're not - // over aligning it - let r: u32x4 = transmute(vld1q_u32(f[1..].as_ptr())); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vuqadd_s8() { let a = i8x8::new(i8::MIN, -3, -2, -1, 0, 1, 2, i8::MAX); @@ -3775,3 +4019,7 @@ mod table_lookup_tests; #[cfg(test)] #[path = "../../arm/neon/shift_and_insert_tests.rs"] mod shift_and_insert_tests; + +#[cfg(test)] +#[path = "../../arm/neon/load_tests.rs"] +mod load_tests; diff --git a/crates/core_arch/src/arm/neon/load_tests.rs b/crates/core_arch/src/arm/neon/load_tests.rs new file mode 100644 index 0000000000..82e2f74955 --- /dev/null +++ b/crates/core_arch/src/arm/neon/load_tests.rs @@ -0,0 +1,208 @@ +//! Tests for ARM+v7+neon load (vld1) intrinsics. +//! +//! These are included in `{arm, aarch64}::neon`. 
+ +use super::*; + +#[cfg(target_arch = "arm")] +use crate::core_arch::arm::*; + +#[cfg(target_arch = "aarch64")] +use crate::core_arch::aarch64::*; + +use crate::core_arch::simd::*; +use std::mem; +use stdarch_test::simd_test; +#[simd_test(enable = "neon")] +unsafe fn test_vld1_s8() { + let a: [i8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = i8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: i8x8 = transmute(vld1_s8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_s8() { + let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r: i8x16 = transmute(vld1q_s8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_s16() { + let a: [i16; 5] = [0, 1, 2, 3, 4]; + let e = i16x4::new(1, 2, 3, 4); + let r: i16x4 = transmute(vld1_s16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_s16() { + let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: i16x8 = transmute(vld1q_s16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_s32() { + let a: [i32; 3] = [0, 1, 2]; + let e = i32x2::new(1, 2); + let r: i32x2 = transmute(vld1_s32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_s32() { + let a: [i32; 5] = [0, 1, 2, 3, 4]; + let e = i32x4::new(1, 2, 3, 4); + let r: i32x4 = transmute(vld1q_s32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_s64() { + let a: [i64; 2] = [0, 1]; + let e = i64x1::new(1); + let r: i64x1 = transmute(vld1_s64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_s64() { + let a: [i64; 3] = [0, 1, 2]; + let e = i64x2::new(1, 2); + let r: i64x2 = transmute(vld1q_s64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_u8() { + let a: [u8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u8x8 = transmute(vld1_u8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_u8() { + let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r: u8x16 = transmute(vld1q_u8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_u16() { + let a: [u16; 5] = [0, 1, 2, 3, 4]; + let e = u16x4::new(1, 2, 3, 4); + let r: u16x4 = transmute(vld1_u16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_u16() { + let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u16x8 = transmute(vld1q_u16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_u32() { + let a: [u32; 3] = [0, 1, 2]; + let e = u32x2::new(1, 2); + let r: u32x2 = transmute(vld1_u32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_u32() { + let a: [u32; 5] = [0, 1, 2, 3, 4]; + let e = u32x4::new(1, 2, 3, 4); + let r: u32x4 = transmute(vld1q_u32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_u64() { + let a: [u64; 2] = [0, 1]; + let e = u64x1::new(1); + let r: u64x1 = transmute(vld1_u64(a[1..].as_ptr())); + 
assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_u64() { + let a: [u64; 3] = [0, 1, 2]; + let e = u64x2::new(1, 2); + let r: u64x2 = transmute(vld1q_u64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_p8() { + let a: [p8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = u8x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u8x8 = transmute(vld1_p8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_p8() { + let a: [p8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); + let r: u8x16 = transmute(vld1q_p8(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_p16() { + let a: [p16; 5] = [0, 1, 2, 3, 4]; + let e = u16x4::new(1, 2, 3, 4); + let r: u16x4 = transmute(vld1_p16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_p16() { + let a: [p16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e = u16x8::new(1, 2, 3, 4, 5, 6, 7, 8); + let r: u16x8 = transmute(vld1q_p16(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1_f32() { + let a: [f32; 3] = [0., 1., 2.]; + let e = f32x2::new(1., 2.); + let r: f32x2 = transmute(vld1_f32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_f32() { + let a: [f32; 5] = [0., 1., 2., 3., 4.]; + let e = f32x4::new(1., 2., 3., 4.); + let r: f32x4 = transmute(vld1q_f32(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[cfg(target_arch = "aarch64")] +#[simd_test(enable = "neon")] +unsafe fn test_vld1_f64() { + let a: [f64; 2] = [0., 1.]; + let e = f64x1::new(1.); + let r: f64x1 = transmute(vld1_f64(a[1..].as_ptr())); + assert_eq!(r, e) +} + +#[cfg(target_arch = "aarch64")] +#[simd_test(enable = "neon")] +unsafe fn test_vld1q_f64() { + let a: [f64; 3] = [0., 1., 2.]; + let e = f64x2::new(1., 2.); + let r: f64x2 = transmute(vld1q_f64(a[1..].as_ptr())); + assert_eq!(r, e) +} diff --git a/crates/core_arch/src/arm/neon/mod.rs b/crates/core_arch/src/arm/neon/mod.rs index bc67bc1a6d..8dae4b4d8b 100644 --- a/crates/core_arch/src/arm/neon/mod.rs +++ b/crates/core_arch/src/arm/neon/mod.rs @@ -5,25 +5,34 @@ mod generated; #[rustfmt::skip] pub use self::generated::*; -use crate::{core_arch::simd_llvm::*, hint::unreachable_unchecked, mem::transmute, ptr}; +#[cfg(target_arch = "arm")] +use crate::mem::align_of; +use crate::{ + core_arch::simd::*, core_arch::simd_llvm::*, hint::unreachable_unchecked, mem::transmute, +}; #[cfg(test)] use stdarch_test::assert_instr; +pub(crate) type p8 = u8; +pub(crate) type p16 = u16; +pub(crate) type p64 = u64; +pub(crate) type p128 = u128; + types! { /// ARM-specific 64-bit wide vector of eight packed `i8`. pub struct int8x8_t(i8, i8, i8, i8, i8, i8, i8, i8); /// ARM-specific 64-bit wide vector of eight packed `u8`. pub struct uint8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); - /// ARM-specific 64-bit wide polynomial vector of eight packed `u8`. - pub struct poly8x8_t(u8, u8, u8, u8, u8, u8, u8, u8); + /// ARM-specific 64-bit wide polynomial vector of eight packed `p8`. + pub struct poly8x8_t(p8, p8, p8, p8, p8, p8, p8, p8); /// ARM-specific 64-bit wide vector of four packed `i16`. pub struct int16x4_t(i16, i16, i16, i16); /// ARM-specific 64-bit wide vector of four packed `u16`. pub struct uint16x4_t(u16, u16, u16, u16); // FIXME: ARM-specific 64-bit wide vector of four packed `f16`. 
    // pub struct float16x4_t(f16, f16, f16, f16);
-    /// ARM-specific 64-bit wide vector of four packed `u16`.
-    pub struct poly16x4_t(u16, u16, u16, u16);
+    /// ARM-specific 64-bit wide vector of four packed `p16`.
+    pub struct poly16x4_t(p16, p16, p16, p16);
     /// ARM-specific 64-bit wide vector of two packed `i32`.
     pub struct int32x2_t(i32, i32);
     /// ARM-specific 64-bit wide vector of two packed `u32`.
@@ -34,21 +43,23 @@ types! {
     pub struct int64x1_t(i64);
     /// ARM-specific 64-bit wide vector of one packed `u64`.
     pub struct uint64x1_t(u64);
+    /// ARM-specific 64-bit wide vector of one packed `p64`.
+    pub struct poly64x1_t(p64);
     /// ARM-specific 128-bit wide vector of sixteen packed `i8`.
     pub struct int8x16_t(
-        i8, i8 ,i8, i8, i8, i8 ,i8, i8,
-        i8, i8 ,i8, i8, i8, i8 ,i8, i8,
+        i8, i8, i8, i8, i8, i8, i8, i8,
+        i8, i8, i8, i8, i8, i8, i8, i8,
     );
     /// ARM-specific 128-bit wide vector of sixteen packed `u8`.
     pub struct uint8x16_t(
         u8, u8 ,u8, u8, u8, u8 ,u8, u8,
         u8, u8 ,u8, u8, u8, u8 ,u8, u8,
     );
-    /// ARM-specific 128-bit wide vector of sixteen packed `u8`.
+    /// ARM-specific 128-bit wide vector of sixteen packed `p8`.
     pub struct poly8x16_t(
-        u8, u8, u8, u8, u8, u8, u8, u8,
-        u8, u8, u8, u8, u8, u8, u8, u8
+        p8, p8, p8, p8, p8, p8, p8, p8,
+        p8, p8, p8, p8, p8, p8, p8, p8,
     );
     /// ARM-specific 128-bit wide vector of eight packed `i16`.
     pub struct int16x8_t(i16, i16, i16, i16, i16, i16, i16, i16);
@@ -56,8 +67,8 @@ types! {
     pub struct uint16x8_t(u16, u16, u16, u16, u16, u16, u16, u16);
     // FIXME: ARM-specific 128-bit wide vector of eight packed `f16`.
     // pub struct float16x8_t(f16, f16, f16, f16, f16, f16, f16);
-    /// ARM-specific 128-bit wide vector of eight packed `u16`.
-    pub struct poly16x8_t(u16, u16, u16, u16, u16, u16, u16, u16);
+    /// ARM-specific 128-bit wide vector of eight packed `p16`.
+    pub struct poly16x8_t(p16, p16, p16, p16, p16, p16, p16, p16);
     /// ARM-specific 128-bit wide vector of four packed `i32`.
     pub struct int32x4_t(i32, i32, i32, i32);
     /// ARM-specific 128-bit wide vector of four packed `u32`.
@@ -68,6 +79,8 @@ types! {
     pub struct int64x2_t(i64, i64);
     /// ARM-specific 128-bit wide vector of two packed `u64`.
     pub struct uint64x2_t(u64, u64);
+    /// ARM-specific 128-bit wide vector of two packed `p64`.
+    pub struct poly64x2_t(p64, p64);
 }

 /// ARM-specific type containing two `int8x8_t` vectors.
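The hunk above only re-labels storage: `p8`, `p16`, and `p64` are plain unsigned aliases, so the `poly*` vectors share the layout of their `uint*` counterparts and differ only in interpretation, with each lane read as a polynomial over GF(2). A minimal standalone sketch of that interpretation, outside the patch (`polymul_p8` is a hypothetical helper, not a NEON intrinsic):

#[allow(non_camel_case_types)]
type p8 = u8; // mirrors the `pub(crate) type p8 = u8;` alias added above

// Carry-less multiplication of two 8-bit GF(2) polynomials: the semantics
// `poly8` lanes carry, and (at 64-bit lane width) what `vmull_p64` computes.
fn polymul_p8(a: p8, b: p8) -> u16 {
    let mut acc: u16 = 0;
    for i in 0..8 {
        if (b >> i) & 1 == 1 {
            acc ^= (a as u16) << i; // XOR, not add: coefficients are mod 2
        }
    }
    acc
}

fn main() {
    // (x^2 + 1) * (x + 1) = x^3 + x^2 + x + 1
    assert_eq!(polymul_p8(0b0101, 0b0011), 0b1111);
}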
@@ -341,10 +354,6 @@ extern "C" {
         d: int8x8_t,
         e: int8x8_t,
     ) -> int8x8_t;
-    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v4f32.p0i8")]
-    fn vld1q_v4f32(addr: *const u8, align: u32) -> float32x4_t;
-    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v4i32.p0i8")]
-    fn vld1q_v4i32(addr: *const u8, align: u32) -> int32x4_t;
     #[link_name = "llvm.arm.neon.vshiftins.v8i8"]
     fn vshiftins_v8i8(a: int8x8_t, b: int8x8_t, shift: int8x8_t) -> int8x8_t;
@@ -362,6 +371,821 @@ extern "C" {
     fn vshiftins_v1i64(a: int64x1_t, b: int64x1_t, shift: int64x1_t) -> int64x1_t;
     #[link_name = "llvm.arm.neon.vshiftins.v2i64"]
     fn vshiftins_v2i64(a: int64x2_t, b: int64x2_t, shift: int64x2_t) -> int64x2_t;
+
+    #[link_name = "llvm.arm.neon.vld1.v8i8.p0i8"]
+    fn vld1_v8i8(addr: *const i8, align: i32) -> int8x8_t;
+    #[link_name = "llvm.arm.neon.vld1.v16i8.p0i8"]
+    fn vld1q_v16i8(addr: *const i8, align: i32) -> int8x16_t;
+    #[link_name = "llvm.arm.neon.vld1.v4i16.p0i8"]
+    fn vld1_v4i16(addr: *const i8, align: i32) -> int16x4_t;
+    #[link_name = "llvm.arm.neon.vld1.v8i16.p0i8"]
+    fn vld1q_v8i16(addr: *const i8, align: i32) -> int16x8_t;
+    #[link_name = "llvm.arm.neon.vld1.v2i32.p0i8"]
+    fn vld1_v2i32(addr: *const i8, align: i32) -> int32x2_t;
+    #[link_name = "llvm.arm.neon.vld1.v4i32.p0i8"]
+    fn vld1q_v4i32(addr: *const i8, align: i32) -> int32x4_t;
+    #[link_name = "llvm.arm.neon.vld1.v1i64.p0i8"]
+    fn vld1_v1i64(addr: *const i8, align: i32) -> int64x1_t;
+    #[link_name = "llvm.arm.neon.vld1.v2i64.p0i8"]
+    fn vld1q_v2i64(addr: *const i8, align: i32) -> int64x2_t;
+    #[link_name = "llvm.arm.neon.vld1.v2f32.p0i8"]
+    fn vld1_v2f32(addr: *const i8, align: i32) -> float32x2_t;
+    #[link_name = "llvm.arm.neon.vld1.v4f32.p0i8"]
+    fn vld1q_v4f32(addr: *const i8, align: i32) -> float32x4_t;
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t {
+    vld1_v8i8(ptr as *const i8, align_of::<i8>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t {
+    vld1q_v16i8(ptr as *const i8, align_of::<i8>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t {
+    vld1_v4i16(ptr as *const i8, align_of::<i16>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t {
+    vld1q_v8i16(ptr as *const i8, align_of::<i16>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t {
+    vld1_v2i32(ptr as *const i8, align_of::<i32>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.32"))]
+pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t {
+    vld1q_v4i32(ptr as *const i8, align_of::<i32>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t {
+    vld1_v1i64(ptr as *const i8, align_of::<i64>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.64"))]
+pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t {
+    vld1q_v2i64(ptr as *const i8, align_of::<i64>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t {
+    transmute(vld1_v8i8(ptr as *const i8, align_of::<u8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t {
+    transmute(vld1q_v16i8(ptr as *const i8, align_of::<u8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t {
+    transmute(vld1_v4i16(ptr as *const i8, align_of::<u16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t {
+    transmute(vld1q_v8i16(ptr as *const i8, align_of::<u16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t {
+    transmute(vld1_v2i32(ptr as *const i8, align_of::<u32>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.32"))]
+pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t {
+    transmute(vld1q_v4i32(ptr as *const i8, align_of::<u32>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t {
+    transmute(vld1_v1i64(ptr as *const i8, align_of::<u64>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.64"))]
+pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t {
+    transmute(vld1q_v2i64(ptr as *const i8, align_of::<u64>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t {
+    transmute(vld1_v8i8(ptr as *const i8, align_of::<p8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.8"))]
+pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t {
+    transmute(vld1q_v16i8(ptr as *const i8, align_of::<p8>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t {
+    transmute(vld1_v4i16(ptr as *const i8, align_of::<p16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.16"))]
+pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t {
+    transmute(vld1q_v8i16(ptr as *const i8, align_of::<p16>() as i32))
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr(vldr))]
+pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t {
+    vld1_v2f32(ptr as *const i8, align_of::<f32>() as i32)
+}
+
+/// Load multiple single-element structures to one, two, three, or four registers.
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(test, assert_instr("vld1.32"))]
+pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t {
+    vld1q_v4f32(ptr as *const i8, align_of::<f32>() as i32)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_args_required_const(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", lane = 7))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 7))]
+pub unsafe fn vld1_lane_s8(ptr: *const i8, src: int8x8_t, lane: i32) -> int8x8_t {
+    assert!(
+        0 <= lane && lane <= 7,
+        "must have 0 ≤ lane ≤ 7, but lane = {}",
+        lane
+    );
+    simd_insert(src, lane as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[rustc_args_required_const(2)]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", lane = 15))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 15))]
+pub unsafe fn vld1q_lane_s8(ptr: *const i8, src: int8x16_t, lane: i32) -> int8x16_t {
+    assert!(
+        0 <= lane && lane <= 15,
+        "must have 0 ≤ lane ≤ 15, but lane = {}",
+        lane
+    );
+    simd_insert(src, lane as u32, *ptr)
+}
+
+/// Load one single-element structure to one lane of one register.
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", lane = 3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 3))] +pub unsafe fn vld1_lane_s16(ptr: *const i16, src: int16x4_t, lane: i32) -> int16x4_t { + assert!( + 0 <= lane && lane <= 3, + "must have 0 ≤ lane ≤ 3, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", lane = 7))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 7))] +pub unsafe fn vld1q_lane_s16(ptr: *const i16, src: int16x8_t, lane: i32) -> int16x8_t { + assert!( + 0 <= lane && lane <= 7, + "must have 0 ≤ lane ≤ 7, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", lane = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 1))] +pub unsafe fn vld1_lane_s32(ptr: *const i32, src: int32x2_t, lane: i32) -> int32x2_t { + assert!( + 0 <= lane && lane <= 1, + "must have 0 ≤ lane ≤ 1, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", lane = 3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 3))] +pub unsafe fn vld1q_lane_s32(ptr: *const i32, src: int32x4_t, lane: i32) -> int32x4_t { + assert!( + 0 <= lane && lane <= 3, + "must have 0 ≤ lane ≤ 3, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", lane = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr, lane = 0))] +pub unsafe fn vld1_lane_s64(ptr: *const i64, src: int64x1_t, lane: i32) -> int64x1_t { + assert!( + 0 <= lane && lane <= 0, + "must have 0 ≤ lane ≤ 0, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", lane = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 1))] +pub unsafe fn vld1q_lane_s64(ptr: *const i64, src: int64x2_t, lane: i32) -> int64x2_t { + assert!( + 0 <= lane && lane <= 1, + "must have 0 ≤ lane ≤ 1, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", lane = 7))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 7))] +pub unsafe fn vld1_lane_u8(ptr: *const u8, src: uint8x8_t, lane: i32) -> uint8x8_t { + assert!( + 0 <= lane && lane <= 7, + "must have 0 ≤ lane ≤ 7, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", lane = 15))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 15))] +pub unsafe fn vld1q_lane_u8(ptr: *const u8, src: uint8x16_t, lane: i32) -> uint8x16_t { + assert!( + 0 <= lane && lane <= 15, + "must have 0 ≤ lane ≤ 15, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", lane = 3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 3))] +pub unsafe fn vld1_lane_u16(ptr: *const u16, src: uint16x4_t, lane: i32) -> uint16x4_t { + assert!( + 0 <= lane && lane <= 3, + "must have 0 ≤ lane ≤ 3, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", lane = 7))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 7))] +pub unsafe fn vld1q_lane_u16(ptr: *const u16, src: uint16x8_t, lane: i32) -> uint16x8_t { + assert!( + 0 <= lane && lane <= 7, + "must have 0 ≤ lane ≤ 7, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", lane = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 1))] +pub unsafe fn vld1_lane_u32(ptr: *const u32, src: uint32x2_t, lane: i32) -> uint32x2_t { + assert!( + 0 <= lane && lane <= 1, + "must have 0 ≤ lane ≤ 1, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", lane = 3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 3))] +pub unsafe fn vld1q_lane_u32(ptr: *const u32, src: uint32x4_t, lane: i32) -> uint32x4_t { + assert!( + 0 <= lane && lane <= 3, + "must have 0 ≤ lane ≤ 3, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", lane = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr, lane = 0))] +pub unsafe fn vld1_lane_u64(ptr: *const u64, src: uint64x1_t, lane: i32) -> uint64x1_t { + assert!( + 0 <= lane && lane <= 0, + "must have 0 ≤ lane ≤ 0, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr", lane = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 1))] +pub unsafe fn vld1q_lane_u64(ptr: *const u64, src: uint64x2_t, lane: i32) -> uint64x2_t { + assert!( + 0 <= lane && lane <= 1, + "must have 0 ≤ lane ≤ 1, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", lane = 7))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 7))] +pub unsafe fn vld1_lane_p8(ptr: *const p8, src: poly8x8_t, lane: i32) -> poly8x8_t { + assert!( + 0 <= lane && lane <= 7, + "must have 0 ≤ lane ≤ 7, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8", lane = 15))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 15))] +pub unsafe fn vld1q_lane_p8(ptr: *const p8, src: poly8x16_t, lane: i32) -> poly8x16_t { + assert!( + 0 <= lane && lane <= 15, + "must have 0 ≤ lane ≤ 15, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", lane = 3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 3))] +pub unsafe fn vld1_lane_p16(ptr: *const p16, src: poly16x4_t, lane: i32) -> poly16x4_t { + assert!( + 0 <= lane && lane <= 3, + "must have 0 ≤ lane ≤ 3, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16", lane = 7))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 7))] +pub unsafe fn vld1q_lane_p16(ptr: *const p16, src: poly16x8_t, lane: i32) -> poly16x8_t { + assert!( + 0 <= lane && lane <= 7, + "must have 0 ≤ lane ≤ 7, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", lane = 1))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 1))] +pub unsafe fn vld1_lane_f32(ptr: *const f32, src: float32x2_t, lane: i32) -> float32x2_t { + assert!( + 0 <= lane && lane <= 1, + "must have 0 ≤ lane ≤ 1, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure to one lane of one register. +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[rustc_args_required_const(2)] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32", lane = 3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1, lane = 3))] +pub unsafe fn vld1q_lane_f32(ptr: *const f32, src: float32x4_t, lane: i32) -> float32x4_t { + assert!( + 0 <= lane && lane <= 3, + "must have 0 ≤ lane ≤ 3, but lane = {}", + lane + ); + simd_insert(src, lane as u32, *ptr) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t { + let x = vld1_lane_s8(ptr, transmute(i8x8::splat(0)), 0); + simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t { + let x = vld1q_lane_s8(ptr, transmute(i8x16::splat(0)), 0); + simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t { + let x = vld1_lane_s16(ptr, transmute(i16x4::splat(0)), 0); + simd_shuffle4(x, x, [0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t { + let x = vld1q_lane_s16(ptr, transmute(i16x8::splat(0)), 0); + simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t { + let x = vld1_lane_s32(ptr, transmute(i32x2::splat(0)), 0); + simd_shuffle2(x, x, [0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_s32(ptr: *const i32) -> int32x4_t { + let x = vld1q_lane_s32(ptr, transmute(i32x4::splat(0)), 0); + simd_shuffle4(x, x, [0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))] +pub unsafe fn vld1_dup_s64(ptr: *const i64) -> int64x1_t { + #[cfg(target_arch = "aarch64")] + use crate::core_arch::aarch64::vld1_s64; + vld1_s64(ptr) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t { + let x = vld1q_lane_s64(ptr, transmute(i64x2::splat(0)), 0); + simd_shuffle2(x, x, [0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t { + let x = vld1_lane_u8(ptr, transmute(u8x8::splat(0)), 0); + simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t { + let x = vld1q_lane_u8(ptr, transmute(u8x16::splat(0)), 0); + simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t { + let x = vld1_lane_u16(ptr, transmute(u16x4::splat(0)), 0); + simd_shuffle4(x, x, [0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t { + let x = vld1q_lane_u16(ptr, transmute(u16x8::splat(0)), 0); + simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t { + let x = vld1_lane_u32(ptr, transmute(u32x2::splat(0)), 0); + simd_shuffle2(x, x, [0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_u32(ptr: *const u32) -> uint32x4_t { + let x = vld1q_lane_u32(ptr, transmute(u32x4::splat(0)), 0); + simd_shuffle4(x, x, [0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ldr))] +pub unsafe fn vld1_dup_u64(ptr: *const u64) -> uint64x1_t { + #[cfg(target_arch = "aarch64")] + use crate::core_arch::aarch64::vld1_u64; + vld1_u64(ptr) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). 
+#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vldr"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t { + let x = vld1q_lane_u64(ptr, transmute(u64x2::splat(0)), 0); + simd_shuffle2(x, x, [0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t { + let x = vld1_lane_p8(ptr, transmute(u8x8::splat(0)), 0); + simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.8"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t { + let x = vld1q_lane_p8(ptr, transmute(u8x16::splat(0)), 0); + simd_shuffle16(x, x, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t { + let x = vld1_lane_p16(ptr, transmute(u16x4::splat(0)), 0); + simd_shuffle4(x, x, [0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.16"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t { + let x = vld1q_lane_p16(ptr, transmute(u16x8::splat(0)), 0); + simd_shuffle8(x, x, [0, 0, 0, 0, 0, 0, 0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t { + let x = vld1_lane_f32(ptr, transmute(f32x2::splat(0.)), 0); + simd_shuffle2(x, x, [0, 0]) +} + +/// Load one single-element structure and Replicate to all lanes (of one register). +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] +pub unsafe fn vld1q_dup_f32(ptr: *const f32) -> float32x4_t { + let x = vld1q_lane_f32(ptr, transmute(f32x4::splat(0.)), 0); + simd_shuffle4(x, x, [0, 0, 0, 0]) } /// Absolute value (wrapping). 
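Every `vld1*_dup_*` body above follows the same two-step construction: load the scalar into lane 0 of a zeroed vector via the corresponding `vld1*_lane_*`, then broadcast lane 0 with an all-zeros shuffle mask, which the `assert_instr` lines expect to fold into a single `ld1r`/`vld1`. A scalar model of that construction, outside the patch (`Vec4`, `simd_insert`, and `simd_shuffle4` here are illustrative stand-ins for the platform-intrinsics, not the real ones):

#[derive(Clone, Copy, PartialEq, Debug)]
struct Vec4([f32; 4]);

// Stand-in for the `simd_insert` platform-intrinsic: replace one lane.
fn simd_insert(mut v: Vec4, lane: usize, x: f32) -> Vec4 {
    v.0[lane] = x;
    v
}

// Stand-in for `simd_shuffle4`: indices 0..=3 pick lanes of `a`, 4..=7 of `b`.
fn simd_shuffle4(a: Vec4, b: Vec4, idx: [usize; 4]) -> Vec4 {
    let pick = |i: usize| if i < 4 { a.0[i] } else { b.0[i - 4] };
    Vec4([pick(idx[0]), pick(idx[1]), pick(idx[2]), pick(idx[3])])
}

fn vld1q_dup_f32_model(ptr: &f32) -> Vec4 {
    // Step 1: vld1q_lane_f32(ptr, zeroed, 0): scalar load into lane 0.
    let x = simd_insert(Vec4([0.0; 4]), 0, *ptr);
    // Step 2: mask [0, 0, 0, 0] replicates lane 0 to all lanes.
    simd_shuffle4(x, x, [0, 0, 0, 0])
}

fn main() {
    assert_eq!(vld1q_dup_f32_model(&2.5), Vec4([2.5; 4]));
}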
@@ -2686,70 +3510,6 @@ pub unsafe fn vextq_u8(a: uint8x16_t, b: uint8x16_t, n: i32) -> uint8x16_t { } } -/// Load multiple single-element structures to one, two, three, or four registers -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(test, assert_instr(ldr))] -// even gcc compiles this to ldr: https://clang.godbolt.org/z/1bvH2x -// #[cfg_attr(test, assert_instr(ld1))] -pub unsafe fn vld1q_s8(addr: *const i8) -> int8x16_t { - ptr::read(addr as *const int8x16_t) -} - -/// Load multiple single-element structures to one, two, three, or four registers -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(test, assert_instr(ldr))] -// even gcc compiles this to ldr: https://clang.godbolt.org/z/1bvH2x -// #[cfg_attr(test, assert_instr(ld1))] -pub unsafe fn vld1q_u8(addr: *const u8) -> uint8x16_t { - ptr::read(addr as *const uint8x16_t) -} - -/// Load multiple single-element structures to one, two, three, or four registers -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon")] -#[target_feature(enable = "v7")] -#[cfg_attr(test, assert_instr("vld1.32"))] -pub unsafe fn vld1q_s32(addr: *const i32) -> int32x4_t { - vld1q_v4i32(addr as *const u8, 4) -} - -/// Load multiple single-element structures to one, two, three, or four registers -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon")] -#[target_feature(enable = "v7")] -#[cfg_attr(test, assert_instr("vld1.32"))] -pub unsafe fn vld1q_u32(addr: *const u32) -> uint32x4_t { - transmute(vld1q_v4i32(addr as *const u8, 4)) -} - -/// Load multiple single-element structures to one, two, three, or four registers -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon")] -#[target_feature(enable = "v7")] -#[cfg_attr(test, assert_instr("vld1.32"))] -pub unsafe fn vld1q_f32(addr: *const f32) -> float32x4_t { - vld1q_v4f32(addr as *const u8, 4) -} - -/// Load one single-element structure and Replicate to all lanes (of one register). -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr("vld1.32"))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld1r))] -pub unsafe fn vld1q_dup_f32(addr: *const f32) -> float32x4_t { - use crate::core_arch::simd::f32x4; - let v = *addr; - transmute(f32x4::new(v, v, v, v)) -} - // These float-to-int implementations have undefined behaviour when `a` overflows // the destination type. 
Clang has the same problem: https://llvm.org/PR47510 @@ -2760,7 +3520,6 @@ pub unsafe fn vld1q_dup_f32(addr: *const f32) -> float32x4_t { #[target_feature(enable = "v7")] #[cfg_attr(test, assert_instr("vcvt.s32.f32"))] pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t { - use crate::core_arch::simd::{f32x4, i32x4}; transmute(simd_cast::<_, i32x4>(transmute::<_, f32x4>(a))) } @@ -2771,7 +3530,6 @@ pub unsafe fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t { #[target_feature(enable = "v7")] #[cfg_attr(test, assert_instr("vcvt.u32.f32"))] pub unsafe fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t { - use crate::core_arch::simd::{f32x4, u32x4}; transmute(simd_cast::<_, u32x4>(transmute::<_, f32x4>(a))) } @@ -3385,60 +4143,383 @@ mod tests { use stdarch_test::simd_test; #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s8() { - let a = i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let e = a; - let r: i8x16 = transmute(vld1q_s8(transmute(&a))); - assert_eq!(r, e); + unsafe fn test_vld1_lane_s8() { + let a = i8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: i8 = 42; + let e = i8x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: i8x8 = transmute(vld1_lane_s8(&elem, transmute(a), 7)); + assert_eq!(r, e) } #[simd_test(enable = "neon")] - unsafe fn test_vld1q_u8() { - let a = u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); - let e = a; - let r: u8x16 = transmute(vld1q_u8(transmute(&a))); - assert_eq!(r, e); + unsafe fn test_vld1q_lane_s8() { + let a = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let elem: i8 = 42; + let e = i8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 42); + let r: i8x16 = transmute(vld1q_lane_s8(&elem, transmute(a), 15)); + assert_eq!(r, e) } - #[cfg(target_arch = "arm")] #[simd_test(enable = "neon")] - unsafe fn test_vld1q_f32() { - let e = f32x4::new(1., 2., 3., 4.); - let f = [0., 1., 2., 3., 4.]; - // do a load that has 4 byte alignment to make sure we're not - // over aligning it - let r: f32x4 = transmute(vld1q_f32(f[1..].as_ptr())); - assert_eq!(r, e); + unsafe fn test_vld1_lane_s16() { + let a = i16x4::new(0, 1, 2, 3); + let elem: i16 = 42; + let e = i16x4::new(0, 1, 2, 42); + let r: i16x4 = transmute(vld1_lane_s16(&elem, transmute(a), 3)); + assert_eq!(r, e) } - #[cfg(target_arch = "arm")] #[simd_test(enable = "neon")] - unsafe fn test_vld1q_s32() { - let e = i32x4::new(1, 2, 3, 4); - let f = [0, 1, 2, 3, 4]; - // do a load that has 4 byte alignment to make sure we're not - // over aligning it - let r: i32x4 = transmute(vld1q_s32(f[1..].as_ptr())); - assert_eq!(r, e); + unsafe fn test_vld1q_lane_s16() { + let a = i16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: i16 = 42; + let e = i16x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: i16x8 = transmute(vld1q_lane_s16(&elem, transmute(a), 7)); + assert_eq!(r, e) } - #[cfg(target_arch = "arm")] #[simd_test(enable = "neon")] - unsafe fn test_vld1q_u32() { - let e = u32x4::new(1, 2, 3, 4); - let f = [0, 1, 2, 3, 4]; - // do a load that has 4 byte alignment to make sure we're not - // over aligning it - let r: u32x4 = transmute(vld1q_u32(f[1..].as_ptr())); - assert_eq!(r, e); + unsafe fn test_vld1_lane_s32() { + let a = i32x2::new(0, 1); + let elem: i32 = 42; + let e = i32x2::new(0, 42); + let r: i32x2 = transmute(vld1_lane_s32(&elem, transmute(a), 1)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_s32() { + let a = i32x4::new(0, 1, 2, 3); + let elem: i32 = 42; + let e = i32x4::new(0, 1, 2, 42); + let r: i32x4 = 
transmute(vld1q_lane_s32(&elem, transmute(a), 3)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_s64() { + let a = i64x1::new(0); + let elem: i64 = 42; + let e = i64x1::new(42); + let r: i64x1 = transmute(vld1_lane_s64(&elem, transmute(a), 0)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_s64() { + let a = i64x2::new(0, 1); + let elem: i64 = 42; + let e = i64x2::new(0, 42); + let r: i64x2 = transmute(vld1q_lane_s64(&elem, transmute(a), 1)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_u8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: u8 = 42; + let e = u8x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: u8x8 = transmute(vld1_lane_u8(&elem, transmute(a), 7)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_u8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let elem: u8 = 42; + let e = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 42); + let r: u8x16 = transmute(vld1q_lane_u8(&elem, transmute(a), 15)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_u16() { + let a = u16x4::new(0, 1, 2, 3); + let elem: u16 = 42; + let e = u16x4::new(0, 1, 2, 42); + let r: u16x4 = transmute(vld1_lane_u16(&elem, transmute(a), 3)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_u16() { + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: u16 = 42; + let e = u16x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: u16x8 = transmute(vld1q_lane_u16(&elem, transmute(a), 7)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_u32() { + let a = u32x2::new(0, 1); + let elem: u32 = 42; + let e = u32x2::new(0, 42); + let r: u32x2 = transmute(vld1_lane_u32(&elem, transmute(a), 1)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_u32() { + let a = u32x4::new(0, 1, 2, 3); + let elem: u32 = 42; + let e = u32x4::new(0, 1, 2, 42); + let r: u32x4 = transmute(vld1q_lane_u32(&elem, transmute(a), 3)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_u64() { + let a = u64x1::new(0); + let elem: u64 = 42; + let e = u64x1::new(42); + let r: u64x1 = transmute(vld1_lane_u64(&elem, transmute(a), 0)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_u64() { + let a = u64x2::new(0, 1); + let elem: u64 = 42; + let e = u64x2::new(0, 42); + let r: u64x2 = transmute(vld1q_lane_u64(&elem, transmute(a), 1)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_p8() { + let a = u8x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: p8 = 42; + let e = u8x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: u8x8 = transmute(vld1_lane_p8(&elem, transmute(a), 7)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_p8() { + let a = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let elem: p8 = 42; + let e = u8x16::new(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 42); + let r: u8x16 = transmute(vld1q_lane_p8(&elem, transmute(a), 15)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_p16() { + let a = u16x4::new(0, 1, 2, 3); + let elem: p16 = 42; + let e = u16x4::new(0, 1, 2, 42); + let r: u16x4 = transmute(vld1_lane_p16(&elem, transmute(a), 3)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_p16() 
{ + let a = u16x8::new(0, 1, 2, 3, 4, 5, 6, 7); + let elem: p16 = 42; + let e = u16x8::new(0, 1, 2, 3, 4, 5, 6, 42); + let r: u16x8 = transmute(vld1q_lane_p16(&elem, transmute(a), 7)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_lane_f32() { + let a = f32x2::new(0., 1.); + let elem: f32 = 42.; + let e = f32x2::new(0., 42.); + let r: f32x2 = transmute(vld1_lane_f32(&elem, transmute(a), 1)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_lane_f32() { + let a = f32x4::new(0., 1., 2., 3.); + let elem: f32 = 42.; + let e = f32x4::new(0., 1., 2., 42.); + let r: f32x4 = transmute(vld1q_lane_f32(&elem, transmute(a), 3)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_s8() { + let elem: i8 = 42; + let e = i8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: i8x8 = transmute(vld1_dup_s8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_s8() { + let elem: i8 = 42; + let e = i8x16::new( + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + ); + let r: i8x16 = transmute(vld1q_dup_s8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_s16() { + let elem: i16 = 42; + let e = i16x4::new(42, 42, 42, 42); + let r: i16x4 = transmute(vld1_dup_s16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_s16() { + let elem: i16 = 42; + let e = i16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: i16x8 = transmute(vld1q_dup_s16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_s32() { + let elem: i32 = 42; + let e = i32x2::new(42, 42); + let r: i32x2 = transmute(vld1_dup_s32(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_s32() { + let elem: i32 = 42; + let e = i32x4::new(42, 42, 42, 42); + let r: i32x4 = transmute(vld1q_dup_s32(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_s64() { + let elem: i64 = 42; + let e = i64x1::new(42); + let r: i64x1 = transmute(vld1_dup_s64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_s64() { + let elem: i64 = 42; + let e = i64x2::new(42, 42); + let r: i64x2 = transmute(vld1q_dup_s64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_u8() { + let elem: u8 = 42; + let e = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: u8x8 = transmute(vld1_dup_u8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_u8() { + let elem: u8 = 42; + let e = u8x16::new( + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + ); + let r: u8x16 = transmute(vld1q_dup_u8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_u16() { + let elem: u16 = 42; + let e = u16x4::new(42, 42, 42, 42); + let r: u16x4 = transmute(vld1_dup_u16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_u16() { + let elem: u16 = 42; + let e = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: u16x8 = transmute(vld1q_dup_u16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_u32() { + let elem: u32 = 42; + let e = u32x2::new(42, 42); + let r: u32x2 = transmute(vld1_dup_u32(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_u32() { + let elem: u32 = 42; + let e = 
u32x4::new(42, 42, 42, 42); + let r: u32x4 = transmute(vld1q_dup_u32(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_u64() { + let elem: u64 = 42; + let e = u64x1::new(42); + let r: u64x1 = transmute(vld1_dup_u64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_u64() { + let elem: u64 = 42; + let e = u64x2::new(42, 42); + let r: u64x2 = transmute(vld1q_dup_u64(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_p8() { + let elem: p8 = 42; + let e = u8x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: u8x8 = transmute(vld1_dup_p8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_p8() { + let elem: p8 = 42; + let e = u8x16::new( + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + ); + let r: u8x16 = transmute(vld1q_dup_p8(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_p16() { + let elem: p16 = 42; + let e = u16x4::new(42, 42, 42, 42); + let r: u16x4 = transmute(vld1_dup_p16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_dup_p16() { + let elem: p16 = 42; + let e = u16x8::new(42, 42, 42, 42, 42, 42, 42, 42); + let r: u16x8 = transmute(vld1q_dup_p16(&elem)); + assert_eq!(r, e) + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_dup_f32() { + let elem: f32 = 42.; + let e = f32x2::new(42., 42.); + let r: f32x2 = transmute(vld1_dup_f32(&elem)); + assert_eq!(r, e) } #[simd_test(enable = "neon")] unsafe fn test_vld1q_dup_f32() { - let e = f32x4::new(1., 1., 1., 1.); - let f = [1., 2., 3., 4.]; - let r: f32x4 = transmute(vld1q_dup_f32(f.as_ptr())); - assert_eq!(r, e); + let elem: f32 = 42.; + let e = f32x4::new(42., 42., 42., 42.); + let r: f32x4 = transmute(vld1q_dup_f32(&elem)); + assert_eq!(r, e) } #[cfg(target_arch = "arm")] @@ -6710,3 +7791,6 @@ mod table_lookup_tests; #[cfg(all(test, target_arch = "arm"))] mod shift_and_insert_tests; + +#[cfg(all(test, target_arch = "arm"))] +mod load_tests; diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs index 344adfe59a..de1482f322 100644 --- a/crates/core_arch/src/simd.rs +++ b/crates/core_arch/src/simd.rs @@ -133,6 +133,7 @@ simd_ty!(i32x2[i32]: i32, i32 | x0, x1); simd_ty!(i64x1[i64]: i64 | x1); simd_ty!(f32x2[f32]: f32, f32 | x0, x1); +simd_ty!(f64x1[f64]: f64 | x1); // 128-bit wide types: diff --git a/crates/stdarch-verify/src/lib.rs b/crates/stdarch-verify/src/lib.rs index 1682654d95..412e7f6c8b 100644 --- a/crates/stdarch-verify/src/lib.rs +++ b/crates/stdarch-verify/src/lib.rs @@ -164,6 +164,8 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "u64" => quote! { &U64 }, "u128" => quote! { &U128 }, "u8" => quote! { &U8 }, + "p8" => quote! { &P8 }, + "p16" => quote! { &P16 }, "Ordering" => quote! { &ORDERING }, "CpuidResult" => quote! { &CPUID }, @@ -209,13 +211,13 @@ fn to_type(t: &syn::Type) -> proc_macro2::TokenStream { "poly8x16x2_t" => quote! { &POLY8X16X2 }, "poly8x16x3_t" => quote! { &POLY8X16X3 }, "poly8x16x4_t" => quote! { &POLY8X16X4 }, - "poly64_t" => quote! { &P64 }, + "p64" => quote! { &P64 }, "poly64x1_t" => quote! { &POLY64X1 }, "poly64x2_t" => quote! { &POLY64X2 }, "poly8x16_t" => quote! { &POLY8X16 }, "poly16x4_t" => quote! { &POLY16X4 }, "poly16x8_t" => quote! { &POLY16X8 }, - "poly128_t" => quote! { &P128 }, + "p128" => quote! { &P128 }, "v16i8" => quote! { &v16i8 }, "v8i16" => quote! { &v8i16 },