-
Notifications
You must be signed in to change notification settings - Fork 299
Implement SSE _mm_load* instructions #99
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
9a38a18
71ce86b
97e963a
26ac082
ef12517
a487ad1
1f8490a
c4798eb
0d9b9e5
ab46c33
11d4470
b28a542
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,9 @@ | ||
use simd_llvm::simd_shuffle4; | ||
use v128::*; | ||
use v64::f32x2; | ||
use std::os::raw::c_void; | ||
use std::mem; | ||
use std::ptr; | ||
|
||
#[cfg(test)] | ||
use stdsimd_test::assert_instr; | ||
|
@@ -268,6 +271,190 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 { | |
movmskps(a) | ||
} | ||
|
||
/// Set the upper two single-precision floating-point values with 64 bits of | ||
/// data loaded from the address `p`; the lower two values are passed through | ||
/// from `a`. | ||
/// | ||
/// This corresponds to the `MOVHPS` / `MOVHPD` / `VMOVHPD` instructions. | ||
/// | ||
/// ```rust | ||
/// # #![feature(cfg_target_feature)] | ||
/// # #![feature(target_feature)] | ||
/// # | ||
/// # #[macro_use] extern crate stdsimd; | ||
/// # | ||
/// # // The real main function | ||
/// # fn main() { | ||
/// # if cfg_feature_enabled!("sse") { | ||
/// # #[target_feature = "+sse"] | ||
/// # fn worker() { | ||
/// # | ||
/// # use stdsimd::simd::f32x4; | ||
/// # use stdsimd::vendor::_mm_loadh_pi; | ||
/// # | ||
/// let a = f32x4::new(1.0, 2.0, 3.0, 4.0); | ||
/// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0]; | ||
/// | ||
/// let r = unsafe { _mm_loadh_pi(a, data[..].as_ptr()) }; | ||
/// | ||
/// assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0)); | ||
/// # | ||
/// # } | ||
/// # worker(); | ||
/// # } | ||
/// # } | ||
/// ``` | ||
#[inline(always)] | ||
#[target_feature = "+sse"] | ||
// TODO: generates MOVHPD if the CPU supports SSE2. | ||
// #[cfg_attr(test, assert_instr(movhps))] | ||
#[cfg_attr(test, assert_instr(movhpd))] | ||
// TODO: This function is actually not limited to floats, but that's what | ||
// what matches the C type most closely: (__m128, *const __m64) -> __m128 | ||
pub unsafe fn _mm_loadh_pi(a: f32x4, p: *const f32) -> f32x4 { | ||
let q = p as *const f32x2; | ||
let b: f32x2 = *q; | ||
let bb = simd_shuffle4(b, b, [0, 1, 0, 1]); | ||
simd_shuffle4(a, bb, [0, 1, 4, 5]) | ||
} | ||
|
||
/// Load two floats from `p` into the lower half of a `f32x4`. The upper half | ||
/// is copied from the upper half of `a`. | ||
/// | ||
/// This corresponds to the `MOVLPS` / `MOVLDP` / `VMOVLDP` instructions. | ||
/// | ||
/// ```rust | ||
/// # #![feature(cfg_target_feature)] | ||
/// # #![feature(target_feature)] | ||
/// # | ||
/// # #[macro_use] extern crate stdsimd; | ||
/// # | ||
/// # // The real main function | ||
/// # fn main() { | ||
/// # if cfg_feature_enabled!("sse") { | ||
/// # #[target_feature = "+sse"] | ||
/// # fn worker() { | ||
/// # | ||
/// # use stdsimd::simd::f32x4; | ||
/// # use stdsimd::vendor::_mm_loadl_pi; | ||
/// # | ||
/// let a = f32x4::new(1.0, 2.0, 3.0, 4.0); | ||
/// let data: [f32; 4] = [5.0, 6.0, 7.0, 8.0]; | ||
/// | ||
/// let r = unsafe { _mm_loadl_pi(a, data[..].as_ptr()) }; | ||
/// | ||
/// assert_eq!(r, f32x4::new(5.0, 6.0, 3.0, 4.0)); | ||
/// # | ||
/// # } | ||
/// # worker(); | ||
/// # } | ||
/// # } | ||
/// ``` | ||
#[inline(always)] | ||
#[target_feature = "+sse"] | ||
// TODO: generates MOVLPD if the CPU supports SSE2. | ||
// #[cfg_attr(test, assert_instr(movlps))] | ||
#[cfg_attr(test, assert_instr(movlpd))] | ||
// TODO: Like _mm_loadh_pi, this also isn't limited to floats. | ||
pub unsafe fn _mm_loadl_pi(a: f32x4, p: *const f32) -> f32x4 { | ||
let q = p as *const f32x2; | ||
let b: f32x2 = *q; | ||
let bb = simd_shuffle4(b, b, [0, 1, 0, 1]); | ||
simd_shuffle4(a, bb, [4, 5, 2, 3]) | ||
} | ||
|
||
/// Construct a `f32x4` with the lowest element read from `p` and the other | ||
/// elements set to zero. | ||
/// | ||
/// This corresponds to instructions `VMOVSS` / `MOVSS`. | ||
#[inline(always)] | ||
#[target_feature = "+sse"] | ||
#[cfg_attr(test, assert_instr(movss))] | ||
pub unsafe fn _mm_load_ss(p: *const f32) -> f32x4 { | ||
f32x4::new(*p, 0.0, 0.0, 0.0) | ||
} | ||
|
||
/// Construct a `f32x4` by duplicating the value read from `p` into all | ||
/// elements. | ||
/// | ||
/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some | ||
/// shuffling. | ||
#[inline(always)] | ||
#[target_feature = "+sse"] | ||
#[cfg_attr(test, assert_instr(movss))] | ||
pub unsafe fn _mm_load1_ps(p: *const f32) -> f32x4 { | ||
let a = *p; | ||
f32x4::new(a, a, a, a) | ||
} | ||
|
||
/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) | ||
#[inline(always)] | ||
#[target_feature = "+sse"] | ||
#[cfg_attr(test, assert_instr(movss))] | ||
pub unsafe fn _mm_load_ps1(p: *const f32) -> f32x4 { | ||
_mm_load1_ps(p) | ||
} | ||
|
||
/// Load four `f32` values from *aligned* memory into a `f32x4`. If the pointer | ||
/// is not aligned to a 128-bit boundary (16 bytes) a general protection fault | ||
/// will be triggered (fatal program crash). | ||
/// | ||
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned memory. | ||
/// | ||
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`. | ||
#[inline(always)] | ||
#[target_feature = "+sse"] | ||
#[cfg_attr(test, assert_instr(movaps))] | ||
pub unsafe fn _mm_load_ps(p: *const f32) -> f32x4 { | ||
*(p as *const f32x4) | ||
} | ||
|
||
/// Load four `f32` values from memory into a `f32x4`. There are no restrictions | ||
/// on memory alignment. For aligned memory [`_mm_load_ps`](fn._mm_load_ps.html) | ||
/// may be faster. | ||
/// | ||
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`. | ||
#[inline(always)] | ||
#[target_feature = "+sse"] | ||
#[cfg_attr(test, assert_instr(movups))] | ||
pub unsafe fn _mm_loadu_ps(p: *const f32) -> f32x4 { | ||
// TODO: This also seems to generate the same code. Don't know which one | ||
// will behave better when inlined into other code. | ||
// f32x4::new(*p, *p.offset(1), *p.offset(2), *p.offset(3)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that if there's no restrictions on alignment here that the implementation below is the one we'll have to go with, AFAIK a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, OK, then I'll remove this TODO. |
||
let mut dst = mem::uninitialized(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this may be causing the faults seen on Travis, I had to use |
||
ptr::copy_nonoverlapping( | ||
p as *const u8, | ||
&mut dst as *mut f32x4 as *mut u8, | ||
mem::size_of::<f32x4>()); | ||
dst | ||
} | ||
|
||
/// Load four `f32` values from aligned memory into a `f32x4` in reverse order. | ||
/// | ||
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general | ||
/// protection fault will be triggered (fatal program crash). | ||
/// | ||
/// Functionally equivalent to the following code sequence (assuming `p` | ||
/// satisfies the alignment restrictions): | ||
/// | ||
/// ```text | ||
/// let a0 = *p; | ||
/// let a1 = *p.offset(1); | ||
/// let a2 = *p.offset(2); | ||
/// let a3 = *p.offset(3); | ||
/// f32x4::new(a3, a2, a1, a0) | ||
/// ``` | ||
/// | ||
/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some | ||
/// shuffling. | ||
#[inline(always)] | ||
#[target_feature = "+sse"] | ||
#[cfg_attr(test, assert_instr(movaps))] | ||
pub unsafe fn _mm_loadr_ps(p: *const f32) -> f32x4 { | ||
let a = _mm_load_ps(p); | ||
simd_shuffle4(a, a, [3, 2, 1, 0]) | ||
} | ||
|
||
/// Perform a serializing operation on all store-to-memory instructions that | ||
/// were issued prior to this instruction. | ||
/// | ||
|
@@ -829,6 +1016,88 @@ mod tests { | |
assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0)); | ||
} | ||
|
||
#[simd_test = "sse"] | ||
unsafe fn _mm_loadh_pi() { | ||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0); | ||
let x: [f32; 4] = [5.0, 6.0, 7.0, 8.0]; | ||
let p = x[..].as_ptr(); | ||
let r = sse::_mm_loadh_pi(a, p); | ||
assert_eq!(r, f32x4::new(1.0, 2.0, 5.0, 6.0)); | ||
} | ||
|
||
#[simd_test = "sse"] | ||
unsafe fn _mm_loadl_pi() { | ||
let a = f32x4::new(1.0, 2.0, 3.0, 4.0); | ||
let x: [f32; 4] = [5.0, 6.0, 7.0, 8.0]; | ||
let p = x[..].as_ptr(); | ||
let r = sse::_mm_loadl_pi(a, p); | ||
assert_eq!(r, f32x4::new(5.0, 6.0, 3.0, 4.0)); | ||
} | ||
|
||
#[simd_test = "sse"] | ||
unsafe fn _mm_load_ss() { | ||
let a = 42.0f32; | ||
let r = sse::_mm_load_ss(&a as *const f32); | ||
assert_eq!(r, f32x4::new(42.0, 0.0, 0.0, 0.0)); | ||
} | ||
|
||
#[simd_test = "sse"] | ||
unsafe fn _mm_load1_ps() { | ||
let a = 42.0f32; | ||
let r = sse::_mm_load1_ps(&a as *const f32); | ||
assert_eq!(r, f32x4::new(42.0, 42.0, 42.0, 42.0)); | ||
} | ||
|
||
#[simd_test = "sse"] | ||
unsafe fn _mm_load_ps() { | ||
let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; | ||
|
||
let mut p = vals.as_ptr(); | ||
let mut fixup = 0.0f32; | ||
|
||
// Make sure p is aligned, otherwise we might get a | ||
// (signal: 11, SIGSEGV: invalid memory reference) | ||
|
||
let unalignment = (p as usize) & 0xf; | ||
if unalignment != 0 { | ||
let delta = ((16 - unalignment) >> 2) as isize; | ||
fixup = delta as f32; | ||
p = p.offset(delta); | ||
} | ||
|
||
let r = sse::_mm_load_ps(p); | ||
assert_eq!(r, f32x4::new(1.0, 2.0, 3.0, 4.0) + f32x4::splat(fixup)); | ||
} | ||
|
||
#[simd_test = "sse"] | ||
unsafe fn _mm_loadu_ps() { | ||
let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; | ||
let p = vals.as_ptr().offset(3); | ||
let r = sse::_mm_loadu_ps(black_box(p)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think another case of a possible crash here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, the failing tests are due to different instruction selection on i586 vs i686 for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oops misread the logs! You can conditionalize for i586/i686 with |
||
assert_eq!(r, f32x4::new(4.0, 5.0, 6.0, 7.0)); | ||
} | ||
|
||
#[simd_test = "sse"] | ||
unsafe fn _mm_loadr_ps() { | ||
let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; | ||
|
||
let mut p = vals.as_ptr(); | ||
let mut fixup = 0.0f32; | ||
|
||
// Make sure p is aligned, otherwise we might get a | ||
// (signal: 11, SIGSEGV: invalid memory reference) | ||
|
||
let unalignment = (p as usize) & 0xf; | ||
if unalignment != 0 { | ||
let delta = ((16 - unalignment) >> 2) as isize; | ||
fixup = delta as f32; | ||
p = p.offset(delta); | ||
} | ||
|
||
let r = sse::_mm_loadr_ps(p); | ||
assert_eq!(r, f32x4::new(4.0, 3.0, 2.0, 1.0) + f32x4::splat(fixup)); | ||
} | ||
|
||
#[simd_test = "sse"] | ||
unsafe fn _mm_movemask_ps() { | ||
let r = sse::_mm_movemask_ps(f32x4::new(-1.0, 5.0, -5.0, 0.0)); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So this actually got me thinking! I remembered that x86_64 is defined with SSE2 support by default, which may explain this function on that target. It turns out as well, however, that i686 also has SSE2 turned on by default!
I think to assert this instruction we'd have to use the i586 target (which has sse/sse2 disabled), but in the meantime this is probably ok.