diff --git a/TODO.md b/TODO.md index 2764cd8d97..1373340a72 100644 --- a/TODO.md +++ b/TODO.md @@ -540,9 +540,9 @@ sse4.2 avx --- -* [ ] `_mm256_add_pd` -* [ ] `_mm256_add_ps` -* [ ] `_mm256_addsub_pd` +* [x] `_mm256_add_pd` +* [x] `_mm256_add_ps` +* [x] `_mm256_addsub_pd` * [ ] `_mm256_addsub_ps` * [ ] `_mm256_and_pd` * [ ] `_mm256_and_ps` diff --git a/src/x86/avx.rs b/src/x86/avx.rs new file mode 100644 index 0000000000..6ec764c377 --- /dev/null +++ b/src/x86/avx.rs @@ -0,0 +1,71 @@ +use v256::*; + +/// Add packed double-precision (64-bit) floating-point elements +/// in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +pub fn _mm256_add_pd(a: f64x4, b: f64x4) -> f64x4 { + a + b +} + +/// Add packed single-precision (32-bit) floating-point elements in `a` and `b`. +#[inline(always)] +#[target_feature = "+avx"] +pub fn _mm256_add_ps(a: f32x8, b: f32x8) -> f32x8 { + a + b +} + +/// Alternatively add and subtract packed double-precision (64-bit) +/// floating-point elements in `a` to/from packed elements in `b`. +#[inline(always)] +#[target_feature = "+avx"] +pub fn _mm256_addsub_pd(a: f64x4, b: f64x4) -> f64x4 { + unsafe { addsubpd256(a, b) } +} + + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.avx.addsub.pd.256"] + fn addsubpd256(a: f64x4, b:f64x4) -> f64x4; +} + + +#[cfg(test)] +mod tests { + use v256::*; + use x86::avx; + + #[test] + #[target_feature = "+avx"] + fn _mm256_add_pd() { + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x4::new(5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_add_pd(a, b); + let e = f64x4::new(6.0, 8.0, 10.0, 12.0); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx"] + fn _mm256_add_ps() { + let a = f32x8::new(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0); + let b = f32x8::new(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0); + let r = avx::_mm256_add_ps(a, b); + let e = f32x8::new(10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0); + assert_eq!(r, e); + } + + #[test] + #[target_feature = "+avx"] + fn _mm256_addsub_pd() { + let a = f64x4::new(1.0, 2.0, 3.0, 4.0); + let b = f64x4::new(5.0, 6.0, 7.0, 8.0); + let r = avx::_mm256_addsub_pd(a, b); + let e = f64x4::new(-4.0,8.0,-4.0,12.0); + assert_eq!(r, e); + } + + + +} \ No newline at end of file diff --git a/src/x86/avx2.rs b/src/x86/avx2.rs index 7ec508231d..2840b0144f 100644 --- a/src/x86/avx2.rs +++ b/src/x86/avx2.rs @@ -568,7 +568,7 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); let b = i8x32::new( - 31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 31, 30, 2, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); let r = avx2::_mm256_cmpeq_epi8(a, b); assert_eq!(r, i8x32::splat(0).replace(2,0xFFu8 as i8)); @@ -641,7 +641,7 @@ mod tests { let b = i16x16::splat(4); let r = avx2::_mm256_hadd_epi16(a, b); let e = i16x16::new(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -651,7 +651,7 @@ mod tests { let b = i32x8::splat(4); let r = avx2::_mm256_hadd_epi32(a, b); let e = i32x8::new(4, 4, 8, 8, 4, 4, 8, 8); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -662,7 +662,7 @@ mod tests { let r = avx2::_mm256_hadds_epi16(a, b); let e = i16x16::new( 0x7FFF, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -672,7 +672,7 @@ mod tests { let b = i16x16::splat(4); let r = avx2::_mm256_hsub_epi16(a, b); let e = i16x16::splat(0); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -682,7 +682,7 @@ mod tests { let b = i32x8::splat(4); let r = avx2::_mm256_hsub_epi32(a, b); let e = i32x8::splat(0); - assert_eq!(r,e); + assert_eq!(r, e); } #[test] @@ -692,8 +692,7 @@ mod tests { let b = i16x16::splat(4); let r = avx2::_mm256_hsubs_epi16(a, b); let e = i16x16::splat(0).replace(0,0x7FFF); - assert_eq!(r,e); + assert_eq!(r, e); } - } diff --git a/src/x86/mod.rs b/src/x86/mod.rs index d36fa4444d..839b531556 100644 --- a/src/x86/mod.rs +++ b/src/x86/mod.rs @@ -2,6 +2,7 @@ pub use self::sse::*; pub use self::sse2::*; pub use self::ssse3::*; pub use self::sse42::*; +pub use self::avx::*; pub use self::avx2::*; #[allow(non_camel_case_types)] @@ -13,4 +14,5 @@ mod sse; mod sse2; mod ssse3; mod sse42; +mod avx; mod avx2; \ No newline at end of file