Skip to content

Commit 31de4ec

Browse files
committed
add an implementation using intrinsics for swizzle_to_front for ssse3
1 parent 641cdf1 commit 31de4ec

File tree

1 file changed

+23
-3
lines changed
  • roaring/src/bitmap/store/array_store

1 file changed

+23
-3
lines changed

roaring/src/bitmap/store/array_store/vector.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -498,9 +498,6 @@ fn simd_merge_u16(a: Simd<u16, 8>, b: Simd<u16, 8>) -> [Simd<u16, 8>; 2] {
498498
// > of the standard library, so `cargo build -Zbuild-std` may be necessary
499499
// > to unlock better performance, especially for larger vectors.
500500
// > A planned compiler improvement will enable using `#[target_feature]` instead.
501-
//
502-
// Specifically, e.g. the default `x86_64` target does not enable ssse3, so this may be
503-
// suboptimal without `-Zbuild-std` on `x86_64` targets.
504501
pub fn swizzle_to_front(val: u16x8, bitmask: u8) -> u16x8 {
505502
static SWIZZLE_TABLE: [[u8; 16]; 256] = {
506503
let mut table = [[0; 16]; 256];
@@ -525,6 +522,29 @@ pub fn swizzle_to_front(val: u16x8, bitmask: u8) -> u16x8 {
525522
let val_convert: u8x16 = val.to_ne_bytes();
526523
let swizzle_idxs = u8x16::from_array(SWIZZLE_TABLE[bitmask as usize]);
527524

525+
// Because the default `x86_64` target does not enable ssse3 (and without `-Zbuild-std`
526+
// std will not be compiled with it), use a manual swizzle with intrinsics so we can get
527+
// reasonable performance without requiring the caller to use `-Zbuild-std`.
528+
#[cfg(all(target_arch = "x86_64", any(target_feature = "ssse3", feature = "std")))]
529+
{
530+
let has_ssse3 = {
531+
#[cfg(target_feature = "ssse3")]
532+
{ true }
533+
#[cfg(not(target_feature = "ssse3"))]
534+
{
535+
// Per the enclosing `cfg`, `feature = "std"` must be enabled here, so we can do runtime detection
536+
std::arch::is_x86_feature_detected!("ssse3")
537+
}
538+
};
539+
if has_ssse3 {
540+
use core::arch::x86_64::{__m128i, _mm_shuffle_epi8};
541+
let val_m128 = __m128i::from(val_convert);
542+
let swizzle_m128 = __m128i::from(swizzle_idxs);
543+
let swizzled_m128 = unsafe { _mm_shuffle_epi8(val_m128, swizzle_m128) };
544+
return u16x8::from(swizzled_m128);
545+
}
546+
}
547+
528548
let swizzled: u8x16 = val_convert.swizzle_dyn(swizzle_idxs);
529549
u16x8::from_ne_bytes(swizzled)
530550
}

0 commit comments

Comments
 (0)