@@ -498,9 +498,6 @@ fn simd_merge_u16(a: Simd<u16, 8>, b: Simd<u16, 8>) -> [Simd<u16, 8>; 2] {
498498// > of the standard library, so `cargo build -Zbuild-std` may be necessary
499499// > to unlock better performance, especially for larger vectors.
500500// > A planned compiler improvement will enable using `#[target_feature]` instead.
501- //
502- // Specifically, e.g. the default `x86_64` target does not enable ssse3, so this may be
503- // suboptimal without `-Zbuild-std` on `x86_64` targets.
504501pub fn swizzle_to_front ( val : u16x8 , bitmask : u8 ) -> u16x8 {
505502 static SWIZZLE_TABLE : [ [ u8 ; 16 ] ; 256 ] = {
506503 let mut table = [ [ 0 ; 16 ] ; 256 ] ;
@@ -525,6 +522,29 @@ pub fn swizzle_to_front(val: u16x8, bitmask: u8) -> u16x8 {
525522 let val_convert: u8x16 = val. to_ne_bytes ( ) ;
526523 let swizzle_idxs = u8x16:: from_array ( SWIZZLE_TABLE [ bitmask as usize ] ) ;
527524
525+ // Because the default `x86_64` target does not enable ssse3 (and without `-Zbuild-std`
526+ // std will not be compiled with it), use a manual swizzle with intrinsics so we can get
527+ // reasonable performance without requiring the caller to use `-Zbuild-std`.
528+ #[ cfg( all( target_arch = "x86_64" , any( target_feature = "ssse3" , feature = "std" ) ) ) ]
529+ {
530+ let has_ssse3 = {
531+ #[ cfg( target_feature = "ssse3" ) ]
532+ { true }
533+ #[ cfg( not( target_feature = "ssse3" ) ) ]
534+ {
535+ // From the outer cfg, `feature = "std"` must be enabled here, so we can do runtime detection
536+ std:: arch:: is_x86_feature_detected!( "ssse3" )
537+ }
538+ } ;
539+ if has_ssse3 {
540+ use core:: arch:: x86_64:: { __m128i, _mm_shuffle_epi8} ;
541+ let val_m128 = __m128i:: from ( val_convert) ;
542+ let swizzle_m128 = __m128i:: from ( swizzle_idxs) ;
543+ let swizzled_m128 = unsafe { _mm_shuffle_epi8 ( val_m128, swizzle_m128) } ;
544+ return u16x8:: from ( swizzled_m128) ;
545+ }
546+ }
547+
528548 let swizzled: u8x16 = val_convert. swizzle_dyn ( swizzle_idxs) ;
529549 u16x8:: from_ne_bytes ( swizzled)
530550}
0 commit comments