Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
341 changes: 64 additions & 277 deletions roaring/src/bitmap/store/array_store/vector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
use super::scalar;
use core::simd::cmp::{SimdPartialEq, SimdPartialOrd};
use core::simd::{
mask16x8, simd_swizzle, u16x8, LaneCount, Mask, Simd, SimdElement, SupportedLaneCount,
mask16x8, u16x8, u8x16, LaneCount, Mask, Simd, SimdElement, SupportedLaneCount, ToBytes,
};

// a one-pass SSE union algorithm
Expand Down Expand Up @@ -484,283 +484,70 @@ fn simd_merge_u16(a: Simd<u16, 8>, b: Simd<u16, 8>) -> [Simd<u16, 8>; 2] {
/// Move the values in `val` with the corresponding index in `bitmask`
/// set to the front of the return vector, preserving their order.
///
/// This had to be implemented as a jump table to be portable,
/// as LLVM swizzle intrinsic only supports swizzle by a const
/// value. https://github.com/rust-lang/portable-simd/issues/11
///
/// The values in the return vector from index `bitmask.count_ones()` onward are unspecified.
///
/// The masks can be constructed with the following snippet
/// ```ignore
/// for n in 0usize..256 {
/// let mut x = n;
/// let mut arr = [0; 8];
/// let mut i = 0;
/// while x > 0 {
/// let lsb = x.trailing_zeros();
/// arr[i] = lsb;
/// x ^= 1 << lsb;
/// i += 1;
/// }
/// }
/// ```
// Dynamic swizzle is only available for `u8`s.
//
// So we need to convert the `u16x8` to `u8x16`, and then swizzle it two lanes at a time.
//
// e.g. if `bitmask` is `0b0101`, then swizzle the first two bytes (the first u16 lane) to the
// first two positions, and the 5th and 6th bytes (the third u16 lane) to the next two positions.
//
// Note however:
// https://github.com/rust-lang/rust/blob/34097a38afc9efdedf776d3f1c84a190ff334886/library/portable-simd/crates/core_simd/src/swizzle_dyn.rs#L12-L15
// > Note that the current implementation is selected during build-time
// > of the standard library, so `cargo build -Zbuild-std` may be necessary
// > to unlock better performance, especially for larger vectors.
// > A planned compiler improvement will enable using `#[target_feature]` instead.
pub fn swizzle_to_front(val: u16x8, bitmask: u8) -> u16x8 {
match bitmask {
0x00 => simd_swizzle!(val, [0, 0, 0, 0, 0, 0, 0, 0]),
0x01 => simd_swizzle!(val, [0, 0, 0, 0, 0, 0, 0, 0]),
0x02 => simd_swizzle!(val, [1, 0, 0, 0, 0, 0, 0, 0]),
0x03 => simd_swizzle!(val, [0, 1, 0, 0, 0, 0, 0, 0]),
0x04 => simd_swizzle!(val, [2, 0, 0, 0, 0, 0, 0, 0]),
0x05 => simd_swizzle!(val, [0, 2, 0, 0, 0, 0, 0, 0]),
0x06 => simd_swizzle!(val, [1, 2, 0, 0, 0, 0, 0, 0]),
0x07 => simd_swizzle!(val, [0, 1, 2, 0, 0, 0, 0, 0]),
0x08 => simd_swizzle!(val, [3, 0, 0, 0, 0, 0, 0, 0]),
0x09 => simd_swizzle!(val, [0, 3, 0, 0, 0, 0, 0, 0]),
0x0A => simd_swizzle!(val, [1, 3, 0, 0, 0, 0, 0, 0]),
0x0B => simd_swizzle!(val, [0, 1, 3, 0, 0, 0, 0, 0]),
0x0C => simd_swizzle!(val, [2, 3, 0, 0, 0, 0, 0, 0]),
0x0D => simd_swizzle!(val, [0, 2, 3, 0, 0, 0, 0, 0]),
0x0E => simd_swizzle!(val, [1, 2, 3, 0, 0, 0, 0, 0]),
0x0F => simd_swizzle!(val, [0, 1, 2, 3, 0, 0, 0, 0]),
0x10 => simd_swizzle!(val, [4, 0, 0, 0, 0, 0, 0, 0]),
0x11 => simd_swizzle!(val, [0, 4, 0, 0, 0, 0, 0, 0]),
0x12 => simd_swizzle!(val, [1, 4, 0, 0, 0, 0, 0, 0]),
0x13 => simd_swizzle!(val, [0, 1, 4, 0, 0, 0, 0, 0]),
0x14 => simd_swizzle!(val, [2, 4, 0, 0, 0, 0, 0, 0]),
0x15 => simd_swizzle!(val, [0, 2, 4, 0, 0, 0, 0, 0]),
0x16 => simd_swizzle!(val, [1, 2, 4, 0, 0, 0, 0, 0]),
0x17 => simd_swizzle!(val, [0, 1, 2, 4, 0, 0, 0, 0]),
0x18 => simd_swizzle!(val, [3, 4, 0, 0, 0, 0, 0, 0]),
0x19 => simd_swizzle!(val, [0, 3, 4, 0, 0, 0, 0, 0]),
0x1A => simd_swizzle!(val, [1, 3, 4, 0, 0, 0, 0, 0]),
0x1B => simd_swizzle!(val, [0, 1, 3, 4, 0, 0, 0, 0]),
0x1C => simd_swizzle!(val, [2, 3, 4, 0, 0, 0, 0, 0]),
0x1D => simd_swizzle!(val, [0, 2, 3, 4, 0, 0, 0, 0]),
0x1E => simd_swizzle!(val, [1, 2, 3, 4, 0, 0, 0, 0]),
0x1F => simd_swizzle!(val, [0, 1, 2, 3, 4, 0, 0, 0]),
0x20 => simd_swizzle!(val, [5, 0, 0, 0, 0, 0, 0, 0]),
0x21 => simd_swizzle!(val, [0, 5, 0, 0, 0, 0, 0, 0]),
0x22 => simd_swizzle!(val, [1, 5, 0, 0, 0, 0, 0, 0]),
0x23 => simd_swizzle!(val, [0, 1, 5, 0, 0, 0, 0, 0]),
0x24 => simd_swizzle!(val, [2, 5, 0, 0, 0, 0, 0, 0]),
0x25 => simd_swizzle!(val, [0, 2, 5, 0, 0, 0, 0, 0]),
0x26 => simd_swizzle!(val, [1, 2, 5, 0, 0, 0, 0, 0]),
0x27 => simd_swizzle!(val, [0, 1, 2, 5, 0, 0, 0, 0]),
0x28 => simd_swizzle!(val, [3, 5, 0, 0, 0, 0, 0, 0]),
0x29 => simd_swizzle!(val, [0, 3, 5, 0, 0, 0, 0, 0]),
0x2A => simd_swizzle!(val, [1, 3, 5, 0, 0, 0, 0, 0]),
0x2B => simd_swizzle!(val, [0, 1, 3, 5, 0, 0, 0, 0]),
0x2C => simd_swizzle!(val, [2, 3, 5, 0, 0, 0, 0, 0]),
0x2D => simd_swizzle!(val, [0, 2, 3, 5, 0, 0, 0, 0]),
0x2E => simd_swizzle!(val, [1, 2, 3, 5, 0, 0, 0, 0]),
0x2F => simd_swizzle!(val, [0, 1, 2, 3, 5, 0, 0, 0]),
0x30 => simd_swizzle!(val, [4, 5, 0, 0, 0, 0, 0, 0]),
0x31 => simd_swizzle!(val, [0, 4, 5, 0, 0, 0, 0, 0]),
0x32 => simd_swizzle!(val, [1, 4, 5, 0, 0, 0, 0, 0]),
0x33 => simd_swizzle!(val, [0, 1, 4, 5, 0, 0, 0, 0]),
0x34 => simd_swizzle!(val, [2, 4, 5, 0, 0, 0, 0, 0]),
0x35 => simd_swizzle!(val, [0, 2, 4, 5, 0, 0, 0, 0]),
0x36 => simd_swizzle!(val, [1, 2, 4, 5, 0, 0, 0, 0]),
0x37 => simd_swizzle!(val, [0, 1, 2, 4, 5, 0, 0, 0]),
0x38 => simd_swizzle!(val, [3, 4, 5, 0, 0, 0, 0, 0]),
0x39 => simd_swizzle!(val, [0, 3, 4, 5, 0, 0, 0, 0]),
0x3A => simd_swizzle!(val, [1, 3, 4, 5, 0, 0, 0, 0]),
0x3B => simd_swizzle!(val, [0, 1, 3, 4, 5, 0, 0, 0]),
0x3C => simd_swizzle!(val, [2, 3, 4, 5, 0, 0, 0, 0]),
0x3D => simd_swizzle!(val, [0, 2, 3, 4, 5, 0, 0, 0]),
0x3E => simd_swizzle!(val, [1, 2, 3, 4, 5, 0, 0, 0]),
0x3F => simd_swizzle!(val, [0, 1, 2, 3, 4, 5, 0, 0]),
0x40 => simd_swizzle!(val, [6, 0, 0, 0, 0, 0, 0, 0]),
0x41 => simd_swizzle!(val, [0, 6, 0, 0, 0, 0, 0, 0]),
0x42 => simd_swizzle!(val, [1, 6, 0, 0, 0, 0, 0, 0]),
0x43 => simd_swizzle!(val, [0, 1, 6, 0, 0, 0, 0, 0]),
0x44 => simd_swizzle!(val, [2, 6, 0, 0, 0, 0, 0, 0]),
0x45 => simd_swizzle!(val, [0, 2, 6, 0, 0, 0, 0, 0]),
0x46 => simd_swizzle!(val, [1, 2, 6, 0, 0, 0, 0, 0]),
0x47 => simd_swizzle!(val, [0, 1, 2, 6, 0, 0, 0, 0]),
0x48 => simd_swizzle!(val, [3, 6, 0, 0, 0, 0, 0, 0]),
0x49 => simd_swizzle!(val, [0, 3, 6, 0, 0, 0, 0, 0]),
0x4A => simd_swizzle!(val, [1, 3, 6, 0, 0, 0, 0, 0]),
0x4B => simd_swizzle!(val, [0, 1, 3, 6, 0, 0, 0, 0]),
0x4C => simd_swizzle!(val, [2, 3, 6, 0, 0, 0, 0, 0]),
0x4D => simd_swizzle!(val, [0, 2, 3, 6, 0, 0, 0, 0]),
0x4E => simd_swizzle!(val, [1, 2, 3, 6, 0, 0, 0, 0]),
0x4F => simd_swizzle!(val, [0, 1, 2, 3, 6, 0, 0, 0]),
0x50 => simd_swizzle!(val, [4, 6, 0, 0, 0, 0, 0, 0]),
0x51 => simd_swizzle!(val, [0, 4, 6, 0, 0, 0, 0, 0]),
0x52 => simd_swizzle!(val, [1, 4, 6, 0, 0, 0, 0, 0]),
0x53 => simd_swizzle!(val, [0, 1, 4, 6, 0, 0, 0, 0]),
0x54 => simd_swizzle!(val, [2, 4, 6, 0, 0, 0, 0, 0]),
0x55 => simd_swizzle!(val, [0, 2, 4, 6, 0, 0, 0, 0]),
0x56 => simd_swizzle!(val, [1, 2, 4, 6, 0, 0, 0, 0]),
0x57 => simd_swizzle!(val, [0, 1, 2, 4, 6, 0, 0, 0]),
0x58 => simd_swizzle!(val, [3, 4, 6, 0, 0, 0, 0, 0]),
0x59 => simd_swizzle!(val, [0, 3, 4, 6, 0, 0, 0, 0]),
0x5A => simd_swizzle!(val, [1, 3, 4, 6, 0, 0, 0, 0]),
0x5B => simd_swizzle!(val, [0, 1, 3, 4, 6, 0, 0, 0]),
0x5C => simd_swizzle!(val, [2, 3, 4, 6, 0, 0, 0, 0]),
0x5D => simd_swizzle!(val, [0, 2, 3, 4, 6, 0, 0, 0]),
0x5E => simd_swizzle!(val, [1, 2, 3, 4, 6, 0, 0, 0]),
0x5F => simd_swizzle!(val, [0, 1, 2, 3, 4, 6, 0, 0]),
0x60 => simd_swizzle!(val, [5, 6, 0, 0, 0, 0, 0, 0]),
0x61 => simd_swizzle!(val, [0, 5, 6, 0, 0, 0, 0, 0]),
0x62 => simd_swizzle!(val, [1, 5, 6, 0, 0, 0, 0, 0]),
0x63 => simd_swizzle!(val, [0, 1, 5, 6, 0, 0, 0, 0]),
0x64 => simd_swizzle!(val, [2, 5, 6, 0, 0, 0, 0, 0]),
0x65 => simd_swizzle!(val, [0, 2, 5, 6, 0, 0, 0, 0]),
0x66 => simd_swizzle!(val, [1, 2, 5, 6, 0, 0, 0, 0]),
0x67 => simd_swizzle!(val, [0, 1, 2, 5, 6, 0, 0, 0]),
0x68 => simd_swizzle!(val, [3, 5, 6, 0, 0, 0, 0, 0]),
0x69 => simd_swizzle!(val, [0, 3, 5, 6, 0, 0, 0, 0]),
0x6A => simd_swizzle!(val, [1, 3, 5, 6, 0, 0, 0, 0]),
0x6B => simd_swizzle!(val, [0, 1, 3, 5, 6, 0, 0, 0]),
0x6C => simd_swizzle!(val, [2, 3, 5, 6, 0, 0, 0, 0]),
0x6D => simd_swizzle!(val, [0, 2, 3, 5, 6, 0, 0, 0]),
0x6E => simd_swizzle!(val, [1, 2, 3, 5, 6, 0, 0, 0]),
0x6F => simd_swizzle!(val, [0, 1, 2, 3, 5, 6, 0, 0]),
0x70 => simd_swizzle!(val, [4, 5, 6, 0, 0, 0, 0, 0]),
0x71 => simd_swizzle!(val, [0, 4, 5, 6, 0, 0, 0, 0]),
0x72 => simd_swizzle!(val, [1, 4, 5, 6, 0, 0, 0, 0]),
0x73 => simd_swizzle!(val, [0, 1, 4, 5, 6, 0, 0, 0]),
0x74 => simd_swizzle!(val, [2, 4, 5, 6, 0, 0, 0, 0]),
0x75 => simd_swizzle!(val, [0, 2, 4, 5, 6, 0, 0, 0]),
0x76 => simd_swizzle!(val, [1, 2, 4, 5, 6, 0, 0, 0]),
0x77 => simd_swizzle!(val, [0, 1, 2, 4, 5, 6, 0, 0]),
0x78 => simd_swizzle!(val, [3, 4, 5, 6, 0, 0, 0, 0]),
0x79 => simd_swizzle!(val, [0, 3, 4, 5, 6, 0, 0, 0]),
0x7A => simd_swizzle!(val, [1, 3, 4, 5, 6, 0, 0, 0]),
0x7B => simd_swizzle!(val, [0, 1, 3, 4, 5, 6, 0, 0]),
0x7C => simd_swizzle!(val, [2, 3, 4, 5, 6, 0, 0, 0]),
0x7D => simd_swizzle!(val, [0, 2, 3, 4, 5, 6, 0, 0]),
0x7E => simd_swizzle!(val, [1, 2, 3, 4, 5, 6, 0, 0]),
0x7F => simd_swizzle!(val, [0, 1, 2, 3, 4, 5, 6, 0]),
0x80 => simd_swizzle!(val, [7, 0, 0, 0, 0, 0, 0, 0]),
0x81 => simd_swizzle!(val, [0, 7, 0, 0, 0, 0, 0, 0]),
0x82 => simd_swizzle!(val, [1, 7, 0, 0, 0, 0, 0, 0]),
0x83 => simd_swizzle!(val, [0, 1, 7, 0, 0, 0, 0, 0]),
0x84 => simd_swizzle!(val, [2, 7, 0, 0, 0, 0, 0, 0]),
0x85 => simd_swizzle!(val, [0, 2, 7, 0, 0, 0, 0, 0]),
0x86 => simd_swizzle!(val, [1, 2, 7, 0, 0, 0, 0, 0]),
0x87 => simd_swizzle!(val, [0, 1, 2, 7, 0, 0, 0, 0]),
0x88 => simd_swizzle!(val, [3, 7, 0, 0, 0, 0, 0, 0]),
0x89 => simd_swizzle!(val, [0, 3, 7, 0, 0, 0, 0, 0]),
0x8A => simd_swizzle!(val, [1, 3, 7, 0, 0, 0, 0, 0]),
0x8B => simd_swizzle!(val, [0, 1, 3, 7, 0, 0, 0, 0]),
0x8C => simd_swizzle!(val, [2, 3, 7, 0, 0, 0, 0, 0]),
0x8D => simd_swizzle!(val, [0, 2, 3, 7, 0, 0, 0, 0]),
0x8E => simd_swizzle!(val, [1, 2, 3, 7, 0, 0, 0, 0]),
0x8F => simd_swizzle!(val, [0, 1, 2, 3, 7, 0, 0, 0]),
0x90 => simd_swizzle!(val, [4, 7, 0, 0, 0, 0, 0, 0]),
0x91 => simd_swizzle!(val, [0, 4, 7, 0, 0, 0, 0, 0]),
0x92 => simd_swizzle!(val, [1, 4, 7, 0, 0, 0, 0, 0]),
0x93 => simd_swizzle!(val, [0, 1, 4, 7, 0, 0, 0, 0]),
0x94 => simd_swizzle!(val, [2, 4, 7, 0, 0, 0, 0, 0]),
0x95 => simd_swizzle!(val, [0, 2, 4, 7, 0, 0, 0, 0]),
0x96 => simd_swizzle!(val, [1, 2, 4, 7, 0, 0, 0, 0]),
0x97 => simd_swizzle!(val, [0, 1, 2, 4, 7, 0, 0, 0]),
0x98 => simd_swizzle!(val, [3, 4, 7, 0, 0, 0, 0, 0]),
0x99 => simd_swizzle!(val, [0, 3, 4, 7, 0, 0, 0, 0]),
0x9A => simd_swizzle!(val, [1, 3, 4, 7, 0, 0, 0, 0]),
0x9B => simd_swizzle!(val, [0, 1, 3, 4, 7, 0, 0, 0]),
0x9C => simd_swizzle!(val, [2, 3, 4, 7, 0, 0, 0, 0]),
0x9D => simd_swizzle!(val, [0, 2, 3, 4, 7, 0, 0, 0]),
0x9E => simd_swizzle!(val, [1, 2, 3, 4, 7, 0, 0, 0]),
0x9F => simd_swizzle!(val, [0, 1, 2, 3, 4, 7, 0, 0]),
0xA0 => simd_swizzle!(val, [5, 7, 0, 0, 0, 0, 0, 0]),
0xA1 => simd_swizzle!(val, [0, 5, 7, 0, 0, 0, 0, 0]),
0xA2 => simd_swizzle!(val, [1, 5, 7, 0, 0, 0, 0, 0]),
0xA3 => simd_swizzle!(val, [0, 1, 5, 7, 0, 0, 0, 0]),
0xA4 => simd_swizzle!(val, [2, 5, 7, 0, 0, 0, 0, 0]),
0xA5 => simd_swizzle!(val, [0, 2, 5, 7, 0, 0, 0, 0]),
0xA6 => simd_swizzle!(val, [1, 2, 5, 7, 0, 0, 0, 0]),
0xA7 => simd_swizzle!(val, [0, 1, 2, 5, 7, 0, 0, 0]),
0xA8 => simd_swizzle!(val, [3, 5, 7, 0, 0, 0, 0, 0]),
0xA9 => simd_swizzle!(val, [0, 3, 5, 7, 0, 0, 0, 0]),
0xAA => simd_swizzle!(val, [1, 3, 5, 7, 0, 0, 0, 0]),
0xAB => simd_swizzle!(val, [0, 1, 3, 5, 7, 0, 0, 0]),
0xAC => simd_swizzle!(val, [2, 3, 5, 7, 0, 0, 0, 0]),
0xAD => simd_swizzle!(val, [0, 2, 3, 5, 7, 0, 0, 0]),
0xAE => simd_swizzle!(val, [1, 2, 3, 5, 7, 0, 0, 0]),
0xAF => simd_swizzle!(val, [0, 1, 2, 3, 5, 7, 0, 0]),
0xB0 => simd_swizzle!(val, [4, 5, 7, 0, 0, 0, 0, 0]),
0xB1 => simd_swizzle!(val, [0, 4, 5, 7, 0, 0, 0, 0]),
0xB2 => simd_swizzle!(val, [1, 4, 5, 7, 0, 0, 0, 0]),
0xB3 => simd_swizzle!(val, [0, 1, 4, 5, 7, 0, 0, 0]),
0xB4 => simd_swizzle!(val, [2, 4, 5, 7, 0, 0, 0, 0]),
0xB5 => simd_swizzle!(val, [0, 2, 4, 5, 7, 0, 0, 0]),
0xB6 => simd_swizzle!(val, [1, 2, 4, 5, 7, 0, 0, 0]),
0xB7 => simd_swizzle!(val, [0, 1, 2, 4, 5, 7, 0, 0]),
0xB8 => simd_swizzle!(val, [3, 4, 5, 7, 0, 0, 0, 0]),
0xB9 => simd_swizzle!(val, [0, 3, 4, 5, 7, 0, 0, 0]),
0xBA => simd_swizzle!(val, [1, 3, 4, 5, 7, 0, 0, 0]),
0xBB => simd_swizzle!(val, [0, 1, 3, 4, 5, 7, 0, 0]),
0xBC => simd_swizzle!(val, [2, 3, 4, 5, 7, 0, 0, 0]),
0xBD => simd_swizzle!(val, [0, 2, 3, 4, 5, 7, 0, 0]),
0xBE => simd_swizzle!(val, [1, 2, 3, 4, 5, 7, 0, 0]),
0xBF => simd_swizzle!(val, [0, 1, 2, 3, 4, 5, 7, 0]),
0xC0 => simd_swizzle!(val, [6, 7, 0, 0, 0, 0, 0, 0]),
0xC1 => simd_swizzle!(val, [0, 6, 7, 0, 0, 0, 0, 0]),
0xC2 => simd_swizzle!(val, [1, 6, 7, 0, 0, 0, 0, 0]),
0xC3 => simd_swizzle!(val, [0, 1, 6, 7, 0, 0, 0, 0]),
0xC4 => simd_swizzle!(val, [2, 6, 7, 0, 0, 0, 0, 0]),
0xC5 => simd_swizzle!(val, [0, 2, 6, 7, 0, 0, 0, 0]),
0xC6 => simd_swizzle!(val, [1, 2, 6, 7, 0, 0, 0, 0]),
0xC7 => simd_swizzle!(val, [0, 1, 2, 6, 7, 0, 0, 0]),
0xC8 => simd_swizzle!(val, [3, 6, 7, 0, 0, 0, 0, 0]),
0xC9 => simd_swizzle!(val, [0, 3, 6, 7, 0, 0, 0, 0]),
0xCA => simd_swizzle!(val, [1, 3, 6, 7, 0, 0, 0, 0]),
0xCB => simd_swizzle!(val, [0, 1, 3, 6, 7, 0, 0, 0]),
0xCC => simd_swizzle!(val, [2, 3, 6, 7, 0, 0, 0, 0]),
0xCD => simd_swizzle!(val, [0, 2, 3, 6, 7, 0, 0, 0]),
0xCE => simd_swizzle!(val, [1, 2, 3, 6, 7, 0, 0, 0]),
0xCF => simd_swizzle!(val, [0, 1, 2, 3, 6, 7, 0, 0]),
0xD0 => simd_swizzle!(val, [4, 6, 7, 0, 0, 0, 0, 0]),
0xD1 => simd_swizzle!(val, [0, 4, 6, 7, 0, 0, 0, 0]),
0xD2 => simd_swizzle!(val, [1, 4, 6, 7, 0, 0, 0, 0]),
0xD3 => simd_swizzle!(val, [0, 1, 4, 6, 7, 0, 0, 0]),
0xD4 => simd_swizzle!(val, [2, 4, 6, 7, 0, 0, 0, 0]),
0xD5 => simd_swizzle!(val, [0, 2, 4, 6, 7, 0, 0, 0]),
0xD6 => simd_swizzle!(val, [1, 2, 4, 6, 7, 0, 0, 0]),
0xD7 => simd_swizzle!(val, [0, 1, 2, 4, 6, 7, 0, 0]),
0xD8 => simd_swizzle!(val, [3, 4, 6, 7, 0, 0, 0, 0]),
0xD9 => simd_swizzle!(val, [0, 3, 4, 6, 7, 0, 0, 0]),
0xDA => simd_swizzle!(val, [1, 3, 4, 6, 7, 0, 0, 0]),
0xDB => simd_swizzle!(val, [0, 1, 3, 4, 6, 7, 0, 0]),
0xDC => simd_swizzle!(val, [2, 3, 4, 6, 7, 0, 0, 0]),
0xDD => simd_swizzle!(val, [0, 2, 3, 4, 6, 7, 0, 0]),
0xDE => simd_swizzle!(val, [1, 2, 3, 4, 6, 7, 0, 0]),
0xDF => simd_swizzle!(val, [0, 1, 2, 3, 4, 6, 7, 0]),
0xE0 => simd_swizzle!(val, [5, 6, 7, 0, 0, 0, 0, 0]),
0xE1 => simd_swizzle!(val, [0, 5, 6, 7, 0, 0, 0, 0]),
0xE2 => simd_swizzle!(val, [1, 5, 6, 7, 0, 0, 0, 0]),
0xE3 => simd_swizzle!(val, [0, 1, 5, 6, 7, 0, 0, 0]),
0xE4 => simd_swizzle!(val, [2, 5, 6, 7, 0, 0, 0, 0]),
0xE5 => simd_swizzle!(val, [0, 2, 5, 6, 7, 0, 0, 0]),
0xE6 => simd_swizzle!(val, [1, 2, 5, 6, 7, 0, 0, 0]),
0xE7 => simd_swizzle!(val, [0, 1, 2, 5, 6, 7, 0, 0]),
0xE8 => simd_swizzle!(val, [3, 5, 6, 7, 0, 0, 0, 0]),
0xE9 => simd_swizzle!(val, [0, 3, 5, 6, 7, 0, 0, 0]),
0xEA => simd_swizzle!(val, [1, 3, 5, 6, 7, 0, 0, 0]),
0xEB => simd_swizzle!(val, [0, 1, 3, 5, 6, 7, 0, 0]),
0xEC => simd_swizzle!(val, [2, 3, 5, 6, 7, 0, 0, 0]),
0xED => simd_swizzle!(val, [0, 2, 3, 5, 6, 7, 0, 0]),
0xEE => simd_swizzle!(val, [1, 2, 3, 5, 6, 7, 0, 0]),
0xEF => simd_swizzle!(val, [0, 1, 2, 3, 5, 6, 7, 0]),
0xF0 => simd_swizzle!(val, [4, 5, 6, 7, 0, 0, 0, 0]),
0xF1 => simd_swizzle!(val, [0, 4, 5, 6, 7, 0, 0, 0]),
0xF2 => simd_swizzle!(val, [1, 4, 5, 6, 7, 0, 0, 0]),
0xF3 => simd_swizzle!(val, [0, 1, 4, 5, 6, 7, 0, 0]),
0xF4 => simd_swizzle!(val, [2, 4, 5, 6, 7, 0, 0, 0]),
0xF5 => simd_swizzle!(val, [0, 2, 4, 5, 6, 7, 0, 0]),
0xF6 => simd_swizzle!(val, [1, 2, 4, 5, 6, 7, 0, 0]),
0xF7 => simd_swizzle!(val, [0, 1, 2, 4, 5, 6, 7, 0]),
0xF8 => simd_swizzle!(val, [3, 4, 5, 6, 7, 0, 0, 0]),
0xF9 => simd_swizzle!(val, [0, 3, 4, 5, 6, 7, 0, 0]),
0xFA => simd_swizzle!(val, [1, 3, 4, 5, 6, 7, 0, 0]),
0xFB => simd_swizzle!(val, [0, 1, 3, 4, 5, 6, 7, 0]),
0xFC => simd_swizzle!(val, [2, 3, 4, 5, 6, 7, 0, 0]),
0xFD => simd_swizzle!(val, [0, 2, 3, 4, 5, 6, 7, 0]),
0xFE => simd_swizzle!(val, [1, 2, 3, 4, 5, 6, 7, 0]),
0xFF => simd_swizzle!(val, [0, 1, 2, 3, 4, 5, 6, 7]),
static SWIZZLE_TABLE: [[u8; 16]; 256] = {
let mut table = [[0; 16]; 256];
let mut n = 0usize;
while n < table.len() {
let mut x = n;
let mut i = 0;
while x > 0 {
let lsb = x.trailing_zeros() as u8;
x ^= 1 << lsb;
table[n][i] = lsb * 2; // first byte
table[n][i + 1] = lsb * 2 + 1; // second byte
i += 2;
}
n += 1;
}
table
};

// Our swizzle table retains the order of the bytes within each 16-bit lane, so we can
// stick with native byte order as long as we convert back with native endianness too.
let val_convert: u8x16 = val.to_ne_bytes();
let swizzle_idxs = u8x16::from_array(SWIZZLE_TABLE[bitmask as usize]);

// Because the default `x86_64` target does not enable ssse3 (and without `-Zbuild-std`
// std will not be compiled with it), use a manual swizzle with intrinsics so we can get
// reasonable performance without requiring the caller to use `-Zbuild-std`.
#[cfg(all(target_arch = "x86_64", any(target_feature = "ssse3", feature = "std")))]
{
let has_ssse3 = {
#[cfg(target_feature = "ssse3")]
{
true
}
#[cfg(not(target_feature = "ssse3"))]
{
// From the outer cfg, `feature = "std"` must be enabled here, so we can do runtime detection
std::arch::is_x86_feature_detected!("ssse3")
}
};
if has_ssse3 {
use core::arch::x86_64::{__m128i, _mm_shuffle_epi8};
let val_m128 = __m128i::from(val_convert);
let swizzle_m128 = __m128i::from(swizzle_idxs);
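// SAFETY: this point is only reached when `has_ssse3` is true, i.e. `ssse3` was either
// enabled at compile time or detected at runtime, so `_mm_shuffle_epi8` is available.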
let swizzled_m128 = unsafe { _mm_shuffle_epi8(val_m128, swizzle_m128) };
return u16x8::from(swizzled_m128);
}
}

let swizzled: u8x16 = val_convert.swizzle_dyn(swizzle_idxs);
u16x8::from_ne_bytes(swizzled)
}
Loading