Integers bitcasted to vectors, coerced to vectors of larger integers of a smaller size than the original can also be optimized #79100

Open
@Validark


Similar to #79094.

Here is the code I want to write (Godbolt).

export fn _1(x: u64) @Vector(16, u8) {
    return @as(@Vector(16, u4), @bitCast(x));
}

I want to turn a 64-bit integer into a vector of 16 u4's, then coerce that to a vector of 16 u8's. In the end, each element of the resulting vector of 16 u8's holds a successive 4-bit group (nibble) of the input.
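
For concreteness, here is a small test sketch of the semantics I am after (a hypothetical test, assuming a little-endian target, so that element 0 is the least-significant nibble; this is also the ordering the hand-rolled routines further down produce):

const std = @import("std");

test "u64 expands to 16 nibbles, least-significant first" {
    const x: u64 = 0xFEDC_BA98_7654_3210;
    const v: @Vector(16, u8) = @as(@Vector(16, u4), @bitCast(x));
    const actual: [16]u8 = v; // vectors coerce to same-length arrays
    // Nibble i of x, starting from the least-significant end.
    const expected = [16]u8{ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF };
    for (actual, expected) |a, e| try std.testing.expectEqual(e, a);
}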

Unfortunately, the assembly LLVM currently generates for _1 is very long on both targets.

x86-64 (znver4)

.LCPI0_0:
      .byte   2
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .byte   3
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .byte   15
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .byte   15
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
.LCPI0_2:
      .byte   4
      .zero   1
      .zero   1
      .zero   1
      .byte   5
      .zero   1
      .zero   1
      .zero   1
      .byte   6
      .zero   1
      .zero   1
      .zero   1
      .byte   7
      .zero   1
      .zero   1
      .zero   1
      .short  15
      .short  15
      .short  15
      .short  15
      .short  15
      .short  15
      .short  15
      .short  15
.LCPI0_3:
      .long   15
.LCPI0_5:
      .zero   4,15
.LCPI0_6:
      .short  15
      .short  15
_1:
      mov     eax, 1032
      mov     edx, edi
      and     edx, 15
      bextr   ecx, edi, eax
      mov     eax, 1028
      vmovd   xmm0, edx
      bextr   eax, edi, eax
      vpinsrb xmm0, xmm0, eax, 1
      vpinsrb xmm0, xmm0, ecx, 2
      mov     ecx, 1036
      bextr   ecx, edi, ecx
      vpinsrb xmm0, xmm0, ecx, 3
      mov     ecx, 1040
      bextr   ecx, edi, ecx
      vpinsrb xmm1, xmm0, ecx, 4
      mov     ecx, 1044
      vpshufb xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
      bextr   ecx, edi, ecx
      vpandd  xmm0, xmm0, dword ptr [rip + .LCPI0_5]{1to4}
      vpinsrb xmm1, xmm1, ecx, 5
      mov     ecx, 1048
      bextr   ecx, edi, ecx
      vpinsrb xmm1, xmm1, ecx, 6
      mov     ecx, edi
      shr     ecx, 28
      vpinsrb xmm1, xmm1, ecx, 7
      mov     rcx, rdi
      shr     rcx, 32
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 8
      mov     rcx, rdi
      shr     rcx, 36
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 9
      mov     rcx, rdi
      shr     rcx, 40
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 10
      mov     rcx, rdi
      shr     rcx, 44
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 11
      mov     rcx, rdi
      shr     rcx, 48
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 12
      mov     rcx, rdi
      shr     rcx, 52
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 13
      mov     rcx, rdi
      shr     rcx, 56
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 14
      mov     rcx, rdi
      shr     rcx, 60
      and     dil, 15
      vpinsrb xmm2, xmm1, ecx, 15
      vpshufb xmm1, xmm1, xmmword ptr [rip + .LCPI0_2]
      vmovd   xmm3, edi
      vpandd  xmm1, xmm1, dword ptr [rip + .LCPI0_3]{1to4}
      vpunpckhbw      xmm2, xmm2, xmm2
      vpandd  xmm2, xmm2, dword ptr [rip + .LCPI0_6]{1to4}
      vpinsrb xmm3, xmm3, eax, 1
      vpmovdb xmm1, zmm1
      vpackuswb       xmm2, xmm2, xmm2
      vpmovqb xmm0, xmm0
      vpunpcklwd      xmm0, xmm3, xmm0
      vpunpckldq      xmm0, xmm0, xmm1
      vpunpcklqdq     xmm0, xmm0, xmm2
      vzeroupper
      ret

aarch64 (apple_latest)

_1:
        sub     sp, sp, #16
        ubfx    w8, w0, #4, #4
        and     w9, w0, #0xf
        fmov    s0, w9
        mov     v0.b[1], w8
        ubfx    w8, w0, #8, #4
        mov     v0.b[2], w8
        ubfx    w8, w0, #12, #4
        mov     v0.b[3], w8
        ubfx    w8, w0, #16, #4
        mov     v0.b[4], w8
        ubfx    w8, w0, #20, #4
        mov     v0.b[5], w8
        ubfx    w8, w0, #24, #4
        mov     v0.b[6], w8
        ubfx    x8, x0, #28, #4
        mov     v0.b[7], w8
        ubfx    x8, x0, #32, #4
        mov     v0.b[8], w8
        ubfx    x8, x0, #36, #4
        mov     v0.b[9], w8
        ubfx    x8, x0, #40, #4
        mov     v1.16b, v0.16b
        mov     v1.b[10], w8
        ubfx    x9, x0, #44, #4
        mov     v1.b[11], w9
        ubfx    x10, x0, #48, #4
        mov     v1.b[12], w10
        ubfx    x10, x0, #52, #4
        mov     v1.b[13], w10
        ubfx    x10, x0, #56, #4
        mov     v1.b[14], w10
        lsr     x10, x0, #60
        mov     v1.b[15], w10
        fmov    s2, w8
        mov     v2.s[1], w9
        movi    v3.2s, #15
        and     v2.8b, v2.8b, v3.8b
        ext     v1.16b, v1.16b, v1.16b, #8
        zip2    v1.8b, v1.8b, v0.8b
        bic     v1.4h, #255, lsl #8
        mov     v2.b[1], v2.b[4]
        mov     v0.h[5], v2.h[0]
        xtn     v1.8b, v1.8h
        mov     v0.s[3], v1.s[0]
        add     sp, sp, #16
        ret

You can convince LLVM to produce efficient code for this operation by writing it as one of the following routines.

const std = @import("std");

export fn _2(x: u64) @Vector(16, u8) {
    const vec: @Vector(8, u8) = @bitCast(x);
    return @shuffle(
        u8,
        vec,
        vec >> @splat(4),
        std.simd.interlace(.{ std.simd.iota(i32, 8), ~std.simd.iota(i32, 8) }),
    ) & @as(@Vector(16, u8), @splat(0xF));
}

export fn _3(x: u64) @Vector(16, u8) {
    const vec: @Vector(8, u8) = @bitCast(x);
    return @shuffle(
        u8,
        vec & @as(@Vector(8, u8), @splat(0xF)),
        vec >> @splat(4),
        std.simd.interlace(.{ std.simd.iota(i32, 8), ~std.simd.iota(i32, 8) }),
    );
}
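
For reference, the mask that std.simd.interlace builds here alternates between the two shuffle operands: index i selects an element of the first operand and ~i selects element i of the second. A variant of _2 with that mask written out by hand would look like the sketch below (the name _2_explicit_mask is mine; as far as I can tell it performs the same shuffle as _2 and is only spelled out to make the lane interleaving explicit):

export fn _2_explicit_mask(x: u64) @Vector(16, u8) {
    const vec: @Vector(8, u8) = @bitCast(x);
    // In @shuffle, a non-negative mask element i selects vec[i]; ~i selects
    // element i of the second operand (vec >> 4), so the output lanes
    // alternate low nibble, high nibble, low nibble, ...
    return @shuffle(
        u8,
        vec,
        vec >> @splat(4),
        @Vector(16, i32){
            0, ~@as(i32, 0), 1, ~@as(i32, 1), 2, ~@as(i32, 2), 3, ~@as(i32, 3),
            4, ~@as(i32, 4), 5, ~@as(i32, 5), 6, ~@as(i32, 6), 7, ~@as(i32, 7),
        },
    ) & @as(@Vector(16, u8), @splat(0xF));
}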

Here is the assembly on x86_64 (znver4):

.LCPI0_1:
        .zero   4,15
_2:
        vmovq   xmm0, rdi
        vpsrlw  xmm1, xmm0, 4
        vpunpcklbw      xmm0, xmm0, xmm1
        vpandd  xmm0, xmm0, dword ptr [rip + .LCPI0_1]{1to4}
        ret

.LCPI1_2:
        .zero   4,15
_3:
        vmovq   xmm0, rdi
        vpandd  xmm1, xmm0, dword ptr [rip + .LCPI1_2]{1to4}
        vpsrlw  xmm0, xmm0, 4
        vpandd  xmm0, xmm0, dword ptr [rip + .LCPI1_2]{1to4}
        vpunpcklbw      xmm0, xmm1, xmm0
        ret

Here is the assembly on aarch64 (apple_latest):

_2:
        fmov    d0, x0
        fmov    d1, x0
        ushr    v1.8b, v1.8b, #4
        zip1    v0.16b, v0.16b, v1.16b
        movi    v1.16b, #15
        and     v0.16b, v0.16b, v1.16b
        ret

_3:
        fmov    d0, x0
        movi    v1.8b, #15
        and     v1.8b, v0.8b, v1.8b
        ushr    v0.8b, v0.8b, #4
        zip1    v0.16b, v1.16b, v0.16b
        ret

Unfortunately, there is no clear winner here across architectures: _2 looks more efficient on x86_64 (znver4) and _3 looks more efficient on aarch64 (apple_latest), although I could be wrong. It is somewhat unfortunate that LLVM does not pick one of the hand-rolled forms and compile all of my implementations to exactly the same code, but I would be happy if the simple and easy approach (_1) just gave me optimal code.
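
As a stopgap, one could pick between the two formulations per target at comptime. This is just a sketch under my (possibly wrong) reading of the listings above; expandNibbles is a hypothetical name, and it assumes _2 and _3 from above are in scope:

const builtin = @import("builtin");

export fn expandNibbles(x: u64) @Vector(16, u8) {
    // Comptime-resolved switch: each target gets whichever hand-rolled
    // routine appeared to lower better in the listings above.
    return switch (builtin.cpu.arch) {
        .aarch64, .aarch64_be => _3(x),
        else => _2(x),
    };
}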
