Integers bitcasted to vectors, coerced to vectors of larger integers of a smaller size than the original can also be optimized #79100

Open
@Validark


Similar to #79094.

Here is the code I want to write (Godbolt).

export fn _1(x: u64) @Vector(16, u8) {
    return @as(@Vector(16, u4), @bitCast(x));
}

I want to turn a 64-bit integer into a vector of 16 u4's, then coerce that to a vector of 16 u8's. In the end, each element of the resulting vector of 16 u8's holds a successive 4-bit group (nibble) of the input.
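
For concreteness, here is a small test sketch of the semantics I am after (a hypothetical test, assuming a little-endian target, so that element 0 is the least-significant nibble; this is also the ordering the hand-rolled routines further down produce):

const std = @import("std");

test "u64 expands to 16 nibbles, least-significant first" {
    const x: u64 = 0xFEDC_BA98_7654_3210;
    const v: @Vector(16, u8) = @as(@Vector(16, u4), @bitCast(x));
    const actual: [16]u8 = v; // vectors coerce to same-length arrays
    // Nibble i of x, starting from the least-significant end.
    const expected = [16]u8{ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF };
    for (actual, expected) |a, e| try std.testing.expectEqual(e, a);
}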

Unfortunately, the assembly LLVM currently generates for _1 is very long on both targets.

x86-64 (znver4)

.LCPI0_0:
      .byte   2
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .byte   3
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .byte   15
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .byte   15
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
      .zero   1
.LCPI0_2:
      .byte   4
      .zero   1
      .zero   1
      .zero   1
      .byte   5
      .zero   1
      .zero   1
      .zero   1
      .byte   6
      .zero   1
      .zero   1
      .zero   1
      .byte   7
      .zero   1
      .zero   1
      .zero   1
      .short  15
      .short  15
      .short  15
      .short  15
      .short  15
      .short  15
      .short  15
      .short  15
.LCPI0_3:
      .long   15
.LCPI0_5:
      .zero   4,15
.LCPI0_6:
      .short  15
      .short  15
_1:
      mov     eax, 1032
      mov     edx, edi
      and     edx, 15
      bextr   ecx, edi, eax
      mov     eax, 1028
      vmovd   xmm0, edx
      bextr   eax, edi, eax
      vpinsrb xmm0, xmm0, eax, 1
      vpinsrb xmm0, xmm0, ecx, 2
      mov     ecx, 1036
      bextr   ecx, edi, ecx
      vpinsrb xmm0, xmm0, ecx, 3
      mov     ecx, 1040
      bextr   ecx, edi, ecx
      vpinsrb xmm1, xmm0, ecx, 4
      mov     ecx, 1044
      vpshufb xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
      bextr   ecx, edi, ecx
      vpandd  xmm0, xmm0, dword ptr [rip + .LCPI0_5]{1to4}
      vpinsrb xmm1, xmm1, ecx, 5
      mov     ecx, 1048
      bextr   ecx, edi, ecx
      vpinsrb xmm1, xmm1, ecx, 6
      mov     ecx, edi
      shr     ecx, 28
      vpinsrb xmm1, xmm1, ecx, 7
      mov     rcx, rdi
      shr     rcx, 32
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 8
      mov     rcx, rdi
      shr     rcx, 36
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 9
      mov     rcx, rdi
      shr     rcx, 40
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 10
      mov     rcx, rdi
      shr     rcx, 44
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 11
      mov     rcx, rdi
      shr     rcx, 48
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 12
      mov     rcx, rdi
      shr     rcx, 52
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 13
      mov     rcx, rdi
      shr     rcx, 56
      and     ecx, 15
      vpinsrb xmm1, xmm1, ecx, 14
      mov     rcx, rdi
      shr     rcx, 60
      and     dil, 15
      vpinsrb xmm2, xmm1, ecx, 15
      vpshufb xmm1, xmm1, xmmword ptr [rip + .LCPI0_2]
      vmovd   xmm3, edi
      vpandd  xmm1, xmm1, dword ptr [rip + .LCPI0_3]{1to4}
      vpunpckhbw      xmm2, xmm2, xmm2
      vpandd  xmm2, xmm2, dword ptr [rip + .LCPI0_6]{1to4}
      vpinsrb xmm3, xmm3, eax, 1
      vpmovdb xmm1, zmm1
      vpackuswb       xmm2, xmm2, xmm2
      vpmovqb xmm0, xmm0
      vpunpcklwd      xmm0, xmm3, xmm0
      vpunpckldq      xmm0, xmm0, xmm1
      vpunpcklqdq     xmm0, xmm0, xmm2
      vzeroupper
      ret

aarch64 (apple_latest)

_1:
        sub     sp, sp, #16
        ubfx    w8, w0, #4, #4
        and     w9, w0, #0xf
        fmov    s0, w9
        mov     v0.b[1], w8
        ubfx    w8, w0, #8, #4
        mov     v0.b[2], w8
        ubfx    w8, w0, #12, #4
        mov     v0.b[3], w8
        ubfx    w8, w0, #16, #4
        mov     v0.b[4], w8
        ubfx    w8, w0, #20, #4
        mov     v0.b[5], w8
        ubfx    w8, w0, #24, #4
        mov     v0.b[6], w8
        ubfx    x8, x0, #28, #4
        mov     v0.b[7], w8
        ubfx    x8, x0, #32, #4
        mov     v0.b[8], w8
        ubfx    x8, x0, #36, #4
        mov     v0.b[9], w8
        ubfx    x8, x0, #40, #4
        mov     v1.16b, v0.16b
        mov     v1.b[10], w8
        ubfx    x9, x0, #44, #4
        mov     v1.b[11], w9
        ubfx    x10, x0, #48, #4
        mov     v1.b[12], w10
        ubfx    x10, x0, #52, #4
        mov     v1.b[13], w10
        ubfx    x10, x0, #56, #4
        mov     v1.b[14], w10
        lsr     x10, x0, #60
        mov     v1.b[15], w10
        fmov    s2, w8
        mov     v2.s[1], w9
        movi    v3.2s, #15
        and     v2.8b, v2.8b, v3.8b
        ext     v1.16b, v1.16b, v1.16b, #8
        zip2    v1.8b, v1.8b, v0.8b
        bic     v1.4h, #255, lsl #8
        mov     v2.b[1], v2.b[4]
        mov     v0.h[5], v2.h[0]
        xtn     v1.8b, v1.8h
        mov     v0.s[3], v1.s[0]
        add     sp, sp, #16
        ret

You can convince LLVM to produce efficient code for this operation by writing it as one of the following routines.

const std = @import("std");

export fn _2(x: u64) @Vector(16, u8) {
    const vec: @Vector(8, u8) = @bitCast(x);
    return @shuffle(
        u8,
        vec,
        vec >> @splat(4),
        std.simd.interlace(.{ std.simd.iota(i32, 8), ~std.simd.iota(i32, 8) }),
    ) & @as(@Vector(16, u8), @splat(0xF));
}

export fn _3(x: u64) @Vector(16, u8) {
    const vec: @Vector(8, u8) = @bitCast(x);
    return @shuffle(
        u8,
        vec & @as(@Vector(8, u8), @splat(0xF)),
        vec >> @splat(4),
        std.simd.interlace(.{ std.simd.iota(i32, 8), ~std.simd.iota(i32, 8) }),
    );
}
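
For reference, the mask that std.simd.interlace builds here alternates between the two shuffle operands: index i selects an element of the first operand and ~i selects element i of the second. A variant of _2 with that mask written out by hand would look like the sketch below (the name _2_explicit_mask is mine; as far as I can tell it performs the same shuffle as _2 and is only spelled out to make the lane interleaving explicit):

export fn _2_explicit_mask(x: u64) @Vector(16, u8) {
    const vec: @Vector(8, u8) = @bitCast(x);
    // In @shuffle, a non-negative mask element i selects vec[i]; ~i selects
    // element i of the second operand (vec >> 4), so the output lanes
    // alternate low nibble, high nibble, low nibble, ...
    return @shuffle(
        u8,
        vec,
        vec >> @splat(4),
        @Vector(16, i32){
            0, ~@as(i32, 0), 1, ~@as(i32, 1), 2, ~@as(i32, 2), 3, ~@as(i32, 3),
            4, ~@as(i32, 4), 5, ~@as(i32, 5), 6, ~@as(i32, 6), 7, ~@as(i32, 7),
        },
    ) & @as(@Vector(16, u8), @splat(0xF));
}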

Here is the assembly on x86_64 (znver4):

.LCPI0_1:
        .zero   4,15
_2:
        vmovq   xmm0, rdi
        vpsrlw  xmm1, xmm0, 4
        vpunpcklbw      xmm0, xmm0, xmm1
        vpandd  xmm0, xmm0, dword ptr [rip + .LCPI0_1]{1to4}
        ret

.LCPI1_2:
        .zero   4,15
_3:
        vmovq   xmm0, rdi
        vpandd  xmm1, xmm0, dword ptr [rip + .LCPI1_2]{1to4}
        vpsrlw  xmm0, xmm0, 4
        vpandd  xmm0, xmm0, dword ptr [rip + .LCPI1_2]{1to4}
        vpunpcklbw      xmm0, xmm1, xmm0
        ret

Here is the assembly on aarch64 (apple_latest):

_2:
        fmov    d0, x0
        fmov    d1, x0
        ushr    v1.8b, v1.8b, #4
        zip1    v0.16b, v0.16b, v1.16b
        movi    v1.16b, #15
        and     v0.16b, v0.16b, v1.16b
        ret

_3:
        fmov    d0, x0
        movi    v1.8b, #15
        and     v1.8b, v0.8b, v1.8b
        ushr    v0.8b, v0.8b, #4
        zip1    v0.16b, v1.16b, v0.16b
        ret

Unfortunately, there is no clear winner here across architectures: _2 looks more efficient on x86_64 (znver4) and _3 looks more efficient on aarch64 (apple_latest), although I could be wrong. It is somewhat unfortunate that LLVM does not pick one of the hand-rolled forms and compile all of my implementations to exactly the same code, but I would be happy if the simple and easy approach (_1) just gave me optimal code.
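
As a stopgap, one could pick between the two formulations per target at comptime. This is just a sketch under my (possibly wrong) reading of the listings above; expandNibbles is a hypothetical name, and it assumes _2 and _3 from above are in scope:

const builtin = @import("builtin");

export fn expandNibbles(x: u64) @Vector(16, u8) {
    // Comptime-resolved switch: each target gets whichever hand-rolled
    // routine appeared to lower better in the listings above.
    return switch (builtin.cpu.arch) {
        .aarch64, .aarch64_be => _3(x),
        else => _2(x),
    };
}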
