Skip to content

[x86] Bad optimization of a multiply by a scalar register that came from a vector #78897

@Validark

Description

@Validark

Godbolt link
I have the following code:

const std = @import("std");

/// Builds a 16-byte vector from the 8 bits of `x`.
///
/// Outline of the steps below:
///   1. Test every bit of `x` in its own byte lane.
///   2. Turn the per-lane results into a u64 and run two scalar 64-bit
///      wrapping multiplies over it (one for the original pattern, one for
///      its complement).
///   3. Pick one of the two products per lane, split every byte into its
///      low and high nibble, and interleave those nibbles into 16 bytes.
///
/// NOTE(review): judging by the name, the result is presumably meant to be
/// used as a byte-shuffle control vector; the consumer is not shown here.
export fn produceShuffleVectorForByte(x: u8) @Vector(16, u8) {
    // One distinct single-bit mask per lane; on little-endian x86 the lanes
    // hold 0x01, 0x02, 0x04, ..., 0x80.
    const unique_bytes: @Vector(8, u8) = @bitCast(@as(u64, 0x8040_2010_0804_0201));
    // Copy `x` into all 8 lanes so each lane can test a different bit.
    const splatted = @as(@Vector(8, u8), @splat(x));
    // Lane i is true when that lane's mask bit is NOT set in `x`.
    const selector = (splatted & unique_bytes) != unique_bytes;
    // Reinterpret the lanes as a single u64: 0x11 (a 1 in both nibbles) where
    // `selector` is true, 0x00 elsewhere.
    const vec: u64 = @bitCast(@select(u8, selector, @as(@Vector(8, u8), @splat(0b00010001)), @as(@Vector(8, u8), @splat(0))));
    
    // A wrapping multiply by 0x1111_1111_1111_1111 equals the sum (mod 2^64) of
    // `vec` shifted left by 0, 4, 8, ... bits, which is where the "prefix sums"
    // name comes from. The `<< 4` then moves the product up by one nibble.
    // This is the scalar multiply of a value that came from a vector which
    // this report is about.
    const prefix_sums1: @Vector(8, u8) = @bitCast(((vec) *% 0x1111111111111111) << 4);
    // Same computation on the complementary pattern: the xor toggles every
    // nibble's low bit, giving 0x11 exactly in the lanes where `selector` is false.
    const prefix_sums2: @Vector(8, u8) = @bitCast(((vec ^ 0x1111111111111111) *% 0x1111111111111111) << 4);

    // Per lane: take prefix_sums1 where `selector` is true, prefix_sums2 otherwise.
    const interleaved_shuffle_vector = @select(u8, selector, prefix_sums1, prefix_sums2);

    return @shuffle(
        u8,
        // First operand: the low nibble of every byte (shift up, then back down).
        interleaved_shuffle_vector << @splat(4) >> @splat(4),
        // Second operand: the high nibble of every byte.
        interleaved_shuffle_vector >> @splat(4),
        // Mask alternates i and ~i. In @shuffle a negative index ~i selects
        // element i of the second operand, so the output order is
        // low0, high0, low1, high1, ..., low7, high7.
        std.simd.interlace(.{ std.simd.iota(i32, 8), ~std.simd.iota(i32, 8) }),
    );
}

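LLVM currently tries to optimize the prefix_sums1 multiply by operating directly on the vector that vec was bitcast from, emulating the 64-bit multiply with a sequence of vector instructions instead of a single scalar imul: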

.LCPI0_0:
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
.LCPI0_1:
        .byte   17
        .byte   17
        .byte   17
        .byte   17
        .byte   17
        .byte   17
        .byte   17
        .byte   17
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .quad   1229782938247303440
        .quad   1229782938247303440
.LCPI0_3:
        .quad   286331153
        .quad   286331153
.LCPI0_4:
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
.LCPI0_5:
        .zero   16,15
.LCPI0_6:
        .quad   1229782938247303440
produceShuffleVectorForByte:
        vmovd   xmm0, edi
        vpxor   xmm1, xmm1, xmm1
-       vpbroadcastq    xmm3, qword ptr [rip + .LCPI0_6]
        movabs  rcx, 76861433640456465
        vpbroadcastb    xmm0, xmm0
        vpand   xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
        vpcmpeqb        xmm0, xmm0, xmm1
        vpand   xmm1, xmm0, xmmword ptr [rip + .LCPI0_1]
-       vpmuludq        xmm4, xmm1, xmmword ptr [rip + .LCPI0_3]
        vmovq   rax, xmm1
-       vpsrlq  xmm2, xmm1, 32
-       vpmuludq        xmm1, xmm1, xmm3
-       vpmuludq        xmm2, xmm2, xmm3
        xor     rcx, rax
        movabs  rax, 1229782938247303440
        imul    rax, rcx
-       vpaddq  xmm2, xmm4, xmm2
-       vpsllq  xmm2, xmm2, 32
        vmovq   xmm3, rax
-       vpaddq  xmm1, xmm1, xmm2
        vpblendvb       xmm0, xmm3, xmm1, xmm0
        vpand   xmm1, xmm0, xmmword ptr [rip + .LCPI0_4]
        vpsrlw  xmm0, xmm0, 4
        vpand   xmm0, xmm0, xmmword ptr [rip + .LCPI0_5]
        vpunpcklbw      xmm0, xmm1, xmm0
        ret

The bad optimization is highlighted in red (the lines prefixed with `-` in the listing above).

I tried sticking an xor with 1 into the prefix_sums1 calculation.

-   const prefix_sums1: @Vector(8, u8) = @bitCast(((vec) *% 0x1111111111111111) << 4);
+   const prefix_sums1: @Vector(8, u8) = @bitCast(((vec ^ 1) *% 0x1111111111111111) << 4);

With that change, the optimization was no longer applied and the generated code was much better. Here is the better assembly I got (shown without the extra xor with 1):

.LCPI0_0:
        .byte   1
        .byte   2
        .byte   4
        .byte   8
        .byte   16
        .byte   32
        .byte   64
        .byte   128
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
.LCPI0_1:
        .byte   17
        .byte   17
        .byte   17
        .byte   17
        .byte   17
        .byte   17
        .byte   17
        .byte   17
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
.LCPI0_2:
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
.LCPI0_3:
        .zero   16,15
produceShuffleVectorForByte:
        vmovd   xmm0, edi
        vpxor   xmm1, xmm1, xmm1
        movabs  rcx, 76861433640456465
        movabs  rdx, 1229782938247303440
        vpbroadcastb    xmm0, xmm0
        vpand   xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
        vpcmpeqb        xmm0, xmm0, xmm1
        vpand   xmm1, xmm0, xmmword ptr [rip + .LCPI0_1]
        vmovq   rax, xmm1
        xor     rcx, rax
        imul    rax, rdx
        imul    rcx, rdx
        vmovq   xmm1, rax
        vmovq   xmm2, rcx
        vpblendvb       xmm0, xmm2, xmm1, xmm0
        vpand   xmm1, xmm0, xmmword ptr [rip + .LCPI0_2]
        vpsrlw  xmm0, xmm0, 4
        vpand   xmm0, xmm0, xmmword ptr [rip + .LCPI0_3]
        vpunpcklbw      xmm0, xmm1, xmm0
        ret

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions