-
Notifications
You must be signed in to change notification settings - Fork 14.6k
Closed
Closed
Copy link
Labels
Description
Godbolt link
I have the following code:
const std = @import("std");
/// Builds a 16-byte vector from the bits of `x`.
///
/// Each of the 8 bits of `x` is expanded to one byte lane, two per-nibble
/// running sums are computed with 64-bit wrapping multiplies, and the low and
/// high nibbles of the selected sums are interleaved into the 16-byte result.
/// NOTE(review): the result is presumably meant to be used as a byte-shuffle
/// control vector (per the function name) - confirm against the caller.
export fn produceShuffleVectorForByte(x: u8) @Vector(16, u8) {
// Each of the 8 byte lanes holds a distinct single-bit mask (0x01 ... 0x80).
const unique_bytes: @Vector(8, u8) = @bitCast(@as(u64, 0x8040_2010_0804_0201));
// Broadcast `x` into all 8 lanes so every lane can test its own bit.
const splatted = @as(@Vector(8, u8), @splat(x));
// A lane is true when the corresponding bit of `x` is NOT set.
const selector = (splatted & unique_bytes) != unique_bytes;
// Lanes whose bit is clear become 0x11 (a 1 in each nibble), the rest 0;
// the 8 lanes are then reinterpreted as a single u64.
const vec: u64 = @bitCast(@select(u8, selector, @as(@Vector(8, u8), @splat(0b00010001)), @as(@Vector(8, u8), @splat(0))));
// Wrapping multiply by 0x1111...1111 adds up the nibbles at and below each
// nibble position (a running sum, modulo 16 per nibble as long as no carry
// crosses nibbles); `<< 4` then moves every sum up by one nibble.
// This scalar multiply is the operation the surrounding report is about.
const prefix_sums1: @Vector(8, u8) = @bitCast(((vec) *% 0x1111111111111111) << 4);
// Same computation on the complementary lanes: the xor flips each lane
// between 0x11 and 0x00, so this sums the lanes whose bit in `x` IS set.
const prefix_sums2: @Vector(8, u8) = @bitCast(((vec ^ 0x1111111111111111) *% 0x1111111111111111) << 4);
// Per lane: take prefix_sums1 where the bit is clear, prefix_sums2 where set.
const interleaved_shuffle_vector = @select(u8, selector, prefix_sums1, prefix_sums2);
return @shuffle(
u8,
// Operand a: low nibble of every lane (shift up then down clears the high nibble).
interleaved_shuffle_vector << @splat(4) >> @splat(4),
// Operand b: high nibble of every lane.
interleaved_shuffle_vector >> @splat(4),
// Mask 0, ~0, 1, ~1, ...: non-negative indices pick from a, `~i` picks
// b[i], so the result is a[0], b[0], a[1], b[1], ... (16 bytes).
std.simd.interlace(.{ std.simd.iota(i32, 8), ~std.simd.iota(i32, 8) }),
);
}
LLVM currently tries to optimize the prefix_sums1 computation by doing the 64-bit multiply directly on the vector
that vec was bitcast from, instead of on the scalar u64.
.LCPI0_0:
.byte 1
.byte 2
.byte 4
.byte 8
.byte 16
.byte 32
.byte 64
.byte 128
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.LCPI0_1:
.byte 17
.byte 17
.byte 17
.byte 17
.byte 17
.byte 17
.byte 17
.byte 17
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.quad 1229782938247303440
.quad 1229782938247303440
.LCPI0_3:
.quad 286331153
.quad 286331153
.LCPI0_4:
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.LCPI0_5:
.zero 16,15
.LCPI0_6:
.quad 1229782938247303440
produceShuffleVectorForByte:
vmovd xmm0, edi
vpxor xmm1, xmm1, xmm1
- vpbroadcastq xmm3, qword ptr [rip + .LCPI0_6]
movabs rcx, 76861433640456465
vpbroadcastb xmm0, xmm0
vpand xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
vpcmpeqb xmm0, xmm0, xmm1
vpand xmm1, xmm0, xmmword ptr [rip + .LCPI0_1]
- vpmuludq xmm4, xmm1, xmmword ptr [rip + .LCPI0_3]
vmovq rax, xmm1
- vpsrlq xmm2, xmm1, 32
- vpmuludq xmm1, xmm1, xmm3
- vpmuludq xmm2, xmm2, xmm3
xor rcx, rax
movabs rax, 1229782938247303440
imul rax, rcx
- vpaddq xmm2, xmm4, xmm2
- vpsllq xmm2, xmm2, 32
vmovq xmm3, rax
- vpaddq xmm1, xmm1, xmm2
vpblendvb xmm0, xmm3, xmm1, xmm0
vpand xmm1, xmm0, xmmword ptr [rip + .LCPI0_4]
vpsrlw xmm0, xmm0, 4
vpand xmm0, xmm0, xmmword ptr [rip + .LCPI0_5]
vpunpcklbw xmm0, xmm1, xmm0
ret
The bad optimization is highlighted in red (the lines prefixed with `-` in the listing above).
I tried sticking an xor with 1 into the prefix_sums1
calculation.
- const prefix_sums1: @Vector(8, u8) = @bitCast(((vec) *% 0x1111111111111111) << 4);
+ const prefix_sums1: @Vector(8, u8) = @bitCast(((vec ^ 1) *% 0x1111111111111111) << 4);
Using that, the optimization was disabled and I got much better codegen. Here is the better assembly I got (shown without the extra xor with 1):
.LCPI0_0:
.byte 1
.byte 2
.byte 4
.byte 8
.byte 16
.byte 32
.byte 64
.byte 128
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.LCPI0_1:
.byte 17
.byte 17
.byte 17
.byte 17
.byte 17
.byte 17
.byte 17
.byte 17
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.LCPI0_2:
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.LCPI0_3:
.zero 16,15
produceShuffleVectorForByte:
vmovd xmm0, edi
vpxor xmm1, xmm1, xmm1
movabs rcx, 76861433640456465
movabs rdx, 1229782938247303440
vpbroadcastb xmm0, xmm0
vpand xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
vpcmpeqb xmm0, xmm0, xmm1
vpand xmm1, xmm0, xmmword ptr [rip + .LCPI0_1]
vmovq rax, xmm1
xor rcx, rax
imul rax, rdx
imul rcx, rdx
vmovq xmm1, rax
vmovq xmm2, rcx
vpblendvb xmm0, xmm2, xmm1, xmm0
vpand xmm1, xmm0, xmmword ptr [rip + .LCPI0_2]
vpsrlw xmm0, xmm0, 4
vpand xmm0, xmm0, xmmword ptr [rip + .LCPI0_3]
vpunpcklbw xmm0, xmm1, xmm0
ret