[RISC-V] expandload should compile to viota+vrgather #101914

Closed
Validark opened this issue Aug 5, 2024 · 1 comment · Fixed by #101954
@Validark

Validark commented Aug 5, 2024

```zig
export fn expandload16(a: *const [16]u8, b: u16, c: @Vector(16, u8)) @Vector(16, u8) {
    return struct {
        extern fn @"llvm.masked.expandload.v16i8"(@TypeOf(a), @Vector(16, u1), @TypeOf(c)) callconv(.Unspecified) @Vector(16, u8);
    }.@"llvm.masked.expandload.v16i8"(a, @as(@Vector(16, u1), @bitCast(b)), c);
}
```
```llvm
define dso_local <16 x i8> @expandload16(ptr nocapture nonnull readonly align 1 %0, i16 zeroext %1, <16 x i8> %2) local_unnamed_addr {
Entry:
  %3 = bitcast i16 %1 to <16 x i1>
  %4 = tail call fastcc <16 x i8> @llvm.masked.expandload.v16i8(ptr nonnull readonly align 1 %0, <16 x i1> %3, <16 x i8> %2)
  ret <16 x i8> %4
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

declare fastcc <16 x i8> @llvm.masked.expandload.v16i8(ptr nocapture, <16 x i1>, <16 x i8>) #2
```

When compiled for the SiFive X280, the generated code tests the mask bit by bit and branches on each one:

```asm
...
.LBB0_2:
        andi    a2, a1, 4
        bnez    a2, .LBB0_20
.LBB0_3:
        andi    a2, a1, 8
        bnez    a2, .LBB0_21
.LBB0_4:
        andi    a2, a1, 16
        bnez    a2, .LBB0_22
.LBB0_5:
        andi    a2, a1, 32
        bnez    a2, .LBB0_23
.LBB0_6:
        andi    a2, a1, 64
        bnez    a2, .LBB0_24
.LBB0_7:
        andi    a2, a1, 128
        bnez    a2, .LBB0_25
.LBB0_8:
        andi    a2, a1, 256
        bnez    a2, .LBB0_26
.LBB0_9:
        andi    a2, a1, 512
        bnez    a2, .LBB0_27
.LBB0_10:
        andi    a2, a1, 1024
        bnez    a2, .LBB0_28
...
```

This should instead compile to `viota`+`vrgather`, following the `vdecompress` synthesis described in the RISC-V V spec:

https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#1651-synthesizing-vdecompress
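To see why the spec's `viota.m` + masked `vrgather.vv` sequence synthesizes `vdecompress`, here is a small Python model of the two operations (illustrative only; the function names mirror the instructions being modeled, not any real API):

```python
def viota(mask):
    """viota.m: element i receives the count of set mask bits below position i."""
    out, count = [], 0
    for m in mask:
        out.append(count)
        count += m
    return out

def vdecompress(packed, mask, passthru):
    """Synthesized vdecompress: a masked vrgather.vv using viota indices.

    Active lanes pull consecutive elements from `packed`;
    inactive lanes keep their passthru value."""
    idx = viota(mask)
    return [packed[idx[i]] if mask[i] else passthru[i]
            for i in range(len(mask))]

mask     = [1, 0, 1, 1, 0, 0, 1, 0]
packed   = [10, 20, 30, 40]   # the densely packed source elements
passthru = [0] * 8
print(vdecompress(packed, mask, passthru))
# [10, 0, 20, 30, 0, 0, 40, 0]
```

Each active lane i reads `packed[viota[i]]`, which is exactly the next unconsumed packed element, so no scalar branching on individual mask bits is needed.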

@llvmbot (Member) commented Aug 5, 2024

@llvm/issue-subscribers-backend-risc-v

Author: Niles Salter (Validark)


@wangpc-pp wangpc-pp self-assigned this Aug 5, 2024
wangpc-pp added a commit to wangpc-pp/llvm-project that referenced this issue Aug 5, 2024
We can use `iota+vrgather` to synthesize `vdecompress` and lower
expanding load to `vcpop+load+vdecompress`.

Fixes llvm#101914
wangpc-pp added a commit to wangpc-pp/llvm-project that referenced this issue Aug 5, 2024
We can use `viota.m` + indexed load to synthesize expanding load:
```
%res = llvm.masked.expandload(%ptr, %mask, %passthru)
->
%index = viota %mask
if elt_size > 8:
  %index = vsll.vi %index, log2(elt_size), %mask
%res = vluxei<n> %passthru, %ptr, %index, %mask
```

And if `%mask` is all ones, we can lower expanding load to a normal
unmasked load.

Fixes llvm#101914
wangpc-pp added a commit to wangpc-pp/llvm-project that referenced this issue Aug 6, 2024
We can use `viota.m` + indexed load to synthesize expanding load:
```
%res = llvm.masked.expandload(%ptr, %mask, %passthru)
->
%index = viota %mask
if elt_size > 8:
  %index = vsll.vi %index, log2(elt_size), %mask
%res = vluxei<n> %passthru, %ptr, %index, %mask
```

And if `%mask` is all ones, we can lower expanding load to a normal
unmasked load.

Fixes llvm#101914
wangpc-pp added a commit to wangpc-pp/llvm-project that referenced this issue Oct 23, 2024
We can use `viota.m` + indexed load to synthesize expanding load:
```
%res = llvm.masked.expandload(%ptr, %mask, %passthru)
->
%index = viota %mask
if elt_size > 8:
  %index = vsll.vi %index, log2(elt_size), %mask
%res = vluxei<n> %passthru, %ptr, %index, %mask
```

And if `%mask` is all ones, we can lower expanding load to a normal
unmasked load.

Fixes llvm#101914
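The `viota.m` + indexed-load lowering sketched in the commit message above can be modeled in Python, treating memory as a flat list of byte elements (an illustrative sketch, not the actual codegen):

```python
def viota(mask):
    """viota.m: element i receives the count of set mask bits below position i."""
    out, count = [], 0
    for m in mask:
        out.append(count)
        count += m
    return out

def expandload_indexed(memory, base, mask, passthru):
    """Model of the lowering for byte elements (elt_size == 1):
      %index = viota %mask
      %res   = vluxei8 %passthru, %ptr, %index, %mask  (masked indexed load)
    For wider elements the indices are first scaled with
    vsll.vi %index, log2(elt_size)."""
    index = viota(mask)
    return [memory[base + index[i]] if mask[i] else passthru[i]
            for i in range(len(mask))]

memory = [10, 20, 30, 40, 99, 99, 99, 99]
mask = [1, 0, 1, 1]
print(expandload_indexed(memory, 0, mask, [0] * 4))
# [10, 0, 20, 30]
```

Each active lane loads from `base + (number of active lanes before it)`, so the active lanes collectively read the contiguous run of elements an expanding load consumes.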
wangpc-pp added a commit to wangpc-pp/llvm-project that referenced this issue Oct 31, 2024
We can use `viota`+`vrgather` to synthesize `vdecompress` and lower
expanding load to `vcpop`+`load`+`vdecompress`.

And if `%mask` is all ones, we can lower expanding load to a normal
unmasked load.

Fixes llvm#101914.
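The `vcpop`+`load`+`vdecompress` lowering described in the commit message above can be modeled end to end in Python: count the active lanes, load that many contiguous elements, then decompress them into their masked positions (an illustrative sketch, not the actual codegen):

```python
def expandload_vdecompress(memory, base, mask, passthru):
    """Model of: vcpop.m, then one contiguous load, then a
    viota + masked vrgather decompress."""
    n = sum(mask)                    # vcpop.m: number of active lanes
    packed = memory[base:base + n]   # a single contiguous vector load
    out, count = [], 0
    for i, m in enumerate(mask):     # viota + masked vrgather, fused in one loop
        out.append(packed[count] if m else passthru[i])
        count += m
    return out

memory = [10, 20, 30, 40, 99, 99, 99, 99]
mask = [1, 0, 1, 1, 0, 0, 1, 0]
print(expandload_vdecompress(memory, 0, mask, [0] * 8))
# [10, 0, 20, 30, 0, 0, 40, 0]
```

Compared with the indexed-load variant, this reads memory with one unit-stride load of exactly `vcpop(mask)` elements, which is typically cheaper than a gather on hardware where indexed loads are slow.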
smallp-o-p pushed a commit to smallp-o-p/llvm-project that referenced this issue Nov 3, 2024
We can use `viota`+`vrgather` to synthesize `vdecompress` and lower
expanding load to `vcpop`+`load`+`vdecompress`.

And if `%mask` is all ones, we can lower expanding load to a normal
unmasked load.

Fixes llvm#101914.
NoumanAmir657 pushed a commit to NoumanAmir657/llvm-project that referenced this issue Nov 4, 2024
We can use `viota`+`vrgather` to synthesize `vdecompress` and lower
expanding load to `vcpop`+`load`+`vdecompress`.

And if `%mask` is all ones, we can lower expanding load to a normal
unmasked load.

Fixes llvm#101914.