[RISCV] Support llvm.masked.expandload intrinsic #101954
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Pengcheng Wang (wangpc-pp)

Changes

We can use `viota`+`vrgather` to synthesize `vdecompress` and lower an expanding load to `vcpop`+`load`+`vdecompress`. And if `%mask` is all ones, we can lower the expanding load to a normal unmasked load.

Fixes #101914

Patch is 153.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101954.diff

6 Files Affected:
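For reference, this is the shape of code the lowering produces. For a <4 x i8> expanding load, the new test file below checks the following sequence (explanatory comments added here; v0 holds the mask, a0 the base pointer, v8 the passthru):

vsetivli zero, 4, e8, mf4, ta, ma
vcpop.m a1, v0                   # a1 = number of active mask elements
vsetvli zero, a1, e8, mf4, ta, ma
vle8.v v9, (a0)                  # unit-stride load of the a1 packed elements
vsetivli zero, 4, e8, mf4, ta, mu
viota.m v10, v0                  # v10[i] = number of set mask bits below element i
vrgather.vv v8, v9, v10, v0.t    # expand the packed elements into the active lanes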
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9ee60b9db2837..67908c480fed3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10732,18 +10732,21 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
SDValue Chain = MemSD->getChain();
SDValue BasePtr = MemSD->getBasePtr();
- SDValue Mask, PassThru, VL;
+ SDValue Mask, PassThru, LoadVL;
+ bool IsExpandingLoad = false;
if (const auto *VPLoad = dyn_cast<VPLoadSDNode>(Op)) {
Mask = VPLoad->getMask();
PassThru = DAG.getUNDEF(VT);
- VL = VPLoad->getVectorLength();
+ LoadVL = VPLoad->getVectorLength();
} else {
const auto *MLoad = cast<MaskedLoadSDNode>(Op);
Mask = MLoad->getMask();
PassThru = MLoad->getPassThru();
+ IsExpandingLoad = MLoad->isExpandingLoad();
}
- bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+ bool IsUnmasked =
+ ISD::isConstantSplatVectorAllOnes(Mask.getNode()) || IsExpandingLoad;
MVT XLenVT = Subtarget.getXLenVT();
@@ -10751,14 +10754,22 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
if (VT.isFixedLengthVector()) {
ContainerVT = getContainerForFixedLengthVector(VT);
PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
- if (!IsUnmasked) {
+ if (!IsUnmasked || IsExpandingLoad) {
MVT MaskVT = getMaskTypeFor(ContainerVT);
Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
}
}
- if (!VL)
- VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+ if (!LoadVL)
+ LoadVL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+
+ SDValue ExpandingVL;
+ if (IsExpandingLoad) {
+ ExpandingVL = LoadVL;
+ LoadVL = DAG.getNode(
+ RISCVISD::VCPOP_VL, DL, XLenVT, Mask,
+ getAllOnesMask(Mask.getSimpleValueType(), LoadVL, DL, DAG), LoadVL);
+ }
unsigned IntID =
IsUnmasked ? Intrinsic::riscv_vle : Intrinsic::riscv_vle_mask;
@@ -10770,7 +10781,7 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
Ops.push_back(BasePtr);
if (!IsUnmasked)
Ops.push_back(Mask);
- Ops.push_back(VL);
+ Ops.push_back(LoadVL);
if (!IsUnmasked)
Ops.push_back(DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT));
@@ -10779,6 +10790,18 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
SDValue Result =
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MemVT, MMO);
Chain = Result.getValue(1);
+ if (IsExpandingLoad) {
+ MVT IotaVT = ContainerVT;
+ if (ContainerVT.isFloatingPoint())
+ IotaVT = ContainerVT.changeVectorElementTypeToInteger();
+
+ SDValue Iota =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IotaVT,
+ DAG.getConstant(Intrinsic::riscv_viota, DL, XLenVT),
+ DAG.getUNDEF(IotaVT), Mask, ExpandingVL);
+ Result = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ContainerVT, Result,
+ Iota, PassThru, Mask, ExpandingVL);
+ }
if (VT.isFixedLengthVector())
Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 4cd904c039a98..41b39accbf027 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1966,6 +1966,16 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
+ auto *VTy = dyn_cast<VectorType>(DataTy);
+ if (!VTy || VTy->isScalableTy())
+ return false;
+
+ if (!isLegalMaskedLoadStore(DataTy, Alignment))
+ return false;
+ return true;
+}
+
bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
auto *VTy = dyn_cast<VectorType>(DataTy);
if (!VTy || VTy->isScalableTy())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 9c37a4f6ec2d0..192bb35613aad 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -281,6 +281,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment);
}
+ bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment);
+
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment);
bool isVScaleKnownToBeAPowerOfTwo() const {
diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
new file mode 100644
index 0000000000000..9380b52daf4b7
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
@@ -0,0 +1,1541 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=riscv64 -mattr=+v,+d,+m,+zbb %s -o - | FileCheck %s --check-prefix=RV64
+; RUN: llc -verify-machineinstrs -mtriple=riscv32 -mattr=+v,+d,+m,+zbb %s -o - | FileCheck %s --check-prefix=RV32
+
+; Compress + store for i16 type
+
+define <1 x i8> @test_expandload_v1i8(ptr %base, <1 x i1> %mask, <1 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vle8.v v9, (a0)
+; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
+; RV64-NEXT: viota.m v10, v0
+; RV64-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vle8.v v9, (a0)
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
+; RV32-NEXT: viota.m v10, v0
+; RV32-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV32-NEXT: ret
+ %res = call <1 x i8> @llvm.masked.expandload.v1i8(ptr align 1 %base, <1 x i1> %mask, <1 x i8> %passthru)
+ ret <1 x i8> %res
+}
+
+define <2 x i8> @test_expandload_v2i8(ptr %base, <2 x i1> %mask, <2 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vle8.v v9, (a0)
+; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu
+; RV64-NEXT: viota.m v10, v0
+; RV64-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vle8.v v9, (a0)
+; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu
+; RV32-NEXT: viota.m v10, v0
+; RV32-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV32-NEXT: ret
+ %res = call <2 x i8> @llvm.masked.expandload.v2i8(ptr align 1 %base, <2 x i1> %mask, <2 x i8> %passthru)
+ ret <2 x i8> %res
+}
+
+define <4 x i8> @test_expandload_v4i8(ptr %base, <4 x i1> %mask, <4 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vle8.v v9, (a0)
+; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
+; RV64-NEXT: viota.m v10, v0
+; RV64-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vle8.v v9, (a0)
+; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
+; RV32-NEXT: viota.m v10, v0
+; RV32-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV32-NEXT: ret
+ %res = call <4 x i8> @llvm.masked.expandload.v4i8(ptr align 1 %base, <4 x i1> %mask, <4 x i8> %passthru)
+ ret <4 x i8> %res
+}
+
+define <8 x i8> @test_expandload_v8i8(ptr %base, <8 x i1> %mask, <8 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v8i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vle8.v v9, (a0)
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; RV64-NEXT: viota.m v10, v0
+; RV64-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v8i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vle8.v v9, (a0)
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; RV32-NEXT: viota.m v10, v0
+; RV32-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV32-NEXT: ret
+ %res = call <8 x i8> @llvm.masked.expandload.v8i8(ptr align 1 %base, <8 x i1> %mask, <8 x i8> %passthru)
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @test_expandload_v16i8(ptr %base, <16 x i1> %mask, <16 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v16i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vle8.v v9, (a0)
+; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; RV64-NEXT: viota.m v10, v0
+; RV64-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v16i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vle8.v v9, (a0)
+; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; RV32-NEXT: viota.m v10, v0
+; RV32-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV32-NEXT: ret
+ %res = call <16 x i8> @llvm.masked.expandload.v16i8(ptr align 1 %base, <16 x i1> %mask, <16 x i8> %passthru)
+ ret <16 x i8> %res
+}
+
+define <32 x i8> @test_expandload_v32i8(ptr %base, <32 x i1> %mask, <32 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v32i8:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV64-NEXT: vcpop.m a2, v0
+; RV64-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; RV64-NEXT: vle8.v v10, (a0)
+; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; RV64-NEXT: viota.m v12, v0
+; RV64-NEXT: vrgather.vv v8, v10, v12, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v32i8:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV32-NEXT: vcpop.m a2, v0
+; RV32-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; RV32-NEXT: vle8.v v10, (a0)
+; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; RV32-NEXT: viota.m v12, v0
+; RV32-NEXT: vrgather.vv v8, v10, v12, v0.t
+; RV32-NEXT: ret
+ %res = call <32 x i8> @llvm.masked.expandload.v32i8(ptr align 1 %base, <32 x i1> %mask, <32 x i8> %passthru)
+ ret <32 x i8> %res
+}
+
+define <64 x i8> @test_expandload_v64i8(ptr %base, <64 x i1> %mask, <64 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v64i8:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 64
+; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV64-NEXT: vcpop.m a2, v0
+; RV64-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; RV64-NEXT: vle8.v v12, (a0)
+; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, mu
+; RV64-NEXT: viota.m v16, v0
+; RV64-NEXT: vrgather.vv v8, v12, v16, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v64i8:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 64
+; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV32-NEXT: vcpop.m a2, v0
+; RV32-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; RV32-NEXT: vle8.v v12, (a0)
+; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, mu
+; RV32-NEXT: viota.m v16, v0
+; RV32-NEXT: vrgather.vv v8, v12, v16, v0.t
+; RV32-NEXT: ret
+ %res = call <64 x i8> @llvm.masked.expandload.v64i8(ptr align 1 %base, <64 x i1> %mask, <64 x i8> %passthru)
+ ret <64 x i8> %res
+}
+
+define <128 x i8> @test_expandload_v128i8(ptr %base, <128 x i1> %mask, <128 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v128i8:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV64-NEXT: vcpop.m a2, v0
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vle8.v v16, (a0)
+; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, mu
+; RV64-NEXT: viota.m v24, v0
+; RV64-NEXT: vrgather.vv v8, v16, v24, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v128i8:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 128
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT: vcpop.m a2, v0
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vle8.v v16, (a0)
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, mu
+; RV32-NEXT: viota.m v24, v0
+; RV32-NEXT: vrgather.vv v8, v16, v24, v0.t
+; RV32-NEXT: ret
+ %res = call <128 x i8> @llvm.masked.expandload.v128i8(ptr align 1 %base, <128 x i1> %mask, <128 x i8> %passthru)
+ ret <128 x i8> %res
+}
+
+define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v256i8:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 5
+; RV64-NEXT: sub sp, sp, a2
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v7, v8
+; RV64-NEXT: li a2, 128
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vle8.v v8, (a1)
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v0, 1
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vmv.x.s a3, v0
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vcpop.m a4, v0
+; RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma
+; RV64-NEXT: vle8.v v24, (a0)
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vcpop.m a4, v7
+; RV64-NEXT: cpop a3, a3
+; RV64-NEXT: cpop a1, a1
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma
+; RV64-NEXT: vle8.v v8, (a0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu
+; RV64-NEXT: viota.m v16, v0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vrgather.vv v8, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: viota.m v16, v7
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v0, v7
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vrgather.vv v8, v24, v16, v0.t
+; RV64-NEXT: vmv.v.v v16, v8
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v256i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v7, v8
+; RV32-NEXT: li a2, 128
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vle8.v v8, (a1)
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v0, 1
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsrl.vx v10, v9, a1
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsrl.vx v10, v0, a1
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: vmv.x.s a4, v9
+; RV32-NEXT: vmv.x.s a5, v0
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vcpop.m a6, v0
+; RV32-NEXT: vsetvli zero, a6, e8, m8, ta, ma
+; RV32-NEXT: vle8.v v8, (a0)
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 4
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: cpop a1, a1
+; RV32-NEXT: cpop a5, a5
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: cpop a3, a3
+; RV32-NEXT: cpop a4, a4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vcpop.m a1, v7
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT: vle8.v v8, (a0)
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, mu
+; RV32-NEXT: viota.m v16, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vrgather.vv v8, v24, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: viota.m v16, v7
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, ...
[truncated]
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; CHECK-NEXT: viota.m v10, v0
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
Can this be done with a viota.m and an indexed load instead?
; v0 = 0 0 0 1 0 1 0 1
viota.m v9, v0 ; v9 = 3 3 3 2 2 1 1 0
vloxei64.v v8, (a0), v9, v0.t ; v8 = v8.7, v8.6, v8.5, [a0+2], v8.3, [a0+1], v8.1, [a0+0]
Good point! Done!
Both indexed loads and vrgather are slow on current implementations, but the indexed-load form needs fewer instructions. And indexed loads can be optimized at the μ-arch level, which may not be as easy for vrgather.
If anything, I'd imagine vrgather would be easier for hardware to optimize (though I am not a hardware designer), especially for smaller element types; I don't know of any implementation of any architecture where indexed loads are anything better than one memory request per element (i.e. ~3 elements/cycle at best on already very high-end cores), but 16×i8 shuffles are usually able to run at at least 1/cycle (even down to ARM A53; and modern x86-64 cores (i.e. those with 3 loads/cycle) can even do 512-bit shuffles at 1/cycle). LMUL makes things somewhat more complicated, but I'd hope general-purpose cores would be designed in a way that doesn't make vrgather literally never beneficial to use.
If preferring indexed loads over load+vrgather was desired, it should also be done for, say, a reversing loop, but it's not (https://godbolt.org/z/5rPW6aoxq)
A vrgather.vv is likely to be O(N^2) in terms of LMUL (since in the worst-case scenario each register written reads from all LMUL source registers), but vloxeiN.v is O(N) in terms of VL. Or at least that's how we cost them in RISCVTargetTransformInfo anyway, so we might as well be consistent there.
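Spelled out, the costing assumption above is roughly (scaling only; the constants here are illustrative, not the actual cost-model values):

$\mathrm{cost}(\text{vrgather.vv}) \propto \mathrm{LMUL}^2 \qquad \text{vs.} \qquad \mathrm{cost}(\text{vluxei}N\text{.v}) \propto \mathrm{VL} \le \mathrm{LMUL}\cdot\mathrm{VLEN}/\mathrm{SEW}$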
@dzaima Thanks! Good information for other architectures!
For vrgather.vv, you can check https://camel-cdr.github.io/rvv-bench-results/index.html and see the costs of current implementations.
For indexed loads/stores, if the accesses are consecutive (which is the case for this patch), they can be combined at the μ-arch level.
If preferring indexed loads over load+vrgather was desired, it should also be done for, say, a reversing loop, but it's not (https://godbolt.org/z/5rPW6aoxq)
For a reversed loop, we would prefer a strided load with a negative stride; we just haven't optimized that case in the vectorizer yet.
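For reference, a minimal sketch of that strided form (not output produced by this patch or the vectorizer; the e32 element width, VL, and registers are only assumptions, with a0 pointing at the last element of the source):

li a1, -4                        # stride of -4 bytes
vsetivli zero, 4, e32, m1, ta, ma
vlse32.v v8, (a0), a1            # loads a0, a0-4, a0-8, a0-12: the reversed sequence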
@wangpc-pp The BPI-F3 there, for vrgather at e8m1, can process eight elements in a cycle, and four at e16m1. The Milk-V Pioneer at e8m1 can do 32 elements per cycle, at e16m1 16 elts/cycle, at e32m1 8 elts/cycle, and even at e64m1 four elements per cycle (yes, that's XTheadVector not RVV1.0, but IIRC an equivalent RVV1.0 architecture is supposed to come out at some point). These cases should easily exceed the throughput of indexed loads on these architectures, even though the data on those aren't present. (Things indeed get less pretty at higher LMUL though; I suppose cost models would be useful here.)
For the two RVV1.0 architectures, there's a byteswap benchmark, and the top performance is with vrgather. (Though indexed loads aren't even there; I suppose camel-cdr either tested it and it was awful, or just automatically assumed it'd be significantly worse.)

Theoretically, future architectures could check the index list of indexed loads/stores and special-case patterns, but I haven't heard of any RVV implementations doing such, and, seeing as x86-64 has had indexed loads for a decade (and SVE to a smaller extent) but nothing like this optimization, I wouldn't bet on that becoming widespread. And even if it was optimized, it would have to be doing something equivalent to vrgather internally anyway.
In general, every vle*.v + vrgather.vv can be turned into a vluxei*.v (if not a vlse*.v), which could be done as a general optimization if desired.
These cases should easily exceed the throughput of indexed loads on these architectures
vrgather.vv doesn't perform the load though so I'm not sure if we can compare them directly. The vluxei*.v is kinda doing two in one.
I think the big performance concern is LMUL > 1, according to https://camel-cdr.github.io/rvv-bench-results/bpi_f3/index.html it's 16 cycles at e8m2 and 64 at e8m4 on the BPI-F3. The loop vectorizer uses LMUL 2 by default, if it ever learns to emit expanded loads.
Thanks for the discussion! I think we may leave a TODO here to investigate which way is better:
Option 1: vcpop.m+vleN.v+viota.m+vrgather.vv
vsetivli zero, 16, e8, m1, ta, ma
vcpop.m a1, v0
vsetvli zero, a1, e32, m4, ta, ma
vle32.v v12, (a0)
vsetivli zero, 16, e32, m4, ta, mu
viota.m v16, v0
vrgather.vv v8, v12, v16, v0.t
ret
It has:
- Consecutive unit-stride load, which is fast.
- Larger register pressure. We need to keep both v12 and v16 live in the example above.
- More vtype toggles (2).
- vrgather.vv may be slow for large LMUL.
Option 2: viota.m+vsll.vi(optional)+vluxeiN.v
vsetivli zero, 16, e32, m4, ta, ma
viota.m v12, v0
vsll.vi v12, v12, 2, v0.t
vsetvli zero, zero, e32, m4, ta, mu
vluxei32.v v8, (a0), v12, v0.t
ret
It has:
- Indexed load, which is slower.
- Smaller register pressure. Only v12 is used in the example above.
- Fewer instructions (2 or 3) and vtype toggles (only 1).
Currently I prefer the second, but I don't have performance data to support this.
Yeah, I didn't add the index load/store to byteswap, because I saw it perform badly in the LUT4 benchmark, and you can already infer the performance from that.
I think the optimal implementation of a reverse would be an LMUL=8 (or 4, if we want to be conservative) unit-stride load/store plus an 8x/4x unrolled LMUL=1 vrgather.vv.
The byteswap benchmark is basically the same case.
You can express an LMUL>1 vrgather operation that doesn't need to cross register boundaries as multiple LMUL=1 vrgather operations, which gives vrgather O(N) scaling with LMUL.
I've only seen a single case where indexed loads beat vrgather, and it's not by much (20%) and only once we are memory-bound; otherwise vrgather is much faster (up to 4x): https://camel-cdr.github.io/rvv-bench-results/bpi_f3/LUT4.html
I presume this is caused by the particular processor supporting chaining for indexed loads, but not for vrgather.
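To make the splitting point above concrete, a minimal sketch (not from the thread; register choices are arbitrary, and it assumes the index values in v16-v19 never point outside their own source register): an e8, LMUL=4 vrgather rewritten as four LMUL=1 gathers, so the work grows linearly rather than quadratically with LMUL.

vsetvli t0, zero, e8, m1, ta, ma
vrgather.vv v8, v12, v16         # each destination register reads only its own source register
vrgather.vv v9, v13, v17
vrgather.vv v10, v14, v18
vrgather.vv v11, v15, v19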
Running some experiments on the Banana Pi F3, I see that an expanding load via vrgather.vv is indeed about 4x faster at LMUL 1 and still about 2x faster at LMUL 4. At LMUL 8 the difference seems to level off.
Given this is going to be specific to the microarchitecture maybe we should have a subtarget feature to use vrgather.vv here, enabled by default for the spacemit-k1. I would like to be optimistic that at some point there will be hardware with a faster vluxei*.v implementation.
force-pushed from aa662d7 to c3098af
force-pushed from c3098af to c769a69
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; CHECK-NEXT: viota.m v12, v0
; CHECK-NEXT: vsll.vi v12, v12, 1, v0.t
Not for this PR, but as a follow up it would be good to narrow the LMUL of the indices to reduce register pressure
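A rough sketch of what that follow-up could look like for the <32 x i16> case quoted above (not something this patch emits; since the byte offsets here fit in 8 bits, the index vector could live at e8/m2 instead of e16/m4):

li a1, 32
vsetvli zero, a1, e8, m2, ta, ma
viota.m v12, v0                  # element indices, held as 8-bit values
vsll.vi v12, v12, 1, v0.t        # byte offsets for e16 elements
vsetvli zero, zero, e16, m4, ta, mu
vluxei8.v v8, (a0), v12, v0.t    # indexed load with narrowed index EEW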
LGTM
force-pushed from 103a63c to 38b0662
✅ With the latest revision this PR passed the C/C++ code formatter.
force-pushed from 38b0662 to 8288bd6
@lukel97 I have added a subtarget feature to indicate whether indexed load/store instructions are optimal.
LGTM - but please wait a bit in case Luke or Craig have remaining comments. I think everything has been addressed, but maybe I missed something. Please remember to update the review description before merging as that becomes the submission comment.
We can use `viota`+`vrgather` to synthesize `vdecompress` and lower expanding load to `vcpop`+`load`+`vdecompress`. And if `%mask` is all ones, we can lower expanding load to a normal unmasked load. Fixes llvm#101914.
force-pushed from 0e35b2c to f14ae55
We can use `viota`+`vrgather` to synthesize `vdecompress` and lower an expanding load to `vcpop`+`load`+`vdecompress`.

And if `%mask` is all ones, we can lower the expanding load to a normal unmasked load.

Fixes #101914.