[RISCV] Support llvm.masked.expandload intrinsic #101954
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Pengcheng Wang (wangpc-pp)

Changes

We can use `viota`+`vrgather` to synthesize `vdecompress` and lower an expanding load to `vcpop`+`load`+`vdecompress`. And if `%mask` is all ones, we can lower the expanding load to a normal unmasked load.

Fixes #101914

Patch is 153.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101954.diff

6 Files Affected:
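For reference, this is the shape of code the lowering produces. For a <4 x i8> expanding load, the new test file below checks the following sequence (explanatory comments added here; v0 holds the mask, a0 the base pointer, v8 the passthru):

vsetivli zero, 4, e8, mf4, ta, ma
vcpop.m a1, v0                   # a1 = number of active mask elements
vsetvli zero, a1, e8, mf4, ta, ma
vle8.v v9, (a0)                  # unit-stride load of the a1 packed elements
vsetivli zero, 4, e8, mf4, ta, mu
viota.m v10, v0                  # v10[i] = number of set mask bits below element i
vrgather.vv v8, v9, v10, v0.t    # expand the packed elements into the active lanes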
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9ee60b9db2837..67908c480fed3 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10732,18 +10732,21 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
SDValue Chain = MemSD->getChain();
SDValue BasePtr = MemSD->getBasePtr();
- SDValue Mask, PassThru, VL;
+ SDValue Mask, PassThru, LoadVL;
+ bool IsExpandingLoad = false;
if (const auto *VPLoad = dyn_cast<VPLoadSDNode>(Op)) {
Mask = VPLoad->getMask();
PassThru = DAG.getUNDEF(VT);
- VL = VPLoad->getVectorLength();
+ LoadVL = VPLoad->getVectorLength();
} else {
const auto *MLoad = cast<MaskedLoadSDNode>(Op);
Mask = MLoad->getMask();
PassThru = MLoad->getPassThru();
+ IsExpandingLoad = MLoad->isExpandingLoad();
}
- bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+ bool IsUnmasked =
+ ISD::isConstantSplatVectorAllOnes(Mask.getNode()) || IsExpandingLoad;
MVT XLenVT = Subtarget.getXLenVT();
@@ -10751,14 +10754,22 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
if (VT.isFixedLengthVector()) {
ContainerVT = getContainerForFixedLengthVector(VT);
PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget);
- if (!IsUnmasked) {
+ if (!IsUnmasked || IsExpandingLoad) {
MVT MaskVT = getMaskTypeFor(ContainerVT);
Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
}
}
- if (!VL)
- VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+ if (!LoadVL)
+ LoadVL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+
+ SDValue ExpandingVL;
+ if (IsExpandingLoad) {
+ ExpandingVL = LoadVL;
+ LoadVL = DAG.getNode(
+ RISCVISD::VCPOP_VL, DL, XLenVT, Mask,
+ getAllOnesMask(Mask.getSimpleValueType(), LoadVL, DL, DAG), LoadVL);
+ }
unsigned IntID =
IsUnmasked ? Intrinsic::riscv_vle : Intrinsic::riscv_vle_mask;
@@ -10770,7 +10781,7 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
Ops.push_back(BasePtr);
if (!IsUnmasked)
Ops.push_back(Mask);
- Ops.push_back(VL);
+ Ops.push_back(LoadVL);
if (!IsUnmasked)
Ops.push_back(DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT));
@@ -10779,6 +10790,18 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
SDValue Result =
DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MemVT, MMO);
Chain = Result.getValue(1);
+ if (IsExpandingLoad) {
+ MVT IotaVT = ContainerVT;
+ if (ContainerVT.isFloatingPoint())
+ IotaVT = ContainerVT.changeVectorElementTypeToInteger();
+
+ SDValue Iota =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IotaVT,
+ DAG.getConstant(Intrinsic::riscv_viota, DL, XLenVT),
+ DAG.getUNDEF(IotaVT), Mask, ExpandingVL);
+ Result = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, ContainerVT, Result,
+ Iota, PassThru, Mask, ExpandingVL);
+ }
if (VT.isFixedLengthVector())
Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 4cd904c039a98..41b39accbf027 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1966,6 +1966,16 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) {
+ auto *VTy = dyn_cast<VectorType>(DataTy);
+ if (!VTy || VTy->isScalableTy())
+ return false;
+
+ if (!isLegalMaskedLoadStore(DataTy, Alignment))
+ return false;
+ return true;
+}
+
bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
auto *VTy = dyn_cast<VectorType>(DataTy);
if (!VTy || VTy->isScalableTy())
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 9c37a4f6ec2d0..192bb35613aad 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -281,6 +281,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment);
}
+ bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment);
+
bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment);
bool isVScaleKnownToBeAPowerOfTwo() const {
diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
new file mode 100644
index 0000000000000..9380b52daf4b7
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
@@ -0,0 +1,1541 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=riscv64 -mattr=+v,+d,+m,+zbb %s -o - | FileCheck %s --check-prefix=RV64
+; RUN: llc -verify-machineinstrs -mtriple=riscv32 -mattr=+v,+d,+m,+zbb %s -o - | FileCheck %s --check-prefix=RV32
+
+; Compress + store for i16 type
+
+define <1 x i8> @test_expandload_v1i8(ptr %base, <1 x i1> %mask, <1 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v1i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vle8.v v9, (a0)
+; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
+; RV64-NEXT: viota.m v10, v0
+; RV64-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v1i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vle8.v v9, (a0)
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
+; RV32-NEXT: viota.m v10, v0
+; RV32-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV32-NEXT: ret
+ %res = call <1 x i8> @llvm.masked.expandload.v1i8(ptr align 1 %base, <1 x i1> %mask, <1 x i8> %passthru)
+ ret <1 x i8> %res
+}
+
+define <2 x i8> @test_expandload_v2i8(ptr %base, <2 x i1> %mask, <2 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vle8.v v9, (a0)
+; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, mu
+; RV64-NEXT: viota.m v10, v0
+; RV64-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vle8.v v9, (a0)
+; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, mu
+; RV32-NEXT: viota.m v10, v0
+; RV32-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV32-NEXT: ret
+ %res = call <2 x i8> @llvm.masked.expandload.v2i8(ptr align 1 %base, <2 x i1> %mask, <2 x i8> %passthru)
+ ret <2 x i8> %res
+}
+
+define <4 x i8> @test_expandload_v4i8(ptr %base, <4 x i1> %mask, <4 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v4i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vle8.v v9, (a0)
+; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
+; RV64-NEXT: viota.m v10, v0
+; RV64-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v4i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vle8.v v9, (a0)
+; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu
+; RV32-NEXT: viota.m v10, v0
+; RV32-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV32-NEXT: ret
+ %res = call <4 x i8> @llvm.masked.expandload.v4i8(ptr align 1 %base, <4 x i1> %mask, <4 x i8> %passthru)
+ ret <4 x i8> %res
+}
+
+define <8 x i8> @test_expandload_v8i8(ptr %base, <8 x i1> %mask, <8 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v8i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vle8.v v9, (a0)
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; RV64-NEXT: viota.m v10, v0
+; RV64-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v8i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vle8.v v9, (a0)
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
+; RV32-NEXT: viota.m v10, v0
+; RV32-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV32-NEXT: ret
+ %res = call <8 x i8> @llvm.masked.expandload.v8i8(ptr align 1 %base, <8 x i1> %mask, <8 x i8> %passthru)
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @test_expandload_v16i8(ptr %base, <16 x i1> %mask, <16 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v16i8:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vle8.v v9, (a0)
+; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; RV64-NEXT: viota.m v10, v0
+; RV64-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v16i8:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vle8.v v9, (a0)
+; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu
+; RV32-NEXT: viota.m v10, v0
+; RV32-NEXT: vrgather.vv v8, v9, v10, v0.t
+; RV32-NEXT: ret
+ %res = call <16 x i8> @llvm.masked.expandload.v16i8(ptr align 1 %base, <16 x i1> %mask, <16 x i8> %passthru)
+ ret <16 x i8> %res
+}
+
+define <32 x i8> @test_expandload_v32i8(ptr %base, <32 x i1> %mask, <32 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v32i8:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV64-NEXT: vcpop.m a2, v0
+; RV64-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; RV64-NEXT: vle8.v v10, (a0)
+; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; RV64-NEXT: viota.m v12, v0
+; RV64-NEXT: vrgather.vv v8, v10, v12, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v32i8:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV32-NEXT: vcpop.m a2, v0
+; RV32-NEXT: vsetvli zero, a2, e8, m2, ta, ma
+; RV32-NEXT: vle8.v v10, (a0)
+; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu
+; RV32-NEXT: viota.m v12, v0
+; RV32-NEXT: vrgather.vv v8, v10, v12, v0.t
+; RV32-NEXT: ret
+ %res = call <32 x i8> @llvm.masked.expandload.v32i8(ptr align 1 %base, <32 x i1> %mask, <32 x i8> %passthru)
+ ret <32 x i8> %res
+}
+
+define <64 x i8> @test_expandload_v64i8(ptr %base, <64 x i1> %mask, <64 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v64i8:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 64
+; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV64-NEXT: vcpop.m a2, v0
+; RV64-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; RV64-NEXT: vle8.v v12, (a0)
+; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, mu
+; RV64-NEXT: viota.m v16, v0
+; RV64-NEXT: vrgather.vv v8, v12, v16, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v64i8:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 64
+; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV32-NEXT: vcpop.m a2, v0
+; RV32-NEXT: vsetvli zero, a2, e8, m4, ta, ma
+; RV32-NEXT: vle8.v v12, (a0)
+; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, mu
+; RV32-NEXT: viota.m v16, v0
+; RV32-NEXT: vrgather.vv v8, v12, v16, v0.t
+; RV32-NEXT: ret
+ %res = call <64 x i8> @llvm.masked.expandload.v64i8(ptr align 1 %base, <64 x i1> %mask, <64 x i8> %passthru)
+ ret <64 x i8> %res
+}
+
+define <128 x i8> @test_expandload_v128i8(ptr %base, <128 x i1> %mask, <128 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v128i8:
+; RV64: # %bb.0:
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV64-NEXT: vcpop.m a2, v0
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vle8.v v16, (a0)
+; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, mu
+; RV64-NEXT: viota.m v24, v0
+; RV64-NEXT: vrgather.vv v8, v16, v24, v0.t
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v128i8:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 128
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT: vcpop.m a2, v0
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vle8.v v16, (a0)
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, mu
+; RV32-NEXT: viota.m v24, v0
+; RV32-NEXT: vrgather.vv v8, v16, v24, v0.t
+; RV32-NEXT: ret
+ %res = call <128 x i8> @llvm.masked.expandload.v128i8(ptr align 1 %base, <128 x i1> %mask, <128 x i8> %passthru)
+ ret <128 x i8> %res
+}
+
+define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8> %passthru) {
+; RV64-LABEL: test_expandload_v256i8:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 5
+; RV64-NEXT: sub sp, sp, a2
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: li a3, 24
+; RV64-NEXT: mul a2, a2, a3
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v7, v8
+; RV64-NEXT: li a2, 128
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vle8.v v8, (a1)
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v0, 1
+; RV64-NEXT: vmv.x.s a1, v9
+; RV64-NEXT: vmv.x.s a3, v0
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vcpop.m a4, v0
+; RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma
+; RV64-NEXT: vle8.v v24, (a0)
+; RV64-NEXT: csrr a4, vlenb
+; RV64-NEXT: slli a4, a4, 4
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: addi a4, a4, 16
+; RV64-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vcpop.m a4, v7
+; RV64-NEXT: cpop a3, a3
+; RV64-NEXT: cpop a1, a1
+; RV64-NEXT: add a0, a0, a3
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: vsetvli zero, a4, e8, m8, ta, ma
+; RV64-NEXT: vle8.v v8, (a0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, mu
+; RV64-NEXT: viota.m v16, v0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vrgather.vv v8, v24, v16, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: viota.m v16, v7
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: vmv1r.v v0, v7
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 4
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vrgather.vv v8, v24, v16, v0.t
+; RV64-NEXT: vmv.v.v v16, v8
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 24
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_expandload_v256i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 5
+; RV32-NEXT: sub sp, sp, a2
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: li a3, 24
+; RV32-NEXT: mul a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v7, v8
+; RV32-NEXT: li a2, 128
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vle8.v v8, (a1)
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 16
+; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v0, 1
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsrl.vx v10, v9, a1
+; RV32-NEXT: vmv.x.s a3, v10
+; RV32-NEXT: vsrl.vx v10, v0, a1
+; RV32-NEXT: vmv.x.s a1, v10
+; RV32-NEXT: vmv.x.s a4, v9
+; RV32-NEXT: vmv.x.s a5, v0
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vcpop.m a6, v0
+; RV32-NEXT: vsetvli zero, a6, e8, m8, ta, ma
+; RV32-NEXT: vle8.v v8, (a0)
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 4
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 16
+; RV32-NEXT: vs8r.v v8, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: cpop a1, a1
+; RV32-NEXT: cpop a5, a5
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: cpop a3, a3
+; RV32-NEXT: cpop a4, a4
+; RV32-NEXT: add a3, a4, a3
+; RV32-NEXT: add a1, a1, a3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV32-NEXT: vcpop.m a1, v7
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT: vle8.v v8, (a0)
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, mu
+; RV32-NEXT: viota.m v16, v0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vrgather.vv v8, v24, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: viota.m v16, v7
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vmv1r.v v0, v7
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, ...
[truncated]
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; CHECK-NEXT: viota.m v10, v0
; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t
Can this be done with a viota.m and an indexed load instead?
; v0 = 0 0 0 1 0 1 0 1
viota.m v9, v0 ; v9 = 3 3 3 2 2 1 1 0
vloxei64.v v8, (a0), v9, v0.t ; v8 = v8.7, v8.6, v8.5, [a0+2], v8.3, [a0+1], v8.1, [a0+0]
Good point! Done!
Both indexed loads and vrgather are slow on current implementations, but the indexed-load form needs fewer instructions. And indexed loads can be optimized at the μ-arch level, which may not be as easy for vrgather.
If anything, I'd imagine vrgather would be easier for hardware to optimize (though I am not a hardware designer), especially for smaller element types; I don't know of any implementation of any architecture where indexed loads are anything better than one memory request per element (i.e. ~3 elements/cycle at best on already very high-end cores), but 16×i8 shuffles are usually able to run at at least 1/cycle (even down to ARM A53; and modern x86-64 cores (i.e. those with 3 loads/cycle) can even do 512-bit shuffles at 1/cycle). LMUL makes things somewhat more complicated, but I'd hope general-purpose cores would be designed in a way that doesn't make vrgather literally never beneficial to use.
If preferring indexed loads over load+vrgather was desired, it should also be done for, say, a reversing loop, but it's not (https://godbolt.org/z/5rPW6aoxq)
A vrgather.vv is likely to be O(N^2) in terms of LMUL (since in the worst-case scenario each register written reads from all LMUL source registers), but vloxeiN.v is O(N) in terms of VL. Or at least that's how we cost them in RISCVTargetTransformInfo anyway, so we might as well be consistent there.
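Spelled out, the costing assumption above is roughly (scaling only; the constants here are illustrative, not the actual cost-model values):

$\mathrm{cost}(\text{vrgather.vv}) \propto \mathrm{LMUL}^2 \qquad \text{vs.} \qquad \mathrm{cost}(\text{vluxei}N\text{.v}) \propto \mathrm{VL} \le \mathrm{LMUL}\cdot\mathrm{VLEN}/\mathrm{SEW}$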
@dzaima Thanks! Good information for other architectures!
For vrgather.vv, you can check https://camel-cdr.github.io/rvv-bench-results/index.html and see the costs of current implementations.
For indexed loads/stores, if the accesses are consecutive (which is the case for this patch), they can be combined at the μ-arch level.
If preferring indexed loads over load+vrgather was desired, it should also be done for, say, a reversing loop, but it's not (https://godbolt.org/z/5rPW6aoxq)
For a reversed loop, we would prefer a strided load with a negative stride; we just haven't optimized that case in the vectorizer yet.
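For reference, a minimal sketch of that strided form (not output produced by this patch or the vectorizer; the e32 element width, VL, and registers are only assumptions, with a0 pointing at the last element of the source):

li a1, -4                        # stride of -4 bytes
vsetivli zero, 4, e32, m1, ta, ma
vlse32.v v8, (a0), a1            # loads a0, a0-4, a0-8, a0-12: the reversed sequence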
@wangpc-pp The BPI-F3 there, for vrgather at e8m1, can process eight elements in a cycle, and four at e16m1. The Milk-V Pioneer at e8m1 can do 32 elements per cycle, at e16m1 16 elts/cycle, at e32m1 8 elts/cycle, and even at e64m1 four elements per cycle (yes, that's XTheadVector not RVV1.0, but IIRC an equivalent RVV1.0 architecture is supposed to come out at some point). These cases should easily exceed the throughput of indexed loads on these architectures, even though the data on those aren't present. (Things indeed get less pretty at higher LMUL though; I suppose cost models would be useful here.)
For the two RVV1.0 architectures, there's a byteswap benchmark, and the top performance is with vrgather. (Though indexed loads aren't even there; I suppose camel-cdr either tested it and it was awful, or just automatically assumed it'd be significantly worse.)

Theoretically, future architectures could check the index list of indexed loads/stores and special-case patterns, but I haven't heard of any RVV implementations doing such, and, seeing as x86-64 has had indexed loads for a decade (and SVE to a smaller extent) but nothing like this optimization, I wouldn't bet on that becoming widespread. And even if it was optimized, it would have to be doing something equivalent to vrgather internally anyway.
In general, every vle*.v + vrgather.vv can be turned into a vluxei*.v (if not a vlse*.v), which could be done as a general optimization if desired.
These cases should easily exceed the throughput of indexed loads on these architectures
vrgather.vv doesn't perform the load though so I'm not sure if we can compare them directly. The vluxei*.v is kinda doing two in one.
I think the big performance concern is LMUL > 1, according to https://camel-cdr.github.io/rvv-bench-results/bpi_f3/index.html it's 16 cycles at e8m2 and 64 at e8m4 on the BPI-F3. The loop vectorizer uses LMUL 2 by default, if it ever learns to emit expanded loads.
Thanks for the discussion! I think we may leave a TODO here to investigate which way is better:
Option 1: vcpop.m+vleN.v+viota.m+vrgather.vv
vsetivli zero, 16, e8, m1, ta, ma
vcpop.m a1, v0
vsetvli zero, a1, e32, m4, ta, ma
vle32.v v12, (a0)
vsetivli zero, 16, e32, m4, ta, mu
viota.m v16, v0
vrgather.vv v8, v12, v16, v0.t
ret
It has:
- Consecutive unit-stride load, which is fast.
- Larger register pressure. We need to keep both v12 and v16 live in the example above.
- More vtype toggles (2).
- vrgather.vv may be slow for large LMUL.
Option 2: viota.m+vsll.vi(optional)+vluxeiN.v
vsetivli zero, 16, e32, m4, ta, ma
viota.m v12, v0
vsll.vi v12, v12, 2, v0.t
vsetvli zero, zero, e32, m4, ta, mu
vluxei32.v v8, (a0), v12, v0.t
ret
It has:
- Indexed load, which is slower.
- Smaller register pressure. Only v12 is used in the example above.
- Fewer instructions (2 or 3) and vtype toggles (only 1).
Currently I prefer the second, but I don't have performance data to support this.
Yeah, I didn't add the index load/store to byteswap, because I saw it perform badly in the LUT4 benchmark, and you can already infer the performance from that.
I think the optimal implementation of a reverse would be an LMUL=8 (or 4, if we want to be conservative) unit-stride load/store plus an 8x/4x unrolled LMUL=1 vrgather.vv.
The byteswap benchmark is basically the same case.
You can express an LMUL>1 vrgather operation that doesn't need to cross register boundaries as multiple LMUL=1 vrgather operations, which gives vrgather O(N) scaling with LMUL.
I've only seen a single case where indexed loads beat vrgather, and it's not by much (20%) and only once we are memory-bound; otherwise vrgather is much faster (up to 4x): https://camel-cdr.github.io/rvv-bench-results/bpi_f3/LUT4.html
I presume this is caused by the particular processor supporting chaining for indexed loads, but not for vrgather.
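To make the splitting point above concrete, a minimal sketch (not from the thread; register choices are arbitrary, and it assumes the index values in v16-v19 never point outside their own source register): an e8, LMUL=4 vrgather rewritten as four LMUL=1 gathers, so the work grows linearly rather than quadratically with LMUL.

vsetvli t0, zero, e8, m1, ta, ma
vrgather.vv v8, v12, v16         # each destination register reads only its own source register
vrgather.vv v9, v13, v17
vrgather.vv v10, v14, v18
vrgather.vv v11, v15, v19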
Running some experiments on the Banana Pi F3, I see that an expanding load via vrgather.vv is indeed about 4x faster at LMUL 1 and still about 2x faster at LMUL 4. At LMUL 8 the difference seems to level off.
Given this is going to be specific to the microarchitecture maybe we should have a subtarget feature to use vrgather.vv here, enabled by default for the spacemit-k1. I would like to be optimistic that at some point there will be hardware with a faster vluxei*.v implementation.
force-pushed from aa662d7 to c3098af
force-pushed from c3098af to c769a69
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; CHECK-NEXT: viota.m v12, v0
; CHECK-NEXT: vsll.vi v12, v12, 1, v0.t
Not for this PR, but as a follow up it would be good to narrow the LMUL of the indices to reduce register pressure
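A rough sketch of what that follow-up could look like for the <32 x i16> case quoted above (not something this patch emits; since the byte offsets here fit in 8 bits, the index vector could live at e8/m2 instead of e16/m4):

li a1, 32
vsetvli zero, a1, e8, m2, ta, ma
viota.m v12, v0                  # element indices, held as 8-bit values
vsll.vi v12, v12, 1, v0.t        # byte offsets for e16 elements
vsetvli zero, zero, e16, m4, ta, mu
vluxei8.v v8, (a0), v12, v0.t    # indexed load with narrowed index EEW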
LGTM
force-pushed from 103a63c to 38b0662
✅ With the latest revision this PR passed the C/C++ code formatter.
force-pushed from 38b0662 to 8288bd6
@lukel97 I have added a subtarget feature to indicate whether indexed load/store instructions are optimal.
LGTM - but please wait a bit in case Luke or Craig have remaining comments. I think everything has been addressed, but maybe I missed something. Please remember to update the review description before merging as that becomes the submission comment.
We can use `viota`+`vrgather` to synthesize `vdecompress` and lower expanding load to `vcpop`+`load`+`vdecompress`. And if `%mask` is all ones, we can lower expanding load to a normal unmasked load. Fixes llvm#101914.
force-pushed from 0e35b2c to f14ae55
We can use `viota`+`vrgather` to synthesize `vdecompress` and lower an expanding load to `vcpop`+`load`+`vdecompress`.

And if `%mask` is all ones, we can lower the expanding load to a normal unmasked load.

Fixes #101914.