[RISCV] Support llvm.masked.compressstore intrinsic #83457
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-risc-v

Author: Kolya Panchenko (nikolaypanchenko)

Changes

Patch is 592.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83457.diff

4 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e647f56416bfa6..9be5138b05df07 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10460,6 +10460,7 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
SDValue BasePtr = MemSD->getBasePtr();
SDValue Val, Mask, VL;
+ bool IsCompressingStore = false;
if (const auto *VPStore = dyn_cast<VPStoreSDNode>(Op)) {
Val = VPStore->getValue();
Mask = VPStore->getMask();
@@ -10468,9 +10469,11 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
const auto *MStore = cast<MaskedStoreSDNode>(Op);
Val = MStore->getValue();
Mask = MStore->getMask();
+ IsCompressingStore = MStore->isCompressingStore();
}
- bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
+ bool IsUnmasked =
+ ISD::isConstantSplatVectorAllOnes(Mask.getNode()) || IsCompressingStore;
MVT VT = Val.getSimpleValueType();
MVT XLenVT = Subtarget.getXLenVT();
@@ -10480,7 +10483,7 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
ContainerVT = getContainerForFixedLengthVector(VT);
Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget);
- if (!IsUnmasked) {
+ if (!IsUnmasked || IsCompressingStore) {
MVT MaskVT = getMaskTypeFor(ContainerVT);
Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
}
@@ -10489,6 +10492,15 @@ SDValue RISCVTargetLowering::lowerMaskedStore(SDValue Op,
if (!VL)
VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
+ if (IsCompressingStore) {
+ Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
+ DAG.getConstant(Intrinsic::riscv_vcompress, DL, XLenVT),
+ DAG.getUNDEF(ContainerVT), Val, Mask, VL);
+ VL =
+ DAG.getNode(RISCVISD::VCPOP_VL, DL, XLenVT, Mask,
+ getAllOnesMask(Mask.getSimpleValueType(), VL, DL, DAG), VL);
+ }
+
unsigned IntID =
IsUnmasked ? Intrinsic::riscv_vse : Intrinsic::riscv_vse_mask;
SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 2e4e69fb4f920f..b54474b14b0543 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1609,3 +1609,10 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
C2.NumIVMuls, C2.NumBaseAdds,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
+
+bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
+ auto *VTy = dyn_cast<VectorType>(DataTy);
+ if (!VTy || VTy->isScalableTy() || !ST->hasVInstructions())
+ return false;
+ return getRegUsageForType(VTy) <= 8;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index af36e9d5d5e886..6433027cce0e27 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -261,6 +261,8 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment);
}
+ bool isLegalMaskedCompressStore(Type *DataTy);
+
bool isVScaleKnownToBeAPowerOfTwo() const {
return TLI->isVScaleKnownToBeAPowerOfTwo();
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
new file mode 100644
index 00000000000000..f227f39740a003
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
@@ -0,0 +1,17545 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs -mtriple=riscv64 -mattr=+v,+d %s -o - | FileCheck %s --check-prefix=RV64
+; RUN: llc -verify-machineinstrs -mtriple=riscv32 -mattr=+v,+d %s -o - | FileCheck %s --check-prefix=RV32
+
+; Compress + store for i8 type
+
+define void @test_compresstore_i8_v1(ptr %p, <1 x i1> %mask, <1 x i8> %data) {
+; RV64-LABEL: test_compresstore_i8_v1:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vse8.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_i8_v1:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vse8.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v1i8(<1 x i8> %data, ptr %p, <1 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_i8_v2(ptr %p, <2 x i1> %mask, <2 x i8> %data) {
+; RV64-LABEL: test_compresstore_i8_v2:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV64-NEXT: vse8.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_i8_v2:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; RV32-NEXT: vse8.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v2i8(<2 x i8> %data, ptr %p, <2 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_i8_v4(ptr %p, <4 x i1> %mask, <4 x i8> %data) {
+; RV64-LABEL: test_compresstore_i8_v4:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV64-NEXT: vse8.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_i8_v4:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; RV32-NEXT: vse8.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v4i8(<4 x i8> %data, ptr %p, <4 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_i8_v8(ptr %p, <8 x i1> %mask, <8 x i8> %data) {
+; RV64-LABEL: test_compresstore_i8_v8:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vse8.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_i8_v8:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vse8.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v8i8(<8 x i8> %data, ptr %p, <8 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_i8_v16(ptr %p, <16 x i1> %mask, <16 x i8> %data) {
+; RV64-LABEL: test_compresstore_i8_v16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV64-NEXT: vcompress.vm v9, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV64-NEXT: vse8.v v9, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_i8_v16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; RV32-NEXT: vcompress.vm v9, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; RV32-NEXT: vse8.v v9, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v16i8(<16 x i8> %data, ptr %p, <16 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_i8_v32(ptr %p, <32 x i1> %mask, <32 x i8> %data) {
+; RV64-LABEL: test_compresstore_i8_v32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV64-NEXT: vcompress.vm v10, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV64-NEXT: vse8.v v10, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_i8_v32:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV32-NEXT: vcompress.vm v10, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; RV32-NEXT: vse8.v v10, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v32i8(<32 x i8> %data, ptr %p, <32 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_i8_v64(ptr %p, <64 x i1> %mask, <64 x i8> %data) {
+; RV64-LABEL: test_compresstore_i8_v64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 64
+; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV64-NEXT: vcompress.vm v12, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV64-NEXT: vse8.v v12, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_i8_v64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 64
+; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV32-NEXT: vcompress.vm v12, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; RV32-NEXT: vse8.v v12, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v64i8(<64 x i8> %data, ptr %p, <64 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_i8_v128(ptr %p, <128 x i1> %mask, <128 x i8> %data) {
+; RV64-LABEL: test_compresstore_i8_v128:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV64-NEXT: vcompress.vm v16, v8, v0
+; RV64-NEXT: vcpop.m a1, v0
+; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV64-NEXT: vse8.v v16, (a0)
+; RV64-NEXT: ret
+;
+; RV32-LABEL: test_compresstore_i8_v128:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 128
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT: vcompress.vm v16, v8, v0
+; RV32-NEXT: vcpop.m a1, v0
+; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-NEXT: vse8.v v16, (a0)
+; RV32-NEXT: ret
+entry:
+ tail call void @llvm.masked.compressstore.v128i8(<128 x i8> %data, ptr %p, <128 x i1> %mask)
+ ret void
+}
+
+define void @test_compresstore_i8_v256(ptr %p, <256 x i1> %mask, <256 x i8> %data) {
+; RV64-LABEL: test_compresstore_i8_v256:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a2, 128
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; RV64-NEXT: vle8.v v24, (a1)
+; RV64-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV64-NEXT: vmv.x.s a2, v0
+; RV64-NEXT: andi a1, a2, 1
+; RV64-NEXT: bnez a1, .LBB8_273
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a1, a2, 2
+; RV64-NEXT: bnez a1, .LBB8_274
+; RV64-NEXT: .LBB8_2: # %else2
+; RV64-NEXT: andi a1, a2, 4
+; RV64-NEXT: bnez a1, .LBB8_275
+; RV64-NEXT: .LBB8_3: # %else5
+; RV64-NEXT: andi a1, a2, 8
+; RV64-NEXT: bnez a1, .LBB8_276
+; RV64-NEXT: .LBB8_4: # %else8
+; RV64-NEXT: andi a1, a2, 16
+; RV64-NEXT: bnez a1, .LBB8_277
+; RV64-NEXT: .LBB8_5: # %else11
+; RV64-NEXT: andi a1, a2, 32
+; RV64-NEXT: bnez a1, .LBB8_278
+; RV64-NEXT: .LBB8_6: # %else14
+; RV64-NEXT: andi a1, a2, 64
+; RV64-NEXT: bnez a1, .LBB8_279
+; RV64-NEXT: .LBB8_7: # %else17
+; RV64-NEXT: andi a1, a2, 128
+; RV64-NEXT: bnez a1, .LBB8_280
+; RV64-NEXT: .LBB8_8: # %else20
+; RV64-NEXT: andi a1, a2, 256
+; RV64-NEXT: bnez a1, .LBB8_281
+; RV64-NEXT: .LBB8_9: # %else23
+; RV64-NEXT: andi a1, a2, 512
+; RV64-NEXT: bnez a1, .LBB8_282
+; RV64-NEXT: .LBB8_10: # %else26
+; RV64-NEXT: andi a1, a2, 1024
+; RV64-NEXT: bnez a1, .LBB8_283
+; RV64-NEXT: .LBB8_11: # %else29
+; RV64-NEXT: slli a1, a2, 52
+; RV64-NEXT: bltz a1, .LBB8_284
+; RV64-NEXT: .LBB8_12: # %else32
+; RV64-NEXT: slli a1, a2, 51
+; RV64-NEXT: bltz a1, .LBB8_285
+; RV64-NEXT: .LBB8_13: # %else35
+; RV64-NEXT: slli a1, a2, 50
+; RV64-NEXT: bltz a1, .LBB8_286
+; RV64-NEXT: .LBB8_14: # %else38
+; RV64-NEXT: slli a1, a2, 49
+; RV64-NEXT: bltz a1, .LBB8_287
+; RV64-NEXT: .LBB8_15: # %else41
+; RV64-NEXT: slli a1, a2, 48
+; RV64-NEXT: bltz a1, .LBB8_288
+; RV64-NEXT: .LBB8_16: # %else44
+; RV64-NEXT: slli a1, a2, 47
+; RV64-NEXT: bltz a1, .LBB8_289
+; RV64-NEXT: .LBB8_17: # %else47
+; RV64-NEXT: slli a1, a2, 46
+; RV64-NEXT: bltz a1, .LBB8_290
+; RV64-NEXT: .LBB8_18: # %else50
+; RV64-NEXT: slli a1, a2, 45
+; RV64-NEXT: bltz a1, .LBB8_291
+; RV64-NEXT: .LBB8_19: # %else53
+; RV64-NEXT: slli a1, a2, 44
+; RV64-NEXT: bltz a1, .LBB8_292
+; RV64-NEXT: .LBB8_20: # %else56
+; RV64-NEXT: slli a1, a2, 43
+; RV64-NEXT: bltz a1, .LBB8_293
+; RV64-NEXT: .LBB8_21: # %else59
+; RV64-NEXT: slli a1, a2, 42
+; RV64-NEXT: bltz a1, .LBB8_294
+; RV64-NEXT: .LBB8_22: # %else62
+; RV64-NEXT: slli a1, a2, 41
+; RV64-NEXT: bgez a1, .LBB8_23
+; RV64-NEXT: j .LBB8_295
+; RV64-NEXT: .LBB8_23: # %else65
+; RV64-NEXT: slli a1, a2, 40
+; RV64-NEXT: bgez a1, .LBB8_24
+; RV64-NEXT: j .LBB8_296
+; RV64-NEXT: .LBB8_24: # %else68
+; RV64-NEXT: slli a1, a2, 39
+; RV64-NEXT: bgez a1, .LBB8_25
+; RV64-NEXT: j .LBB8_297
+; RV64-NEXT: .LBB8_25: # %else71
+; RV64-NEXT: slli a1, a2, 38
+; RV64-NEXT: bgez a1, .LBB8_26
+; RV64-NEXT: j .LBB8_298
+; RV64-NEXT: .LBB8_26: # %else74
+; RV64-NEXT: slli a1, a2, 37
+; RV64-NEXT: bgez a1, .LBB8_27
+; RV64-NEXT: j .LBB8_299
+; RV64-NEXT: .LBB8_27: # %else77
+; RV64-NEXT: slli a1, a2, 36
+; RV64-NEXT: bgez a1, .LBB8_28
+; RV64-NEXT: j .LBB8_300
+; RV64-NEXT: .LBB8_28: # %else80
+; RV64-NEXT: slli a1, a2, 35
+; RV64-NEXT: bgez a1, .LBB8_29
+; RV64-NEXT: j .LBB8_301
+; RV64-NEXT: .LBB8_29: # %else83
+; RV64-NEXT: slli a1, a2, 34
+; RV64-NEXT: bgez a1, .LBB8_30
+; RV64-NEXT: j .LBB8_302
+; RV64-NEXT: .LBB8_30: # %else86
+; RV64-NEXT: slli a1, a2, 33
+; RV64-NEXT: bgez a1, .LBB8_31
+; RV64-NEXT: j .LBB8_303
+; RV64-NEXT: .LBB8_31: # %else89
+; RV64-NEXT: slli a1, a2, 32
+; RV64-NEXT: bgez a1, .LBB8_33
+; RV64-NEXT: .LBB8_32: # %cond.store91
+; RV64-NEXT: vsetivli zero, 1, e8, m2, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v16, 31
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vse8.v v10, (a0)
+; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: .LBB8_33: # %else92
+; RV64-NEXT: addi sp, sp, -2032
+; RV64-NEXT: .cfi_def_cfa_offset 2032
+; RV64-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: addi s0, sp, 2032
+; RV64-NEXT: .cfi_def_cfa s0, 0
+; RV64-NEXT: lui a1, 6
+; RV64-NEXT: addiw a1, a1, -1776
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: andi sp, sp, -128
+; RV64-NEXT: slli a3, a2, 31
+; RV64-NEXT: lui a1, 6
+; RV64-NEXT: addiw a1, a1, -984
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: bgez a3, .LBB8_34
+; RV64-NEXT: j .LBB8_304
+; RV64-NEXT: .LBB8_34: # %else95
+; RV64-NEXT: slli a3, a2, 30
+; RV64-NEXT: bgez a3, .LBB8_35
+; RV64-NEXT: j .LBB8_305
+; RV64-NEXT: .LBB8_35: # %else98
+; RV64-NEXT: slli a3, a2, 29
+; RV64-NEXT: bgez a3, .LBB8_36
+; RV64-NEXT: j .LBB8_306
+; RV64-NEXT: .LBB8_36: # %else101
+; RV64-NEXT: slli a3, a2, 28
+; RV64-NEXT: bgez a3, .LBB8_37
+; RV64-NEXT: j .LBB8_307
+; RV64-NEXT: .LBB8_37: # %else104
+; RV64-NEXT: slli a3, a2, 27
+; RV64-NEXT: bgez a3, .LBB8_38
+; RV64-NEXT: j .LBB8_308
+; RV64-NEXT: .LBB8_38: # %else107
+; RV64-NEXT: slli a3, a2, 26
+; RV64-NEXT: bgez a3, .LBB8_39
+; RV64-NEXT: j .LBB8_309
+; RV64-NEXT: .LBB8_39: # %else110
+; RV64-NEXT: slli a3, a2, 25
+; RV64-NEXT: bgez a3, .LBB8_40
+; RV64-NEXT: j .LBB8_310
+; RV64-NEXT: .LBB8_40: # %else113
+; RV64-NEXT: slli a3, a2, 24
+; RV64-NEXT: bgez a3, .LBB8_41
+; RV64-NEXT: j .LBB8_311
+; RV64-NEXT: .LBB8_41: # %else116
+; RV64-NEXT: slli a3, a2, 23
+; RV64-NEXT: bgez a3, .LBB8_43
+; RV64-NEXT: .LBB8_42: # %cond.store118
+; RV64-NEXT: li a3, 128
+; RV64-NEXT: li a4, 23
+; RV64-NEXT: slli a4, a4, 10
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: vsetvli zero, a3, e8, m8, ta, ma
+; RV64-NEXT: vse8.v v16, (a4)
+; RV64-NEXT: lbu a1, 0(a1)
+; RV64-NEXT: sb a1, 0(a0)
+; RV64-NEXT: addi a0, a0, 1
+; RV64-NEXT: .LBB8_43: # %else119
+; RV64-NEXT: slli a3, a2, 22
+; RV64-NEXT: lui a1, 5
+; RV64-NEXT: addiw a1, a1, 953
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: bgez a3, .LBB8_44
+; RV64-NEXT: j .LBB8_312
+; RV64-NEXT: .LBB8_44: # %else122
+; RV64-NEXT: slli a3, a2, 21
+; RV64-NEXT: bgez a3, .LBB8_45
+; RV64-NEXT: j .LBB8_313
+; RV64-NEXT: .LBB8_45: # %else125
+; RV64-NEXT: slli a3, a2, 20
+; RV64-NEXT: bgez a3, .LBB8_46
+; RV64-NEXT: j .LBB8_314
+; RV64-NEXT: .LBB8_46: # %else128
+; RV64-NEXT: slli a3, a2, 19
+; RV64-NEXT: bgez a3, .LBB8_47
+; RV64-NEXT: j .LBB8_315
+; RV64-NEXT: .LBB8_47: # %else131
+; RV64-NEXT: slli a3, a2, 18
+; RV64-NEXT: bgez a3, .LBB8_48
+; RV64-NEXT: j .LBB8_316
+; RV64-NEXT: .LBB8_48: # %else134
+; RV64-NEXT: slli a3, a2, 17
+; RV64-NEXT: bgez a3, .LBB8_49
+; RV64-NEXT: j .LBB8_317
+; RV64-NEXT: .LBB8_49: # %else137
+; RV64-NEXT: slli a3, a2, 16
+; RV64-NEXT: bgez a3, .LBB8_50
+; RV64-NEXT: j .LBB8_318
+; RV64-NEXT: .LBB8_50: # %else140
+; RV64-NEXT: slli a3, a2, 15
+; RV64-NEXT: bgez a3, .LBB8_51
+; RV64-NEXT: j .LBB8_319
+; RV64-NEXT: .LBB8_51: # %else143
+; RV64-NEXT: slli a3, a2, 14
+; RV64-NEXT: bgez a3, .LBB8_52
+; RV64-NEXT: j .LBB8_320
+; RV64-NEXT: .LBB8_52: # %else146
+; RV64-NEXT: slli a3, a2, 13
+; RV64-NEXT: bgez a3, .LBB8_53
+; RV64-NEXT: j .LBB8_321
+; RV64-NEXT: .LBB8_53: # %else149
+; RV64-NEXT: slli a3, a2, 12
+; RV64-NEXT: bgez a3, .LBB8_54
+; RV64-NEXT: j .LBB8_322
+; RV64-NEXT: .LBB8_54: # %else152
+; RV64-NEXT: slli a3, a2, 11
+; RV64-NEXT: bgez a3, .LBB8_55
+; RV64-NEXT: j .LBB8_323
+; RV64-NEXT: .LBB8_55: # %else155
+; RV64-NEXT: slli a3, a2, 10
+; RV64-NEXT: bgez a3, .LBB8_56
+; RV64-NEXT: j .LBB8_324
+; RV64-NEXT: .LBB8_56: # %else158
+; RV64-NEXT: slli a3, a2, 9
+; RV64-NEXT: bgez a3, .LBB8_57
+; RV64-NEXT: j .LBB8_325
+; RV64-NEXT: .LBB8_57: # %else161
+; RV64-NEXT: slli a3, a2, 8
+; RV64-NEXT: bgez a3, .LBB8_58
+; RV64-NEXT: j .LBB8_326
+; RV64-NEXT: .LBB8_58: # %else164
+; RV64-NEXT: slli a3, a2, 7
+; RV64-NEXT: bgez a3, .LBB8_59
+; RV64-NEXT: j .LBB8_327
+; RV64-NEXT: .LBB8_59: # %else167
+; RV64-NEXT: slli a3, a2, 6
+; RV64-NEXT: bgez a3, .LBB8_61
+; RV64-NEXT: .LBB8_60: # %cond.store169
+; RV64-NEXT: li a3, 128
+; RV64-NEXT: lui a4, 5
+; RV64-NEXT: addiw a4, a4, 896
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: vsetvli zero, a3, e8, m8, ta, ma
+; RV64-NEXT: vse8.v v16, (a4)
+; RV64-NEXT: lbu a1, 0(a1)
+; RV64-NEXT: addi a3, a0, 1
+; RV64-NEXT: sb a1, 0(a0)
+; RV64-NEXT: mv a0, a3
+; RV64-NEXT: .LBB8_61: # %else170
+; RV64-NEXT: slli a1, a2, 5
+; RV64-NEXT: lui a3, 5
+; RV64-NEXT: addiw a3, a3, -1206
+; RV64-NEXT: add a3, sp, a3
+; RV64-NEXT: bgez a1, .LBB8_62
+; RV64-NEXT: j .LBB8_328
+; RV64-NEXT: .LBB8_62: # %else173
+; RV64-NEXT: slli a1, a2, 4
+; RV64-NEXT: bgez a1, .LBB8_63
+; RV64-NEXT: j .LBB8_329
+; RV64-NEXT: .LBB8_63: # %else176
+; RV64-NEXT: slli a1, a2, 3
+; RV64-NEXT: bgez a1, .LBB8_64
+; RV64-NEXT: j .LBB8_330
+; RV64-NEXT: .LBB8_64: # %else179
+; RV64-NEXT: slli a1, a2, 2
+; RV64-NEXT: bgez a1, .LBB8_66
+; RV64-NEXT: .LBB8_65: # %cond.store181
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: lui a4, 5
+; RV64-NEXT: addiw a4, a4, 384
+; RV64-NEXT: add a4, sp, a4
+; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV64-NEXT: vse8.v v16, (a4)
+; RV64-NEXT: lbu a...
[truncated]
; RV32-NEXT: sb a4, 0(a0)
; RV32-NEXT: mv a0, a5
; RV32-NEXT: slli a4, a3, 15
; RV32-NEXT: bltz a4, .LBB
Is this codegen right? It's so huge.
Looks like there's no alignment information so the scalar stores got split
In this case the compressstore was scalarized by the ScalarizeMaskedMemIntrin pass because the data occupies 16 registers, i.e. it would require LMUL=16, which won't be easy to legalize.
I'll provide an explanation in the description.
@@ -1609,3 +1609,10 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
C2.NumIVMuls, C2.NumBaseAdds,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
This interface needs an alignment parameter, doesn't it?
For RVV it's not required. The store of the compressed data is unaligned, i.e. it should be possible to use vse8 for memory that is not naturally aligned.
We would need to scale the VL from the vcpop to use vse8.
Doesn't look like alignment was accounted for in the intrinsic definition. masked.gather/scatter had a separate argument; vp.gather/scatter use a pointer attribute. I think we should use the pointer attribute for compressstore.
It looks like SelectionDAGBuilder::visitMaskedStore currently sets the alignment to the ABI alignment of the vector type, which doesn't make sense.
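A rough sketch of that suggestion (hypothetical IR; this PR does not change the intrinsic definition): the alignment would travel as an align parameter attribute on the pointer operand, the way vp.gather/scatter already do, rather than as a separate argument.

```llvm
declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)

define void @store_compressed(ptr %p, <8 x i1> %mask, <8 x i8> %data) {
  ; Hypothetical call site: the alignment (here 4) is carried on the pointer operand.
  call void @llvm.masked.compressstore.v8i8(<8 x i8> %data, ptr align 4 %p, <8 x i1> %mask)
  ret void
}
```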
Once we fix the alignment for the intrinsic we'll also need to teach ScalarizeMaskedMemIntrin to preserve it when scalarizing.
> Once we fix the alignment for the intrinsic we'll also need to teach ScalarizeMaskedMemIntrin to preserve it when scalarizing.

To emphasize, this should be done in a separate review and should probably be addressed before we continue too far with this one.
I have created PR #83519 to preserve its alignment when scalarizing.
Can you separate your NFC to add the Align parameter to TTI into its own review? It's clearly needed; we should be able to land that one quickly and rebase this on top.

Absolutely. #83516
Force-pushed 81f41d8 to a307f04.
@@ -1620,3 +1620,15 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
C2.NumIVMuls, C2.NumBaseAdds,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) {
I think we need to check whether the fixed vector type is legal like isLegalMaskedLoadStore().
I think we could also just use isLegalMaskedLoadStore() for compress/expandload, like masked.load/store?
Unlike x86, where the store is done via a separate instruction and requires masking, RVV's implementation does not require masking/predication, because the VL is tweaked instead.
The check being asked for is that fixed vectors are supported, not that the specific type is legal. We still have a master disable for fixed vectors in the backend by setting -rvv-vector-bits-min=0:
// Only support fixed vectors if we know the minimum vector size.
if (DataTypeVT.isFixedLengthVector() && !ST->useRVVForFixedLengthVectors())
return false;
Is it possible to test for scalable vectors too?
The TTI part of the changeset disallows scalable vectors. The legalization part in ScalarizeMaskedMemIntrin cannot handle them either.
It might be better to relax the TTI constraint and add tests for scalable vectors in a follow-up patch, what do you think?
TLI->getValueType(DL, VTy->getElementType())))
return false;

return getRegUsageForType(VTy) <= 8;
Why is this needed? The backend should be able to split a compressed store.
It's not for the store, but for the compress. Splitting it for LMUL >= 16 is doable, but is not simple.
The splitting will happen while it is still a compressstore; the compress instruction isn't created until after the split is already done.
It should create a second compressstore whose pointer is basepointer + popcnt(lower_half_mask), as sketched below.
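A hand-written IR illustration of that idea (assuming an i8 element type and a <16 x i8> source; the actual type-legalizer output will differ): the vector and mask are split in half, the low half is compress-stored at the base pointer, and the high half is compress-stored at the base pointer advanced by popcount(low-half mask) elements.

```llvm
declare void @llvm.masked.compressstore.v8i8(<8 x i8>, ptr, <8 x i1>)
declare i8 @llvm.ctpop.i8(i8)

define void @split_compressstore_v16i8(ptr %p, <16 x i1> %mask, <16 x i8> %data) {
  %lo.mask = shufflevector <16 x i1> %mask, <16 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi.mask = shufflevector <16 x i1> %mask, <16 x i1> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %lo.data = shufflevector <16 x i8> %data, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi.data = shufflevector <16 x i8> %data, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Compress-store the low half at the original base pointer.
  call void @llvm.masked.compressstore.v8i8(<8 x i8> %lo.data, ptr %p, <8 x i1> %lo.mask)
  ; Advance the pointer by the number of active low-half lanes (1 byte per i8 lane).
  %lo.bits = bitcast <8 x i1> %lo.mask to i8
  %lo.cnt = call i8 @llvm.ctpop.i8(i8 %lo.bits)
  %off = zext i8 %lo.cnt to i64
  %hi.ptr = getelementptr i8, ptr %p, i64 %off
  ; Compress-store the high half at the advanced pointer.
  call void @llvm.masked.compressstore.v8i8(<8 x i8> %hi.data, ptr %hi.ptr, <8 x i1> %hi.mask)
  ret void
}
```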
if (!TLI->isLegalElementTypeForRVV(
TLI->getValueType(DL, VTy->getElementType())))
return false;
Need to check alignment unless you add misaligned support to RISCVISelLowering.cpp
Ok. I'll add isLegalMaskedLoadStore as @yetingk suggested.
Force-pushed a307f04 to f5fb36d.
LGTM
The changeset enables lowering of llvm.masked.compressstore(%data, %ptr, %mask) for RVV fixed vector types into a vcompress + vcpop + vse sequence (see the example from the attached tests below). Such lowering is only possible when %data fits into the available LMULs; otherwise llvm.masked.compressstore is scalarized by the ScalarizeMaskedMemIntrin pass.

Even though the RVV spec provides an alternative sequence for compressstore in section 15.8, vcompress + vcpop should be the proper canonical form to lower llvm.masked.compressstore. If the RISC-V target finds the sequence from 15.8 better, a peephole optimization can transform vcompress + vcpop into that sequence.
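For reference, a minimal example taken from the compressstore.ll test attached above, with the generated code (identical for RV64 and RV32) reproduced as comments:

```llvm
; Generated code for the <16 x i8> case, copied from the attached test:
;   vsetivli zero, 16, e8, m1, ta, ma
;   vcompress.vm v9, v8, v0
;   vcpop.m a1, v0
;   vsetvli zero, a1, e8, m1, ta, ma
;   vse8.v v9, (a0)
define void @test_compresstore_i8_v16(ptr %p, <16 x i1> %mask, <16 x i8> %data) {
entry:
  tail call void @llvm.masked.compressstore.v16i8(<16 x i8> %data, ptr %p, <16 x i1> %mask)
  ret void
}

declare void @llvm.masked.compressstore.v16i8(<16 x i8>, ptr, <16 x i1>)
```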