[ScalarizeMaskedMemIntrin] Use pointer alignment from pointer of masked.compressstore/expandload. #83519
Conversation
…ed.compressstore/expandload. Previously we used Align(1) for all scalarized loads/stores from masked.compressstore/expandload. For targets that do not support unaligned accesses, this forces the backend to split wide aligned loads/stores into byte loads/stores. To address this performance issue, this patch preserves the alignment of the base pointer when scalarizing.
@llvm/pr-subscribers-llvm-transforms Author: Yeting Kuo (yetingk) Changes: Previously we used Align(1) for all scalarized loads/stores from masked.compressstore/expandload. Patch is 153.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83519.diff 5 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index c01d03f6447240..2fd5530ad0d0cc 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -627,6 +627,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI,
Value *Ptr = CI->getArgOperand(0);
Value *Mask = CI->getArgOperand(1);
Value *PassThru = CI->getArgOperand(2);
+ Align Alignment = Ptr->getPointerAlignment(DL);
auto *VecType = cast<FixedVectorType>(CI->getType());
@@ -659,7 +660,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI,
} else {
Value *NewPtr =
Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
- InsertElt = Builder.CreateAlignedLoad(EltTy, NewPtr, Align(1),
+ InsertElt = Builder.CreateAlignedLoad(EltTy, NewPtr, Alignment,
"Load" + Twine(Idx));
ShuffleMask[Idx] = Idx;
++MemIndex;
@@ -713,7 +714,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI,
CondBlock->setName("cond.load");
Builder.SetInsertPoint(CondBlock->getTerminator());
- LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, Align(1));
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, Alignment);
Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx);
// Move the pointer if there are more blocks to come.
@@ -755,6 +756,7 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI,
Value *Src = CI->getArgOperand(0);
Value *Ptr = CI->getArgOperand(1);
Value *Mask = CI->getArgOperand(2);
+ Align Alignment = Ptr->getPointerAlignment(DL);
auto *VecType = cast<FixedVectorType>(Src->getType());
@@ -778,7 +780,7 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI,
Value *OneElt =
Builder.CreateExtractElement(Src, Idx, "Elt" + Twine(Idx));
Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
- Builder.CreateAlignedStore(OneElt, NewPtr, Align(1));
+ Builder.CreateAlignedStore(OneElt, NewPtr, Alignment);
++MemIndex;
}
CI->eraseFromParent();
@@ -824,7 +826,7 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI,
Builder.SetInsertPoint(CondBlock->getTerminator());
Value *OneElt = Builder.CreateExtractElement(Src, Idx);
- Builder.CreateAlignedStore(OneElt, Ptr, Align(1));
+ Builder.CreateAlignedStore(OneElt, Ptr, Alignment);
// Move the pointer if there are more blocks to come.
Value *NewPtr;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll
new file mode 100644
index 00000000000000..8989a0c9f2ce1c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-compressstore-fp.ll
@@ -0,0 +1,1079 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64
+
+declare void @llvm.masked.compressstore.v1f16(<1 x half>, ptr, <1 x i1>)
+define void @compressstore_v1f16(ptr align 2 %base, <1 x half> %v, <1 x i1> %mask) {
+; RV32-LABEL: compressstore_v1f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV32-NEXT: vfirst.m a1, v0
+; RV32-NEXT: bnez a1, .LBB0_2
+; RV32-NEXT: # %bb.1: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: .LBB0_2: # %else
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v1f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV64-NEXT: vfirst.m a1, v0
+; RV64-NEXT: bnez a1, .LBB0_2
+; RV64-NEXT: # %bb.1: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: .LBB0_2: # %else
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v1f16(<1 x half> %v, ptr %base, <1 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v2f16(<2 x half>, ptr, <2 x i1>)
+define void @compressstore_v2f16(ptr align 2 %base, <2 x half> %v, <2 x i1> %mask) {
+; RV32-LABEL: compressstore_v2f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: andi a2, a1, 1
+; RV32-NEXT: bnez a2, .LBB1_3
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a1, a1, 2
+; RV32-NEXT: bnez a1, .LBB1_4
+; RV32-NEXT: .LBB1_2: # %else2
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB1_3: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a1, a1, 2
+; RV32-NEXT: beqz a1, .LBB1_2
+; RV32-NEXT: .LBB1_4: # %cond.store1
+; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 1
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v2f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: andi a2, a1, 1
+; RV64-NEXT: bnez a2, .LBB1_3
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a1, a1, 2
+; RV64-NEXT: bnez a1, .LBB1_4
+; RV64-NEXT: .LBB1_2: # %else2
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB1_3: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a1, a1, 2
+; RV64-NEXT: beqz a1, .LBB1_2
+; RV64-NEXT: .LBB1_4: # %cond.store1
+; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 1
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v2f16(<2 x half> %v, ptr %base, <2 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v4f16(<4 x half>, ptr, <4 x i1>)
+define void @compressstore_v4f16(ptr align 2 %base, <4 x half> %v, <4 x i1> %mask) {
+; RV32-LABEL: compressstore_v4f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: andi a2, a1, 1
+; RV32-NEXT: bnez a2, .LBB2_5
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: bnez a2, .LBB2_6
+; RV32-NEXT: .LBB2_2: # %else2
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: bnez a2, .LBB2_7
+; RV32-NEXT: .LBB2_3: # %else5
+; RV32-NEXT: andi a1, a1, 8
+; RV32-NEXT: bnez a1, .LBB2_8
+; RV32-NEXT: .LBB2_4: # %else8
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB2_5: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: beqz a2, .LBB2_2
+; RV32-NEXT: .LBB2_6: # %cond.store1
+; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: beqz a2, .LBB2_3
+; RV32-NEXT: .LBB2_7: # %cond.store4
+; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a1, a1, 8
+; RV32-NEXT: beqz a1, .LBB2_4
+; RV32-NEXT: .LBB2_8: # %cond.store7
+; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v4f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: andi a2, a1, 1
+; RV64-NEXT: bnez a2, .LBB2_5
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a2, a1, 2
+; RV64-NEXT: bnez a2, .LBB2_6
+; RV64-NEXT: .LBB2_2: # %else2
+; RV64-NEXT: andi a2, a1, 4
+; RV64-NEXT: bnez a2, .LBB2_7
+; RV64-NEXT: .LBB2_3: # %else5
+; RV64-NEXT: andi a1, a1, 8
+; RV64-NEXT: bnez a1, .LBB2_8
+; RV64-NEXT: .LBB2_4: # %else8
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB2_5: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 2
+; RV64-NEXT: beqz a2, .LBB2_2
+; RV64-NEXT: .LBB2_6: # %cond.store1
+; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 4
+; RV64-NEXT: beqz a2, .LBB2_3
+; RV64-NEXT: .LBB2_7: # %cond.store4
+; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a1, a1, 8
+; RV64-NEXT: beqz a1, .LBB2_4
+; RV64-NEXT: .LBB2_8: # %cond.store7
+; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 3
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v4f16(<4 x half> %v, ptr %base, <4 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v8f16(<8 x half>, ptr, <8 x i1>)
+define void @compressstore_v8f16(ptr align 2 %base, <8 x half> %v, <8 x i1> %mask) {
+; RV32-LABEL: compressstore_v8f16:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: andi a2, a1, 1
+; RV32-NEXT: bnez a2, .LBB3_9
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: bnez a2, .LBB3_10
+; RV32-NEXT: .LBB3_2: # %else2
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: bnez a2, .LBB3_11
+; RV32-NEXT: .LBB3_3: # %else5
+; RV32-NEXT: andi a2, a1, 8
+; RV32-NEXT: bnez a2, .LBB3_12
+; RV32-NEXT: .LBB3_4: # %else8
+; RV32-NEXT: andi a2, a1, 16
+; RV32-NEXT: bnez a2, .LBB3_13
+; RV32-NEXT: .LBB3_5: # %else11
+; RV32-NEXT: andi a2, a1, 32
+; RV32-NEXT: bnez a2, .LBB3_14
+; RV32-NEXT: .LBB3_6: # %else14
+; RV32-NEXT: andi a2, a1, 64
+; RV32-NEXT: bnez a2, .LBB3_15
+; RV32-NEXT: .LBB3_7: # %else17
+; RV32-NEXT: andi a1, a1, -128
+; RV32-NEXT: bnez a1, .LBB3_16
+; RV32-NEXT: .LBB3_8: # %else20
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB3_9: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: beqz a2, .LBB3_2
+; RV32-NEXT: .LBB3_10: # %cond.store1
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: beqz a2, .LBB3_3
+; RV32-NEXT: .LBB3_11: # %cond.store4
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 8
+; RV32-NEXT: beqz a2, .LBB3_4
+; RV32-NEXT: .LBB3_12: # %cond.store7
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 3
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 16
+; RV32-NEXT: beqz a2, .LBB3_5
+; RV32-NEXT: .LBB3_13: # %cond.store10
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 4
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 32
+; RV32-NEXT: beqz a2, .LBB3_6
+; RV32-NEXT: .LBB3_14: # %cond.store13
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 5
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a2, a1, 64
+; RV32-NEXT: beqz a2, .LBB3_7
+; RV32-NEXT: .LBB3_15: # %cond.store16
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 6
+; RV32-NEXT: vse16.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 2
+; RV32-NEXT: andi a1, a1, -128
+; RV32-NEXT: beqz a1, .LBB3_8
+; RV32-NEXT: .LBB3_16: # %cond.store19
+; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 7
+; RV32-NEXT: vse16.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v8f16:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: andi a2, a1, 1
+; RV64-NEXT: bnez a2, .LBB3_9
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a2, a1, 2
+; RV64-NEXT: bnez a2, .LBB3_10
+; RV64-NEXT: .LBB3_2: # %else2
+; RV64-NEXT: andi a2, a1, 4
+; RV64-NEXT: bnez a2, .LBB3_11
+; RV64-NEXT: .LBB3_3: # %else5
+; RV64-NEXT: andi a2, a1, 8
+; RV64-NEXT: bnez a2, .LBB3_12
+; RV64-NEXT: .LBB3_4: # %else8
+; RV64-NEXT: andi a2, a1, 16
+; RV64-NEXT: bnez a2, .LBB3_13
+; RV64-NEXT: .LBB3_5: # %else11
+; RV64-NEXT: andi a2, a1, 32
+; RV64-NEXT: bnez a2, .LBB3_14
+; RV64-NEXT: .LBB3_6: # %else14
+; RV64-NEXT: andi a2, a1, 64
+; RV64-NEXT: bnez a2, .LBB3_15
+; RV64-NEXT: .LBB3_7: # %else17
+; RV64-NEXT: andi a1, a1, -128
+; RV64-NEXT: bnez a1, .LBB3_16
+; RV64-NEXT: .LBB3_8: # %else20
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB3_9: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 2
+; RV64-NEXT: beqz a2, .LBB3_2
+; RV64-NEXT: .LBB3_10: # %cond.store1
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 1
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 4
+; RV64-NEXT: beqz a2, .LBB3_3
+; RV64-NEXT: .LBB3_11: # %cond.store4
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 8
+; RV64-NEXT: beqz a2, .LBB3_4
+; RV64-NEXT: .LBB3_12: # %cond.store7
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 3
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 16
+; RV64-NEXT: beqz a2, .LBB3_5
+; RV64-NEXT: .LBB3_13: # %cond.store10
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 4
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 32
+; RV64-NEXT: beqz a2, .LBB3_6
+; RV64-NEXT: .LBB3_14: # %cond.store13
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 5
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a2, a1, 64
+; RV64-NEXT: beqz a2, .LBB3_7
+; RV64-NEXT: .LBB3_15: # %cond.store16
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 6
+; RV64-NEXT: vse16.v v9, (a0)
+; RV64-NEXT: addi a0, a0, 2
+; RV64-NEXT: andi a1, a1, -128
+; RV64-NEXT: beqz a1, .LBB3_8
+; RV64-NEXT: .LBB3_16: # %cond.store19
+; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 7
+; RV64-NEXT: vse16.v v8, (a0)
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v8f16(<8 x half> %v, ptr %base, <8 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v1f32(<1 x float>, ptr, <1 x i1>)
+define void @compressstore_v1f32(ptr align 4 %base, <1 x float> %v, <1 x i1> %mask) {
+; RV32-LABEL: compressstore_v1f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV32-NEXT: vfirst.m a1, v0
+; RV32-NEXT: bnez a1, .LBB4_2
+; RV32-NEXT: # %bb.1: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: .LBB4_2: # %else
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v1f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV64-NEXT: vfirst.m a1, v0
+; RV64-NEXT: bnez a1, .LBB4_2
+; RV64-NEXT: # %bb.1: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: .LBB4_2: # %else
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v1f32(<1 x float> %v, ptr %base, <1 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v2f32(<2 x float>, ptr, <2 x i1>)
+define void @compressstore_v2f32(ptr align 4 %base, <2 x float> %v, <2 x i1> %mask) {
+; RV32-LABEL: compressstore_v2f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: andi a2, a1, 1
+; RV32-NEXT: bnez a2, .LBB5_3
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a1, a1, 2
+; RV32-NEXT: bnez a1, .LBB5_4
+; RV32-NEXT: .LBB5_2: # %else2
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB5_3: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: addi a0, a0, 4
+; RV32-NEXT: andi a1, a1, 2
+; RV32-NEXT: beqz a1, .LBB5_2
+; RV32-NEXT: .LBB5_4: # %cond.store1
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 1
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v2f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: andi a2, a1, 1
+; RV64-NEXT: bnez a2, .LBB5_3
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a1, a1, 2
+; RV64-NEXT: bnez a1, .LBB5_4
+; RV64-NEXT: .LBB5_2: # %else2
+; RV64-NEXT: ret
+; RV64-NEXT: .LBB5_3: # %cond.store
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: addi a0, a0, 4
+; RV64-NEXT: andi a1, a1, 2
+; RV64-NEXT: beqz a1, .LBB5_2
+; RV64-NEXT: .LBB5_4: # %cond.store1
+; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 1
+; RV64-NEXT: vse32.v v8, (a0)
+; RV64-NEXT: ret
+ call void @llvm.masked.compressstore.v2f32(<2 x float> %v, ptr %base, <2 x i1> %mask)
+ ret void
+}
+
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, ptr, <4 x i1>)
+define void @compressstore_v4f32(ptr align 4 %base, <4 x float> %v, <4 x i1> %mask) {
+; RV32-LABEL: compressstore_v4f32:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV32-NEXT: vmv.x.s a1, v0
+; RV32-NEXT: andi a2, a1, 1
+; RV32-NEXT: bnez a2, .LBB6_5
+; RV32-NEXT: # %bb.1: # %else
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: bnez a2, .LBB6_6
+; RV32-NEXT: .LBB6_2: # %else2
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: bnez a2, .LBB6_7
+; RV32-NEXT: .LBB6_3: # %else5
+; RV32-NEXT: andi a1, a1, 8
+; RV32-NEXT: bnez a1, .LBB6_8
+; RV32-NEXT: .LBB6_4: # %else8
+; RV32-NEXT: ret
+; RV32-NEXT: .LBB6_5: # %cond.store
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: addi a0, a0, 4
+; RV32-NEXT: andi a2, a1, 2
+; RV32-NEXT: beqz a2, .LBB6_2
+; RV32-NEXT: .LBB6_6: # %cond.store1
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 1
+; RV32-NEXT: vse32.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 4
+; RV32-NEXT: andi a2, a1, 4
+; RV32-NEXT: beqz a2, .LBB6_3
+; RV32-NEXT: .LBB6_7: # %cond.store4
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: vse32.v v9, (a0)
+; RV32-NEXT: addi a0, a0, 4
+; RV32-NEXT: andi a1, a1, 8
+; RV32-NEXT: beqz a1, .LBB6_4
+; RV32-NEXT: .LBB6_8: # %cond.store7
+; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 3
+; RV32-NEXT: vse32.v v8, (a0)
+; RV32-NEXT: ret
+;
+; RV64-LABEL: compressstore_v4f32:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64-NEXT: vmv.x.s a1, v0
+; RV64-NEXT: andi a2, a1, 1
+; RV64-NEXT: bnez a2, .LBB6_5
+; RV64-NEXT: # %bb.1: # %else
+; RV64-NEXT: andi a2, a1, 2
+; RV64-NEXT: bnez a2, .LBB6_6
+; RV64-NEXT: .LBB6_2: # %else2
+; RV64-NEXT: andi a2, a1, 4
+; RV64-NEX...
[truncated]
@@ -659,7 +660,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI,
     } else {
       Value *NewPtr =
           Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
-      InsertElt = Builder.CreateAlignedLoad(EltTy, NewPtr, Align(1),
+      InsertElt = Builder.CreateAlignedLoad(EltTy, NewPtr, Alignment,
We need to use commonAlignment(Alignment, EltTy->getSizeInBits() / 8). I believe that is what the scalarization of masked.load/store/gather/scatter does.
Done.
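For context, the clamp suggested above is needed because the scalarized accesses do not all land on the base address: once the pointer has advanced by one or more elements, only the common alignment of the base alignment and the element size is still guaranteed. Below is a minimal sketch of that computation using LLVM's Alignment.h helpers; the helper name and example values are illustrative, not taken from the patch.

#include "llvm/Support/Alignment.h"

using namespace llvm;

// Clamp a base pointer's alignment to what still holds after the pointer has
// been advanced by whole elements. commonAlignment returns the largest power
// of two that divides both the base alignment and the element stride.
static Align elementWiseAlign(Align BaseAlign, uint64_t EltSizeInBytes) {
  return commonAlignment(BaseAlign, EltSizeInBytes);
}

// elementWiseAlign(Align(16), 4) == Align(4): a 16-byte aligned base only
// guarantees 4-byte alignment for the 2nd, 3rd, ... scalarized i32 accesses.
// elementWiseAlign(Align(2), 4) == Align(2): a weaker base alignment is kept.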
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:  .LBB0_2: # %else
; CHECK-NEXT:    ret
  call void @llvm.masked.compressstore.v1i8(<1 x i8> %v, ptr %base, <1 x i1> %mask)
The alignment attribute should be on the parameter in the intrinsic call. See, for example, unaligned_vpload_nxv1i64_allones_mask in vpload.ll.
Done.
@@ -627,6 +627,7 @@ static void scalarizeMaskedExpandLoad(const DataLayout &DL, CallInst *CI,
   Value *Ptr = CI->getArgOperand(0);
   Value *Mask = CI->getArgOperand(1);
   Value *PassThru = CI->getArgOperand(2);
+  Align Alignment = Ptr->getPointerAlignment(DL);
I think we should be using CI->getParamAlign here.
Done.
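For readers following along: masked.compressstore/expandload carry their alignment as an align parameter attribute on the pointer argument, which is why querying the call site is preferable to inferring alignment from the pointer value alone. A hedged sketch of such a lookup follows, using the standard CallBase/Value APIs; the function name and the fallback to getPointerAlignment are illustrative rather than the exact merged code.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Prefer the `align` parameter attribute on the intrinsic call site; if it is
// absent, fall back to whatever alignment can be proven for the pointer value.
static Align alignmentForMaskedIntrinsic(const DataLayout &DL, CallInst *CI,
                                         unsigned PtrArgNo) {
  if (MaybeAlign A = CI->getParamAlign(PtrArgNo))
    return *A;
  return CI->getArgOperand(PtrArgNo)->getPointerAlignment(DL);
}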
LGTM
LGTM
…ndload sections. The align attribute is used for masked.compressstore/expandload in commits llvm#83519, llvm#83763, llvm#83516.
Previously we used Align(1) for all scalarized loads/stores from masked.compressstore/expandload.
For targets that do not support unaligned accesses, this forces the backend to split wide aligned loads/stores into byte loads/stores.
To address this performance issue, this patch preserves the alignment of the base pointer when scalarizing.