Skip to content

Commit fdb9f96

Browse files
committed
[LV] Consider earlier stores to invariant reduction address as dead.
For invariant stores to an address of a reduction, only the latest store will be generated outside the loop. Consider earlier stores as dead. This fixes a difference between the legacy and VPlan-based cost model. Fixes #96294.
1 parent 5262865 commit fdb9f96

File tree

2 files changed

+83
-3
lines changed

2 files changed

+83
-3
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6723,14 +6723,18 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
67236723
return RequiresScalarEpilogue &&
67246724
!TheLoop->contains(cast<Instruction>(U)->getParent());
67256725
};
6726+
MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
67266727
for (BasicBlock *BB : TheLoop->blocks())
67276728
for (Instruction &I : *BB) {
67286729
// Find all stores to invariant variables. Since they are going to sink
67296730
// outside the loop we do not need to calculate cost for them.
67306731
StoreInst *SI;
67316732
if ((SI = dyn_cast<StoreInst>(&I)) &&
6732-
Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
6733+
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
67336734
ValuesToIgnore.insert(&I);
6735+
auto I = DeadInvariantStoreOps.insert({SI->getPointerOperand(), {}});
6736+
I.first->second.push_back(SI->getValueOperand());
6737+
}
67346738

67356739
if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I))
67366740
continue;
@@ -6771,6 +6775,10 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
67716775
DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end());
67726776
}
67736777

6778+
for (const auto &[_, Ops] : DeadInvariantStoreOps) {
6779+
for (Value *Op : ArrayRef(Ops).drop_back())
6780+
DeadOps.push_back(Op);
6781+
}
67746782
// Mark ops that would be trivially dead and are only used by ignored
67756783
// instructions as free.
67766784
BasicBlock *Header = TheLoop->getHeader();
@@ -6781,8 +6789,8 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
67816789
(isa<PHINode>(Op) && Op->getParent() == Header) ||
67826790
!wouldInstructionBeTriviallyDead(Op, TLI) ||
67836791
any_of(Op->users(), [this, IsLiveOutDead](User *U) {
6784-
return !VecValuesToIgnore.contains(U) && ValuesToIgnore.contains(U) &&
6785-
!IsLiveOutDead(U);
6792+
return !VecValuesToIgnore.contains(U) &&
6793+
!ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
67866794
}))
67876795
continue;
67886796

llvm/test/Transforms/LoopVectorize/X86/cost-model.ll

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -859,6 +859,78 @@ exit:
859859
ret i64 %1
860860
}
861861

862+
; Test case for https://github.com/llvm/llvm-project/issues/96294 with a stored
863+
; reduction which overwrites an earlier store.
864+
define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 {
865+
; CHECK-LABEL: @reduction_store(
866+
; CHECK-NEXT: entry:
867+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
868+
; CHECK: vector.ph:
869+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i1> poison, i1 [[X:%.*]], i64 0
870+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT]], <8 x i1> poison, <8 x i32> zeroinitializer
871+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
872+
; CHECK: vector.body:
873+
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
874+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ <i32 0, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
875+
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
876+
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP0]]
877+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
878+
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i1> [[BROADCAST_SPLAT]] to <8 x i64>
879+
; CHECK-NEXT: [[TMP4:%.*]] = lshr <8 x i64> [[TMP3]], <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
880+
; CHECK-NEXT: [[TMP5:%.*]] = trunc <8 x i64> [[TMP4]] to <8 x i32>
881+
; CHECK-NEXT: [[TMP6]] = and <8 x i32> [[VEC_PHI]], [[TMP5]]
882+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
883+
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 24
884+
; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
885+
; CHECK: middle.block:
886+
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP6]])
887+
; CHECK-NEXT: store i32 [[TMP8]], ptr [[DST:%.*]], align 4
888+
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
889+
; CHECK: scalar.ph:
890+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
891+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
892+
; CHECK-NEXT: br label [[LOOP:%.*]]
893+
; CHECK: loop:
894+
; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
895+
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
896+
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
897+
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
898+
; CHECK-NEXT: [[L_AND:%.*]] = and i32 [[L]], 3
899+
; CHECK-NEXT: store i32 [[L_AND]], ptr [[DST]], align 4
900+
; CHECK-NEXT: [[X_EXT:%.*]] = zext i1 [[X]] to i64
901+
; CHECK-NEXT: [[LSHR:%.*]] = lshr i64 [[X_EXT]], 12
902+
; CHECK-NEXT: [[T:%.*]] = trunc i64 [[LSHR]] to i32
903+
; CHECK-NEXT: [[RED_NEXT]] = and i32 [[RED]], [[T]]
904+
; CHECK-NEXT: store i32 [[RED_NEXT]], ptr [[DST]], align 4
905+
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
906+
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 29
907+
; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]]
908+
; CHECK: exit:
909+
; CHECK-NEXT: ret void
910+
;
911+
entry:
912+
br label %loop
913+
914+
loop:
915+
%red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
916+
%iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
917+
%gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
918+
%l = load i32, ptr %gep.src
919+
%l.and = and i32 %l, 3
920+
store i32 %l.and, ptr %dst, align 4
921+
%x.ext = zext i1 %x to i64
922+
%lshr = lshr i64 %x.ext, 12
923+
%t = trunc i64 %lshr to i32
924+
%red.next = and i32 %red, %t
925+
store i32 %red.next, ptr %dst, align 4
926+
%iv.next = add i32 %iv, 1
927+
%ec = icmp eq i32 %iv, 29
928+
br i1 %ec, label %exit, label %loop
929+
930+
exit:
931+
ret void
932+
}
933+
862934
declare void @llvm.assume(i1 noundef) #0
863935

864936
attributes #0 = { "target-cpu"="penryn" }

0 commit comments

Comments
 (0)