Skip to content

Commit f03b6f9

Browse files
igogo-x86llvmbot
authored andcommitted
[LoopVectorize] Fix incorrect order of invariant stores when there are multiple reductions.
When a loop has multiple reductions, each with an intermediate invariant store, the order in which those reductions are processed is not considered. This can result in the invariant stores outside the loop not preserving the original order. This patch sorts VPReductionPHIRecipes by the order in which they have stores in the original loop before running `InnerLoopVectorizer::fixReduction` function, and it helps to maintain the correct order of stores. Fixes llvm/llvm-project#64047 Differential Revision: https://reviews.llvm.org/D157631 (cherry picked from commit ac65fb869977185b44757b94dc5130bd08c6f7e2)
1 parent 48497ec commit f03b6f9

File tree

2 files changed

+42
-10
lines changed

2 files changed

+42
-10
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3781,10 +3781,44 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
37813781
// the incoming edges.
37823782
VPBasicBlock *Header =
37833783
State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3784+
3785+
// Gather all VPReductionPHIRecipe and sort them so that Intermediate stores
3786+
// sank outside of the loop would keep the same order as they had in the
3787+
// original loop.
3788+
SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
37843789
for (VPRecipeBase &R : Header->phis()) {
37853790
if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3786-
fixReduction(ReductionPhi, State);
3787-
else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3791+
ReductionPHIList.emplace_back(ReductionPhi);
3792+
}
3793+
stable_sort(ReductionPHIList, [this](const VPReductionPHIRecipe *R1,
3794+
const VPReductionPHIRecipe *R2) {
3795+
auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
3796+
auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
3797+
3798+
// If neither of the recipes has an intermediate store, keep the order the
3799+
// same.
3800+
if (!IS1 && !IS2)
3801+
return false;
3802+
3803+
// If only one of the recipes has an intermediate store, then move it
3804+
// towards the beginning of the list.
3805+
if (IS1 && !IS2)
3806+
return true;
3807+
3808+
if (!IS1 && IS2)
3809+
return false;
3810+
3811+
// If both recipes have an intermediate store, then the recipe with the
3812+
// later store should be processed earlier. So it should go to the beginning
3813+
// of the list.
3814+
return DT->dominates(IS2, IS1);
3815+
});
3816+
3817+
for (VPReductionPHIRecipe *ReductionPhi : ReductionPHIList)
3818+
fixReduction(ReductionPhi, State);
3819+
3820+
for (VPRecipeBase &R : Header->phis()) {
3821+
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
37883822
fixFixedOrderRecurrence(FOR, State);
37893823
}
37903824
}

llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -561,14 +561,13 @@ exit: ; preds = %for.body
561561
}
562562

563563
; Make sure that if there are several reductions in the loop, the order of invariant stores sank outside of the loop is preserved
564-
; FIXME: This tests currently shows incorrect behavior and it will fixed in the following patch
565564
; See https://github.com/llvm/llvm-project/issues/64047
566565
define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
567566
; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
568567
; CHECK: middle.block:
569-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
568+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
570569
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst, align 4
571-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
570+
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
572571
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst, align 4
573572
;
574573
entry:
@@ -622,14 +621,13 @@ exit:
622621
}
623622

624623
; Same as above but storing is done to two different pointers and they can be aliased
625-
; FIXME: This tests currently shows incorrect behavior and it will fixed in the following patch
626624
define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
627625
; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
628626
; CHECK: middle.block:
629-
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP1:%.*]])
630-
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst2, align 4
631-
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
632-
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst1, align 4
627+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1:%.*]])
628+
; CHECK-NEXT: store i32 [[TMP2]], ptr %dst1, align 4
629+
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
630+
; CHECK-NEXT: store i32 [[TMP4]], ptr %dst2, align 4
633631
;
634632
entry:
635633
br label %for.body

0 commit comments

Comments
 (0)