Skip to content

Commit 234e81e

Browse files
committed
Fold reduce.add(zext(mul(sext(A), sext(B)))) into MulAccRecipe when A == B
To prepare for a future refactor that avoids referencing underlying instructions, and because the opcode and the extend instruction can be mismatched in the newly added pattern, stop passing the underlying instruction (UI) when creating VPWidenCastRecipe. This removal leads to duplicate extend instructions being created by the loop vectorizer when two reduction patterns exist in the same loop. These redundant instructions might be removed after LV.
1 parent efd2236 commit 234e81e

File tree

4 files changed

+24
-31
lines changed

4 files changed

+24
-31
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9396,29 +9396,28 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
93969396
// reduce.add(mul(ext, ext)) can folded into VPMulAccRecipe
93979397
if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B))) &&
93989398
!VecOp->hasMoreThanOneUniqueUser()) {
9399-
VPRecipeBase *RecipeA = A->getDefiningRecipe();
9400-
VPRecipeBase *RecipeB = B->getDefiningRecipe();
9399+
VPWidenCastRecipe *RecipeA =
9400+
dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
9401+
VPWidenCastRecipe *RecipeB =
9402+
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
94019403
if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
94029404
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
9403-
cast<VPWidenCastRecipe>(RecipeA)->getOpcode() ==
9404-
cast<VPWidenCastRecipe>(RecipeB)->getOpcode() &&
9405-
!A->hasMoreThanOneUniqueUser() &&
9406-
!B->hasMoreThanOneUniqueUser()) {
9405+
(RecipeA->getOpcode() == RecipeB->getOpcode() || A == B)) {
94079406
return new VPMulAccRecipe(
94089407
RdxDesc, CurrentLinkI, PreviousLink, CondOp,
94099408
CM.useOrderedReductions(RdxDesc),
9410-
cast<VPWidenRecipe>(VecOp->getDefiningRecipe()),
9411-
cast<VPWidenCastRecipe>(RecipeA),
9412-
cast<VPWidenCastRecipe>(RecipeB));
9409+
cast<VPWidenRecipe>(VecOp->getDefiningRecipe()), RecipeA,
9410+
RecipeB);
94139411
} else {
94149412
// Matched reduce.add(mul(...))
94159413
return new VPMulAccRecipe(
94169414
RdxDesc, CurrentLinkI, PreviousLink, CondOp,
94179415
CM.useOrderedReductions(RdxDesc),
94189416
cast<VPWidenRecipe>(VecOp->getDefiningRecipe()));
94199417
}
9420-
// Matched reduce.add(ext(mul(ext, ext)))
9421-
// Note that 3 extend instructions must have same opcode.
9418+
// Matched reduce.add(ext(mul(ext(A), ext(B))))
9419+
// Note that 3 extend instructions must have same opcode or A == B
9420+
// which can be transform to reduce.add(zext(mul(sext(A), sext(B)))).
94229421
} else if (match(VecOp,
94239422
m_ZExtOrSExt(m_Mul(m_ZExtOrSExt(m_VPValue()),
94249423
m_ZExtOrSExt(m_VPValue())))) &&
@@ -9431,11 +9430,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
94319430
cast<VPWidenCastRecipe>(Mul->getOperand(0)->getDefiningRecipe());
94329431
VPWidenCastRecipe *Ext1 =
94339432
cast<VPWidenCastRecipe>(Mul->getOperand(1)->getDefiningRecipe());
9434-
if (Ext->getOpcode() == Ext0->getOpcode() &&
9435-
Ext0->getOpcode() == Ext1->getOpcode() &&
9436-
!Mul->hasMoreThanOneUniqueUser() &&
9437-
!Ext0->hasMoreThanOneUniqueUser() &&
9438-
!Ext1->hasMoreThanOneUniqueUser()) {
9433+
if ((Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
9434+
Ext0->getOpcode() == Ext1->getOpcode()) {
94399435
return new VPMulAccRecipe(
94409436
RdxDesc, CurrentLinkI, PreviousLink, CondOp,
94419437
CM.useOrderedReductions(RdxDesc),
@@ -9447,8 +9443,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
94479443
};
94489444
auto TryToMatchExtendedReduction = [&]() -> VPSingleDefRecipe * {
94499445
VPValue *A;
9450-
if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
9451-
!VecOp->hasMoreThanOneUniqueUser()) {
9446+
if (match(VecOp, m_ZExtOrSExt(m_VPValue(A)))) {
94529447
return new VPExtendedReductionRecipe(
94539448
RdxDesc, CurrentLinkI, PreviousLink,
94549449
cast<VPWidenCastRecipe>(VecOp), CondOp,

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -542,14 +542,14 @@ void VPlanTransforms::prepareExecute(VPlan &Plan) {
542542
auto *MulAcc = cast<VPMulAccRecipe>(&R);
543543
VPValue *Op0, *Op1;
544544
if (MulAcc->isExtended()) {
545-
Op0 = new VPWidenCastRecipe(
546-
MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
547-
MulAcc->getResultType(), *MulAcc->getExt0Instr());
545+
Op0 =
546+
new VPWidenCastRecipe(MulAcc->getExtOpcode(), MulAcc->getVecOp0(),
547+
MulAcc->getResultType());
548548
Op0->getDefiningRecipe()->insertBefore(MulAcc);
549549
if (!MulAcc->isSameExtend()) {
550-
Op1 = new VPWidenCastRecipe(
551-
MulAcc->getExtOpcode(), MulAcc->getVecOp1(),
552-
MulAcc->getResultType(), *MulAcc->getExt1Instr());
550+
Op1 = new VPWidenCastRecipe(MulAcc->getExtOpcode(),
551+
MulAcc->getVecOp1(),
552+
MulAcc->getResultType());
553553
Op1->getDefiningRecipe()->insertBefore(MulAcc);
554554
} else {
555555
Op1 = Op0;
@@ -567,8 +567,7 @@ void VPlanTransforms::prepareExecute(VPlan &Plan) {
567567
if (auto *OuterExtInstr = MulAcc->getExtInstr())
568568
VecOp = new VPWidenCastRecipe(
569569
MulAcc->getExtOpcode(), Mul,
570-
MulAcc->getRecurrenceDescriptor().getRecurrenceType(),
571-
*OuterExtInstr);
570+
MulAcc->getRecurrenceDescriptor().getRecurrenceType());
572571
else
573572
VecOp = Mul;
574573
auto *Red = new VPReductionRecipe(

llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1535,7 +1535,8 @@ define i64 @mla_and_add_together_16_64(ptr nocapture noundef readonly %x, i32 no
15351535
; CHECK-NEXT: [[TMP3:%.*]] = zext nneg <8 x i32> [[TMP2]] to <8 x i64>
15361536
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]])
15371537
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP4]], [[VEC_PHI]]
1538-
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]])
1538+
; CHECK-NEXT: [[TMP10:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
1539+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]])
15391540
; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI1]]
15401541
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
15411542
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]

llvm/test/Transforms/LoopVectorize/reduction-inloop.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1206,15 +1206,13 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read
12061206
; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64
12071207
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[H:%.*]], i64 [[TMP0]]
12081208
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1
1209-
; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
12101209
; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i8> [[WIDE_LOAD]], splat (i8 31)
12111210
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i8> [[TMP2]], splat (i8 3)
12121211
; CHECK-NEXT: [[TMP4:%.*]] = udiv <4 x i8> [[TMP3]], splat (i8 31)
12131212
; CHECK-NEXT: [[TMP5:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32>
1214-
; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
1215-
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]])
1213+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
12161214
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]]
1217-
; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[DOTNOT]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
1215+
; CHECK-NEXT: [[TMP9:%.*]] = zext nneg <4 x i8> [[TMP4]] to <4 x i32>
12181216
; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
12191217
; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[TMP8]]
12201218
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4

0 commit comments

Comments
 (0)