Skip to content

Commit d515664

Browse files
committed
Rebase and add workaround for duplicate extends
1 parent 80f7258 commit d515664

File tree

5 files changed

+51
-44
lines changed

5 files changed

+51
-44
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "llvm/ADT/DenseMap.h"
3030
#include "llvm/ADT/SmallBitVector.h"
3131
#include "llvm/ADT/SmallPtrSet.h"
32+
#include "llvm/ADT/SmallSet.h"
3233
#include "llvm/ADT/SmallVector.h"
3334
#include "llvm/ADT/Twine.h"
3435
#include "llvm/ADT/ilist.h"
@@ -3007,8 +3008,11 @@ class VPExpressionRecipe : public VPSingleDefRecipe {
30073008
{Ext0, Ext1, Mul, Red}) {}
30083009

30093010
~VPExpressionRecipe() override {
3010-
for (auto *R : reverse(ExpressionRecipes))
3011-
delete R;
3011+
SmallSet<VPSingleDefRecipe *, 4> ExpressionRecipesSeen;
3012+
for (auto *R : reverse(ExpressionRecipes)) {
3013+
if (ExpressionRecipesSeen.insert(R).second)
3014+
delete R;
3015+
}
30123016
for (VPValue *T : LiveInPlaceholders)
30133017
delete T;
30143018
}

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2741,9 +2741,8 @@ VPExpressionRecipe::VPExpressionRecipe(
27412741
ExpressionTypes ExpressionType,
27422742
ArrayRef<VPSingleDefRecipe *> ExpressionRecipes)
27432743
: VPSingleDefRecipe(VPDef::VPExpressionSC, {}, {}),
2744-
ExpressionRecipes(SetVector<VPSingleDefRecipe *>(
2745-
ExpressionRecipes.begin(), ExpressionRecipes.end())
2746-
.takeVector()),
2744+
ExpressionRecipes(SmallVector<VPSingleDefRecipe *>(
2745+
ExpressionRecipes.begin(), ExpressionRecipes.end())),
27472746
ExpressionType(ExpressionType) {
27482747
assert(!ExpressionRecipes.empty() && "Nothing to combine?");
27492748
assert(
@@ -2777,25 +2776,43 @@ VPExpressionRecipe::VPExpressionRecipe(
27772776
R->removeFromParent();
27782777
}
27792778

2779+
// Keep track of how many times each recipe occurs in the recipe list.
2780+
SmallMapVector<VPSingleDefRecipe *, unsigned, 4> ExpressionRecipeCounts;
2781+
for (auto *R : ExpressionRecipes) {
2782+
auto *F = ExpressionRecipeCounts.find(R);
2783+
if (F == ExpressionRecipeCounts.end())
2784+
ExpressionRecipeCounts.insert(std::make_pair(R, 1));
2785+
else
2786+
F->second++;
2787+
}
2788+
27802789
// Internalize all external operands to the expression recipes. To do so,
27812790
// create new temporary VPValues for all operands defined by a recipe outside
27822791
// the expression. The original operands are added as operands of the
27832792
// VPExpressionRecipe itself.
27842793
for (auto *R : ExpressionRecipes) {
2794+
auto *F = ExpressionRecipeCounts.find(R);
2795+
F->second--;
27852796
for (const auto &[Idx, Op] : enumerate(R->operands())) {
27862797
auto *Def = Op->getDefiningRecipe();
27872798
if (Def && ExpressionRecipesAsSetOfUsers.contains(Def))
27882799
continue;
27892800
addOperand(Op);
2790-
LiveInPlaceholders.push_back(new VPValue());
2791-
R->setOperand(Idx, LiveInPlaceholders.back());
2801+
auto *Tmp = new VPValue();
2802+
Tmp->setUnderlyingValue(Op->getUnderlyingValue());
2803+
LiveInPlaceholders.push_back(Tmp);
2804+
// Only modify this recipe's operands if it's the last time it occurs in
2805+
// the recipe list.
2806+
if (F->second == 0)
2807+
R->setOperand(Idx, Tmp);
27922808
}
27932809
}
27942810
}
27952811

27962812
void VPExpressionRecipe::decompose() {
27972813
for (auto *R : ExpressionRecipes)
2798-
R->insertBefore(this);
2814+
if (!R->getParent())
2815+
R->insertBefore(this);
27992816

28002817
for (const auto &[Idx, Op] : enumerate(operands()))
28012818
LiveInPlaceholders[Idx]->replaceAllUsesWith(Op);
@@ -2850,7 +2867,7 @@ InstructionCost VPExpressionRecipe::computeCost(ElementCount VF,
28502867
cast<VPWidenCastRecipe>(ExpressionRecipes.front())->getOpcode() ==
28512868
Instruction::ZExt,
28522869
Opcode, RedTy, SrcVecTy, Ctx.CostKind);
2853-
}
2870+
}
28542871
}
28552872
llvm_unreachable("Unknown VPExpressionRecipe::ExpressionTypes enum");
28562873
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3461,35 +3461,24 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
34613461

34623462
VPValue *VecOp = Red->getVecOp();
34633463
VPValue *A, *B;
3464-
// Try to match reduce.add(mul(...)).
3464+
// Try to match reduce.add/sub(mul(...)).
34653465
if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
34663466
auto *RecipeA =
34673467
dyn_cast_if_present<VPWidenCastRecipe>(A->getDefiningRecipe());
34683468
auto *RecipeB =
34693469
dyn_cast_if_present<VPWidenCastRecipe>(B->getDefiningRecipe());
34703470
auto *Mul = cast<VPWidenRecipe>(VecOp->getDefiningRecipe());
34713471

3472-
// Match reduce.add(mul(ext, ext)).
3472+
// Match reduce.add/sub(mul(ext, ext)).
34733473
if (RecipeA && RecipeB &&
34743474
(RecipeA->getOpcode() == RecipeB->getOpcode() || IsPartialReduction) &&
34753475
match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
34763476
match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
3477-
<<<<<<< HEAD
3478-
IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
3479-
Instruction::CastOps::ZExt,
3480-
Mul, RecipeA, RecipeB, nullptr)) {
3481-
return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
3482-
=======
34833477
(IsPartialReduction ||
34843478
IsMulAccValidAndClampRange(RecipeA->getOpcode() ==
34853479
Instruction::CastOps::ZExt,
3486-
MulR, RecipeA, RecipeB, nullptr, Sub))) {
3487-
if (Sub)
3488-
return new VPExpressionRecipe(
3489-
RecipeA, RecipeB, MulR,
3490-
cast<VPWidenRecipe>(Sub->getDefiningRecipe()), Red);
3491-
return new VPExpressionRecipe(RecipeA, RecipeB, MulR, Red);
3492-
>>>>>>> e0a59862bff8 ([LV] Bundle partial reductions inside VPExpressionRecipe)
3480+
Mul, RecipeA, RecipeB, nullptr))) {
3481+
return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
34933482
}
34943483
// Match reduce.add(mul).
34953484
if (IsMulAccValidAndClampRange(true, Mul, nullptr, nullptr, nullptr))

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,12 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star
2323
; IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16
2424
; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
2525
; IC2-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
26-
; IC2-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
27-
; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
28-
; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP4]], [[TMP4]]
26+
; IC2-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
2927
; IC2-NEXT: [[TMP7:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP5]]
30-
; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
31-
; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP7]])
28+
; IC2-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP7]])
29+
; IC2-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
30+
; IC2-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]]
31+
; IC2-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP6]])
3232
; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
3333
; IC2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
3434
; IC2-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -80,18 +80,18 @@ define i32 @partial_reduce_with_non_constant_start_value(ptr %src, i32 %rdx.star
8080
; IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
8181
; IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
8282
; IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
83-
; IC4-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
83+
; IC4-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
84+
; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]]
85+
; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP13]])
8486
; IC4-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
85-
; IC4-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
86-
; IC4-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
87-
; IC4-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i32> [[TMP6]], [[TMP6]]
8887
; IC4-NEXT: [[TMP11:%.*]] = mul nuw nsw <16 x i32> [[TMP7]], [[TMP7]]
89-
; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP8]], [[TMP8]]
90-
; IC4-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i32> [[TMP9]], [[TMP9]]
91-
; IC4-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
9288
; IC4-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
89+
; IC4-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
90+
; IC4-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i32> [[TMP10]], [[TMP10]]
9391
; IC4-NEXT: [[PARTIAL_REDUCE8]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP12]])
94-
; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]])
92+
; IC4-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
93+
; IC4-NEXT: [[TMP16:%.*]] = mul nuw nsw <16 x i32> [[TMP14]], [[TMP14]]
94+
; IC4-NEXT: [[PARTIAL_REDUCE9]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP16]])
9595
; IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
9696
; IC4-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
9797
; IC4-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]

llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,26 +23,23 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
2323
; CHECK-NEXT: <x1> vector loop: {
2424
; CHECK-NEXT: vector.body:
2525
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
26-
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, ir<[[REDUCE:%.+]]> (VF scaled by 1/4)
26+
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, vp<[[REDUCE:%.+]]> (VF scaled by 1/4)
2727
; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]>
2828
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]>
2929
; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a>
3030
; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]>
31-
; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
3231
; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]>
3332
; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b>
3433
; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]>
35-
; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
36-
; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
37-
; CHECK-NEXT: PARTIAL-REDUCE ir<[[REDUCE]]> = add ir<[[ACC]]>, ir<%mul>
34+
; CHECK-NEXT: EXPRESSION vp<[[REDUCE]]> = ir<[[ACC]]> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32))
3835
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
3936
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
4037
; CHECK-NEXT: No successors
4138
; CHECK-NEXT: }
4239
; CHECK-NEXT: Successor(s): middle.block
4340
; CHECK-EMPTY:
4441
; CHECK-NEXT: middle.block:
45-
; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, ir<[[REDUCE]]>
42+
; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]>
4643
; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]>
4744
; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]>
4845
; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
@@ -89,10 +86,10 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
8986
; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4)
9087
; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
9188
; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a>
92-
; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
9389
; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
9490
; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b>
9591
; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
92+
; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
9693
; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
9794
; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul>
9895
; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>

0 commit comments

Comments
 (0)