Skip to content

Commit ea52fe4

Browse files
committed
[VPlan] Use pointer to member 0 as VPInterleaveRecipe's pointer arg.
Update VPInterleaveRecipe to always use the pointer to member 0 as pointer argument. This in many cases helps to remove unneeded index adjustments and simplifies VPInterleaveRecipe::execute. In some rare cases, the address of member 0 does not dominate the insert position of the interleave group. In those cases a PtrAdd VPInstruction is emitted to compute the address of member 0 based on the address of the insert position. Alternatively we could hoist the recipe computing the address of member 0.
1 parent 7912abe commit ea52fe4

10 files changed

+90
-60
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8993,8 +8993,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
89938993
// Interleave memory: for each Interleave Group we marked earlier as relevant
89948994
// for this VPlan, replace the Recipes widening its memory instructions with a
89958995
// single VPInterleaveRecipe at its insertion point.
8996-
VPlanTransforms::createInterleaveGroups(InterleaveGroups, RecipeBuilder,
8997-
CM.isScalarEpilogueAllowed());
8996+
VPlanTransforms::createInterleaveGroups(
8997+
*Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
89988998

89998999
for (ElementCount VF : Range)
90009000
Plan->addVF(VF);

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1330,6 +1330,13 @@ class VPInstruction : public VPRecipeWithIRFlags {
13301330
assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
13311331
}
13321332

1333+
VPInstruction(VPValue *Ptr, VPValue *Offset, bool InBounds, DebugLoc DL = {},
1334+
const Twine &Name = "")
1335+
: VPRecipeWithIRFlags(VPDef::VPInstructionSC,
1336+
ArrayRef<VPValue *>({Ptr, Offset}),
1337+
GEPFlagsTy(InBounds), DL),
1338+
Opcode(VPInstruction::PtrAdd), Name(Name.str()) {}
1339+
13331340
VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
13341341
FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");
13351342

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -646,7 +646,9 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
646646
"can only generate first lane for PtrAdd");
647647
Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
648648
Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
649-
return Builder.CreatePtrAdd(Ptr, Addend, Name);
649+
return Builder.CreatePtrAdd(Ptr, Addend, Name,
650+
isInBounds() ? GEPNoWrapFlags::inBounds()
651+
: GEPNoWrapFlags::none());
650652
}
651653
case VPInstruction::ResumePhi: {
652654
if (Part != 0)
@@ -2393,7 +2395,6 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
23932395

23942396
// Prepare for the new pointers.
23952397
SmallVector<Value *, 2> AddrParts;
2396-
unsigned Index = Group->getIndex(Instr);
23972398

23982399
// TODO: extend the masked interleaved-group support to reversed access.
23992400
VPValue *BlockInMask = getMask();
@@ -2413,10 +2414,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
24132414
Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
24142415
Idx = State.Builder.CreateMul(Idx,
24152416
State.Builder.getInt32(Group->getFactor()));
2416-
Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
24172417
Idx = State.Builder.CreateNeg(Idx);
2418-
} else
2419-
Idx = State.Builder.getInt32(-Index);
2418+
} else {
2419+
// TODO: Drop redundant 0-index GEP as follow-up.
2420+
Idx = State.Builder.getInt32(0);
2421+
}
24202422

24212423
VPValue *Addr = getAddr();
24222424
for (unsigned Part = 0; Part < State.UF; Part++) {

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -596,8 +596,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
596596
Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
597597
SE, nullptr, StartV, StepV, InsertPt);
598598

599-
auto *Recipe = new VPInstruction(VPInstruction::PtrAdd,
600-
{PtrIV->getStartValue(), Steps},
599+
auto *Recipe = new VPInstruction(PtrIV->getStartValue(), Steps, false,
601600
PtrIV->getDebugLoc(), "next.gep");
602601

603602
Recipe->insertAfter(Steps);
@@ -1522,14 +1521,19 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
15221521
}
15231522

15241523
void VPlanTransforms::createInterleaveGroups(
1525-
const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
1524+
VPlan &Plan,
1525+
const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
1526+
&InterleaveGroups,
15261527
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed) {
1528+
if (InterleaveGroups.empty())
1529+
return;
1530+
15271531
// Interleave memory: for each Interleave Group we marked earlier as relevant
15281532
// for this VPlan, replace the Recipes widening its memory instructions with a
15291533
// single VPInterleaveRecipe at its insertion point.
1534+
VPDominatorTree VPDT;
1535+
VPDT.recalculate(Plan);
15301536
for (const auto *IG : InterleaveGroups) {
1531-
auto *Recipe =
1532-
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
15331537
SmallVector<VPValue *, 4> StoredValues;
15341538
for (unsigned i = 0; i < IG->getFactor(); ++i)
15351539
if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
@@ -1539,9 +1543,39 @@ void VPlanTransforms::createInterleaveGroups(
15391543

15401544
bool NeedsMaskForGaps =
15411545
IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
1542-
auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
1543-
Recipe->getMask(), NeedsMaskForGaps);
1544-
VPIG->insertBefore(Recipe);
1546+
1547+
Instruction *IRInsertPos = IG->getInsertPos();
1548+
auto *InsertPos =
1549+
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
1550+
VPRecipeBase *IP = InsertPos;
1551+
1552+
// Get or create the start address for the interleave group.
1553+
auto *Start =
1554+
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
1555+
VPValue *Addr = Start->getAddr();
1556+
if (!VPDT.properlyDominates(Addr->getDefiningRecipe(), InsertPos)) {
1557+
bool InBounds = false;
1558+
if (auto *gep = dyn_cast<GetElementPtrInst>(
1559+
getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
1560+
InBounds = gep->isInBounds();
1561+
1562+
// We cannot re-use the address of the first member because it does not
1563+
// dominate the insert position. Use the address of the insert position
1564+
// and create a PtrAdd to adjust the index to start at the first member.
1565+
APInt Offset(32,
1566+
getLoadStoreType(IRInsertPos)->getScalarSizeInBits() / 8 *
1567+
IG->getIndex(IRInsertPos),
1568+
/*IsSigned=*/true);
1569+
VPValue *OffsetVPV = Plan.getOrAddLiveIn(
1570+
ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
1571+
Addr = new VPInstruction(InsertPos->getAddr(), OffsetVPV, InBounds);
1572+
Addr->getDefiningRecipe()->insertAfter(InsertPos);
1573+
IP = Addr->getDefiningRecipe();
1574+
}
1575+
auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
1576+
InsertPos->getMask(), NeedsMaskForGaps);
1577+
VPIG->insertAfter(IP);
1578+
15451579
unsigned J = 0;
15461580
for (unsigned i = 0; i < IG->getFactor(); ++i)
15471581
if (Instruction *Member = IG->getMember(i)) {

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,9 @@ struct VPlanTransforms {
112112
// widening its memory instructions with a single VPInterleaveRecipe at its
113113
// insertion point.
114114
static void createInterleaveGroups(
115-
const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
115+
VPlan &Plan,
116+
const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
117+
&InterleaveGroups,
116118
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);
117119
};
118120

llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -89,21 +89,20 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
8989
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP40]], i64 0
9090
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
9191
; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP39]], 2
92+
; CHECK-NEXT: [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP41]]
9293
; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[P_INVAR]], align 4
9394
; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <2 x float> poison, float [[TMP42]], i64 0
9495
; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT27]], <2 x float> poison, <2 x i32> zeroinitializer
95-
; CHECK-NEXT: [[TMP43:%.*]] = or disjoint i64 [[TMP41]], 3
96-
; CHECK-NEXT: [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP43]]
97-
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 -3
96+
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 0
9897
; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT]], <2 x float> [[BROADCAST_SPLAT28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
9998
; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x float> [[TMP46]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
10099
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP47]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
101100
; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP45]], align 4
102101
; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[P_INVAR]], align 4
103102
; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0
104103
; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT29]], <2 x float> poison, <2 x i32> zeroinitializer
105-
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP43]]
106-
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 -3
104+
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP41]]
105+
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 0
107106
; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT30]], <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
108107
; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT36]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
109108
; CHECK-NEXT: [[INTERLEAVED_VEC31:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
@@ -266,8 +265,7 @@ define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr
266265
; CHECK-NEXT: [[TMP35:%.*]] = fmul <2 x float> [[TMP34]], zeroinitializer
267266
; CHECK-NEXT: [[TMP36:%.*]] = fadd <2 x float> [[STRIDED_VEC16]], [[STRIDED_VEC20]]
268267
; CHECK-NEXT: [[TMP37:%.*]] = fmul <2 x float> [[TMP36]], zeroinitializer
269-
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[TMP28]], i64 12
270-
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP38]], i32 -3
268+
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[TMP28]], i32 0
271269
; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <2 x float> [[TMP31]], <2 x float> [[TMP33]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
272270
; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <2 x float> [[TMP35]], <2 x float> [[TMP37]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
273271
; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <4 x float> [[TMP40]], <4 x float> [[TMP41]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -490,11 +488,10 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
490488
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP53]], align 4
491489
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
492490
; CHECK-NEXT: [[STRIDED_VEC34:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
491+
; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP50]]
493492
; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[B]], <4 x i64> [[VEC_IND]]
494493
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP54]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison), !alias.scope [[META6:![0-9]+]]
495-
; CHECK-NEXT: [[TMP55:%.*]] = or disjoint i64 [[TMP50]], 7
496-
; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP55]]
497-
; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP56]], i32 -7
494+
; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP56]], i32 0
498495
; CHECK-NEXT: [[TMP58:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
499496
; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC34]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
500497
; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,13 +1419,11 @@ define dso_local void @masked_strided2(ptr noalias nocapture readonly %p, ptr no
14191419
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
14201420
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
14211421
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
1422-
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = or disjoint i32 [[TMP1]], 1
14231422
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC1]])
1423+
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP1]]
14241424
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = sub <8 x i8> zeroinitializer, [[TMP4]]
1425-
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP3]]
1426-
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 -1
14271425
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
1428-
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP7]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
1426+
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
14291427
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
14301428
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
14311429
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
@@ -2555,13 +2553,11 @@ define dso_local void @masked_strided2_unknown_tc(ptr noalias nocapture readonly
25552553
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
25562554
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
25572555
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2558-
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = or disjoint i32 [[TMP2]], 1
25592556
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]])
2557+
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP2]]
25602558
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
2561-
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[TMP5]]
2562-
; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i32 -1
25632559
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
2564-
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP9]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
2560+
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP8]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
25652561
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
25662562
; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
25672563
; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
@@ -2989,13 +2985,11 @@ define dso_local void @unconditional_masked_strided2_unknown_tc(ptr noalias noca
29892985
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP2]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
29902986
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
29912987
; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
2992-
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = or disjoint i32 [[TMP1]], 1
29932988
; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[STRIDED_VEC]], <8 x i8> [[STRIDED_VEC3]])
2989+
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[TMP1]]
29942990
; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = sub <8 x i8> zeroinitializer, [[TMP4]]
2995-
; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q:%.*]], i32 [[TMP3]]
2996-
; ENABLED_MASKED_STRIDED-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 -1
29972991
; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
2998-
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP7]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
2992+
; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[INTERLEAVED_VEC]], ptr [[TMP6]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
29992993
; ENABLED_MASKED_STRIDED-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
30002994
; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
30012995
; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP8]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]

0 commit comments

Comments
 (0)