[VPlan] EVL transform VPVectorEndPointerRecipe alongside load/store recipes. NFC #152542
Conversation
[VPlan] EVL transform VPVectorEndPointerRecipe alongside load/store recipes. NFC

This is the first step in untangling the variable step transform and header mask optimizations as described in llvm#152541.

Currently we replace all VF users globally in the plan, including VPVectorEndPointerRecipe. However this leaves reversed loads and stores in an incorrect state until they are adjusted in optimizeMaskToEVL.

This moves the VPVectorEndPointerRecipe transform so that it is updated in lockstep with the actual load/store recipe.

One thought that crossed my mind was that VPInterleaveRecipe could also use VPVectorEndPointerRecipe, in which case we would also be computing the wrong address because we don't transform it to an EVL recipe which accounts for the reversed address. However I've added a test and it looks like we don't support reversed interleave groups on RISC-V.
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Changes

(git log -n1 --format=%b)

Full diff: https://github.com/llvm/llvm-project/pull/152542.diff

3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c42cdd5365108..4af4a214ae906 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3076,10 +3076,11 @@ struct LLVM_ABI_FOR_TEST VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
/// using the address to load from, the explicit vector length and an optional
/// mask.
struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
- VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
+ VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue *Addr, VPValue &EVL,
+ VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
- {L.getAddr(), &EVL}, L.isConsecutive(),
- L.isReverse(), L, L.getDebugLoc()),
+ {Addr, &EVL}, L.isConsecutive(), L.isReverse(), L,
+ L.getDebugLoc()),
VPValue(this, &getIngredient()) {
setMask(Mask);
}
@@ -3157,11 +3158,11 @@ struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
/// using the value to store, the address to store to, the explicit vector
/// length and an optional mask.
struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
- VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
+ VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue *Addr, VPValue &EVL,
+ VPValue *Mask)
: VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
- {S.getAddr(), S.getStoredValue(), &EVL},
- S.isConsecutive(), S.isReverse(), S,
- S.getDebugLoc()) {
+ {Addr, S.getStoredValue(), &EVL}, S.isConsecutive(),
+ S.isReverse(), S, S.getDebugLoc()) {
setMask(Mask);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 1c8bd6c7eefc0..6b364903c6c1c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2130,6 +2130,8 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
VPRecipeBase &CurRecipe,
VPTypeAnalysis &TypeInfo,
VPValue &AllOneMask, VPValue &EVL) {
+ // FIXME: Don't transform recipes to EVL recipes if they're not masked by the
+ // header mask.
auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
assert(OrigMask && "Unmasked recipe when folding tail");
// HeaderMask will be handled using EVL.
@@ -2139,14 +2141,28 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return HeaderMask == OrigMask ? nullptr : OrigMask;
};
+ /// Adjust any end pointers so that they point to the end of EVL lanes not VF.
+ auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
+ auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
+ if (!EndPtr)
+ return Addr;
+ assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF());
+ VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
+ EVLAddr->insertBefore(&CurRecipe);
+ EVLAddr->setOperand(1, &EVL);
+ return EVLAddr;
+ };
+
return TypeSwitch<VPRecipeBase *, VPRecipeBase *>(&CurRecipe)
.Case<VPWidenLoadRecipe>([&](VPWidenLoadRecipe *L) {
VPValue *NewMask = GetNewMask(L->getMask());
- return new VPWidenLoadEVLRecipe(*L, EVL, NewMask);
+ VPValue *NewAddr = GetNewAddr(L->getAddr());
+ return new VPWidenLoadEVLRecipe(*L, NewAddr, EVL, NewMask);
})
.Case<VPWidenStoreRecipe>([&](VPWidenStoreRecipe *S) {
VPValue *NewMask = GetNewMask(S->getMask());
- return new VPWidenStoreEVLRecipe(*S, EVL, NewMask);
+ VPValue *NewAddr = GetNewAddr(S->getAddr());
+ return new VPWidenStoreEVLRecipe(*S, NewAddr, EVL, NewMask);
})
.Case<VPReductionRecipe>([&](VPReductionRecipe *Red) {
VPValue *NewMask = GetNewMask(Red->getCondOp());
@@ -2183,7 +2199,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
IsaPred<VPVectorEndPointerRecipe, VPScalarIVStepsRecipe,
VPWidenIntOrFpInductionRecipe>) &&
"User of VF that we can't transform to EVL.");
- Plan.getVF().replaceAllUsesWith(&EVL);
+ Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
+ return isa<VPWidenIntOrFpInductionRecipe, VPScalarIVStepsRecipe>(U);
+ });
assert(all_of(Plan.getVFxUF().users(),
[&Plan](VPUser *U) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index c5d2739d0c087..8be9a72d61659 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -134,3 +134,144 @@ for.cond.cleanup:
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+
+define void @interleave_reverse(ptr noalias %a, ptr noalias %b) {
+; IF-EVL-LABEL: @interleave_reverse(
+; IF-EVL-NEXT: entry:
+; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; IF-EVL: vector.ph:
+; IF-EVL-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 4
+; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP23]], 1
+; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1023, [[TMP2]]
+; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP23]]
+; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; IF-EVL-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
+; IF-EVL-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; IF-EVL-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], splat (i64 -1)
+; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> splat (i64 1023), [[TMP6]]
+; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]]
+; IF-EVL: vector.body:
+; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ 1023, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY1]] ]
+; IF-EVL-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
+; IF-EVL-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 -1, [[TMP8]]
+; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
+; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP10]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
+; IF-EVL-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP11]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER]]
+; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
+; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 0, [[TMP14]]
+; IF-EVL-NEXT: [[TMP16:%.*]] = sub i64 [[TMP14]], 1
+; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP16]]
+; IF-EVL-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP15]]
+; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP17]]
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE]], ptr align 4 [[TMP19]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP7]])
+; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP7]] to i64
+; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
+; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]]
+; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 1023
+; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL: middle.block:
+; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
+; IF-EVL: scalar.ph:
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, [[ENTRY:%.*]] ]
+; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
+; IF-EVL: for.body:
+; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ 1023, [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV1]], i32 0
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV1]], i32 1
+; IF-EVL-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; IF-EVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]]
+; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; IF-EVL-NEXT: [[IV_NEXT1]] = add nsw i64 [[IV1]], -1
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], 0
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; IF-EVL: for.cond.cleanup:
+; IF-EVL-NEXT: ret void
+;
+; NO-VP-LABEL: @interleave_reverse(
+; NO-VP-NEXT: entry:
+; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP: vector.ph:
+; NO-VP-NEXT: br label [[FOR_BODY1:%.*]]
+; NO-VP: vector.body:
+; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY1]] ]
+; NO-VP-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1023, i64 1022, i64 1021, i64 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY1]] ]
+; NO-VP-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; NO-VP-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
+; NO-VP-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <4 x i64> [[VEC_IND]], i32 0
+; NO-VP-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <4 x i64> [[STEP_ADD]], i32 0
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP12]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP13]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
+; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <4 x i64> [[VEC_IND]], i32 1
+; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <4 x i64> [[STEP_ADD]], i32 1
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
+; NO-VP-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison)
+; NO-VP-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER2]], [[WIDE_MASKED_GATHER]]
+; NO-VP-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER3]], [[WIDE_MASKED_GATHER1]]
+; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]]
+; NO-VP-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 -3
+; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -4
+; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 -3
+; NO-VP-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO-VP-NEXT: store <4 x i32> [[REVERSE]], ptr [[TMP8]], align 4
+; NO-VP-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; NO-VP-NEXT: store <4 x i32> [[REVERSE4]], ptr [[TMP10]], align 4
+; NO-VP-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; NO-VP-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
+; NO-VP-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1016
+; NO-VP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-VP: middle.block:
+; NO-VP-NEXT: br label [[SCALAR_PH]]
+; NO-VP: scalar.ph:
+; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 7, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
+; NO-VP-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP: for.body:
+; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 0
+; NO-VP-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], i64 [[IV]], i32 1
+; NO-VP-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 0
+; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-VP: for.cond.cleanup:
+; NO-VP-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 1023, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 0
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [2 x i32], ptr %b, i64 %iv, i32 1
+ %1 = load i32, ptr %arrayidx2
+ %add = add nsw i32 %1, %0
+ %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv
+ store i32 %add, ptr %arrayidx4
+ %iv.next = add nsw i64 %iv, -1
+ %exitcond.not = icmp eq i64 %iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
if (!EndPtr)
  return Addr;
assert(EndPtr->getOperand(1) == &EndPtr->getParent()->getPlan()->getVF());
Assertion message
I've been thinking about it, and the real reason for replacing the VPVectorEndPointerRecipe operands (addr, VF) with (addr, EVL) seems to come from the fact that the mask is reversed. The VP intrinsics can't represent computing EVL lanes starting from the last lane backward, which is what drives the need for this series of transformations. What do you think?
Yes, that's my understanding as well.
I've added an assert for this in c2bb610. I think alongside #146525, this means we can start to split the transform up into more specific ones like:
LGTM. It is a nice step toward separating correctness from optimization logic.
LGTM, thanks
This is the first step in untangling the variable step transform and header mask optimizations as described in #152541.
Currently we replace all VF users globally in the plan, including VPVectorEndPointerRecipe. However this leaves reversed loads and stores in an incorrect state until they are adjusted in optimizeMaskToEVL.
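To make the "incorrect state" concrete: for a reversed, consecutive access the end pointer has to be computed from the number of *active* lanes, which under tail folding is the EVL rather than the VF. Below is a minimal standalone sketch of that arithmetic, mirroring the GEP sequence checked in the IF-EVL test above (`gep Base, 0 * N` followed by `gep ..., -(N - 1)`); the helper name and the driver are hypothetical, for illustration only.

```cpp
#include <cstdint>
#include <cstdio>

// Pointer to the lowest-addressed element touched by a reversed access of
// N active lanes whose scalar pointer for the current iteration is Base.
static const int32_t *reverseAccessStart(const int32_t *Base, int64_t N) {
  return Base + 0 * N - (N - 1);
}

int main() {
  int32_t A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  const int32_t *Base = &A[7]; // scalar pointer for this (reversed) iteration
  // With VF = 4 but only EVL = 2 lanes active in the final iteration, keeping
  // the VF-based end pointer would start the access two elements too low.
  std::printf("start index with N=VF=4:  %td\n", reverseAccessStart(Base, 4) - A);
  std::printf("start index with N=EVL=2: %td\n", reverseAccessStart(Base, 2) - A);
  return 0;
}
```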
This moves the VPVectorEndPointerRecipe transform so that it is updated in lockstep with the actual load/store recipe.
One thought that crossed my mind was that VPInterleaveRecipe could also use VPVectorEndPointerRecipe, in which case we would have also been computing the wrong address because we don't transform it to an EVL recipe which accounts for the reversed address. However I've added a test and it looks like we don't support reversed interleave groups on RISC-V.
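For quick reference, here is a condensed, commented version of the core of the patch. It relies on LLVM's VPlan headers and the surrounding optimizeMaskToEVL context, so it is a summary of the diff above rather than standalone code, and the assert from the patch is omitted.

```cpp
// When optimizeMaskToEVL converts a widened load or store, any
// VPVectorEndPointerRecipe feeding its address is cloned next to it and its
// VF operand (operand 1) is swapped for EVL, so the address and the memory
// recipe are rewritten in lockstep.
auto GetNewAddr = [&CurRecipe, &EVL](VPValue *Addr) -> VPValue * {
  auto *EndPtr = dyn_cast<VPVectorEndPointerRecipe>(Addr);
  if (!EndPtr)
    return Addr;                      // non-reversed addresses are unchanged
  VPVectorEndPointerRecipe *EVLAddr = EndPtr->clone();
  EVLAddr->insertBefore(&CurRecipe);  // keep the clone beside the memory recipe
  EVLAddr->setOperand(1, &EVL);       // end pointer now counts EVL lanes, not VF
  return EVLAddr;
};
// Each load/store case then passes the adjusted address to the new
// constructor overloads, e.g.:
//   new VPWidenLoadEVLRecipe(*L, GetNewAddr(L->getAddr()), EVL, NewMask);
//   new VPWidenStoreEVLRecipe(*S, GetNewAddr(S->getAddr()), EVL, NewMask);
```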