Skip to content

Commit b765fdd

Browse files
[SLP] Try to keep scalars used in phi nodes if phi nodes from the same block are vectorized.
Before vectorizing the PHI nodes, the compiler sorts them by the opcodes of their operands. If a scalar is replaced by an extractelement during vectorization, this sorting is broken, which prevents some further vectorization attempts. This patch tries to improve this by performing extra analysis of the scalars and keeping a scalar when it is found to be used in another (external) PHI node in the same block. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #103923
1 parent e31252b commit b765fdd

File tree

2 files changed

+51
-29
lines changed

2 files changed

+51
-29
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10930,8 +10930,31 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1093010930
if (CanBeUsedAsScalar) {
1093110931
InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
1093210932
bool KeepScalar = ScalarCost <= ExtraCost;
10933-
if (KeepScalar && ScalarCost != TTI::TCC_Free &&
10934-
ExtraCost - ScalarCost <= TTI::TCC_Basic) {
10933+
// Try to keep original scalar if the user is the phi node from the same
10934+
// block as the root phis, currently vectorized. It allows to keep
10935+
// better ordering info of PHIs, being vectorized currently.
10936+
bool IsProfitablePHIUser =
10937+
(KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
10938+
VectorizableTree.front()->Scalars.size() > 2)) &&
10939+
VectorizableTree.front()->getOpcode() == Instruction::PHI &&
10940+
!Inst->hasNUsesOrMore(UsesLimit) &&
10941+
none_of(Inst->users(),
10942+
[&](User *U) {
10943+
auto *PHIUser = dyn_cast<PHINode>(U);
10944+
return (!PHIUser ||
10945+
PHIUser->getParent() !=
10946+
cast<Instruction>(
10947+
VectorizableTree.front()->getMainOp())
10948+
->getParent()) &&
10949+
!getTreeEntry(U);
10950+
}) &&
10951+
count_if(Entry->Scalars, [&](Value *V) {
10952+
return ValueToExtUses->contains(V);
10953+
}) <= 2;
10954+
if (IsProfitablePHIUser) {
10955+
KeepScalar = true;
10956+
} else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
10957+
ExtraCost - ScalarCost <= TTI::TCC_Basic) {
1093510958
unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
1093610959
return ValueToExtUses->contains(V);
1093710960
});

llvm/test/Transforms/SLPVectorizer/X86/phi.ll

Lines changed: 26 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -136,42 +136,41 @@ for.end: ; preds = %for.body
136136
define float @foo3(ptr nocapture readonly %A) #0 {
137137
; CHECK-LABEL: @foo3(
138138
; CHECK-NEXT: entry:
139-
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A:%.*]], align 4
140-
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
139+
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
140+
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
141141
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
142-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 poison, i32 0>
143-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 0
142+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
144143
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
145144
; CHECK: for.body:
146145
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
147-
; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
148-
; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
149-
; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP3]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ]
150-
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
151-
; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 7.000000e+00
146+
; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
147+
; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
148+
; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ]
149+
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
150+
; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
152151
; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]]
153-
; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2
154-
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]]
155-
; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
152+
; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
153+
; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
154+
; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
156155
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
157156
; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
158-
; CHECK-NEXT: [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
159-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP9]], <4 x i32> <i32 1, i32 poison, i32 2, i32 3>
160-
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP8]], i32 1
161-
; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x float> [[TMP11]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
162-
; CHECK-NEXT: [[TMP13]] = fadd <4 x float> [[TMP4]], [[TMP12]]
163-
; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
164-
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121
157+
; CHECK-NEXT: [[TMP8]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
158+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP8]], <4 x i32> <i32 1, i32 poison, i32 2, i32 3>
159+
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP7]], i32 1
160+
; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP10]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
161+
; CHECK-NEXT: [[TMP12]] = fadd <4 x float> [[TMP3]], [[TMP11]]
162+
; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
163+
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP13]], 121
165164
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
166165
; CHECK: for.end:
167-
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP13]], i32 0
168-
; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP15]]
169-
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP13]], i32 1
170-
; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP16]]
171-
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP13]], i32 2
172-
; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP17]]
173-
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP13]], i32 3
174-
; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP18]]
166+
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP12]], i32 0
167+
; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP14]]
168+
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP12]], i32 1
169+
; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP15]]
170+
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP12]], i32 2
171+
; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP16]]
172+
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP12]], i32 3
173+
; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP17]]
175174
; CHECK-NEXT: ret float [[ADD31]]
176175
;
177176
entry:

0 commit comments

Comments
 (0)