Skip to content

Commit 7dca2c6

Browse files
[SLP]Gather scalarized calls
If the calls won't be vectorized, but will be scalarized after vectorization, they should be built as buildvector nodes, not vector nodes. Vectorizing such calls leads to incorrect cost estimation and does not allow spill costs to be calculated correctly. Reviewers: lukel97, preames Reviewed By: preames Pull Request: #125070
1 parent 66ce716 commit 7dca2c6

14 files changed

+1189
-613
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7659,32 +7659,38 @@ buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
 }
 
 /// Calculates the costs of vectorized intrinsic (if possible) and vectorized
-/// function (if possible) calls.
+/// function (if possible) calls. Returns invalid cost for the corresponding
+/// calls, if they cannot be vectorized/will be scalarized.
 static std::pair<InstructionCost, InstructionCost>
 getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                    TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                    ArrayRef<Type *> ArgTys) {
-  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
-  // Calculate the cost of the scalar and vector calls.
-  FastMathFlags FMF;
-  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
-    FMF = FPCI->getFastMathFlags();
-  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
-  auto IntrinsicCost =
-      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
-
   auto Shape = VFShape::get(CI->getFunctionType(),
                             ElementCount::getFixed(VecTy->getNumElements()),
                             false /*HasGlobalPred*/);
   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-  auto LibCost = IntrinsicCost;
+  auto LibCost = InstructionCost::getInvalid();
   if (!CI->isNoBuiltin() && VecFunc) {
     // Calculate the cost of the vector library call.
     // If the corresponding vector call is cheaper, return its cost.
     LibCost =
         TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
   }
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+  // Calculate the cost of the vector intrinsic call.
+  FastMathFlags FMF;
+  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
+    FMF = FPCI->getFastMathFlags();
+  const InstructionCost ScalarLimit = 10000;
+  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
+                                    LibCost.isValid() ? LibCost : ScalarLimit);
+  auto IntrinsicCost =
+      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
+  if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
+      (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
+    IntrinsicCost = InstructionCost::getInvalid();
+
   return {IntrinsicCost, LibCost};
 }
 

@@ -8028,6 +8034,12 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
         return TreeEntry::NeedToGather;
       }
     }
+    SmallVector<Type *> ArgTys =
+        buildIntrinsicArgTypes(CI, ID, VL.size(), 0, TTI);
+    auto *VecTy = getWidenedType(S.getMainOp()->getType(), VL.size());
+    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
+    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
+      return TreeEntry::NeedToGather;
 
     return TreeEntry::Vectorize;
   }

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

Lines changed: 130 additions & 86 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll

Lines changed: 130 additions & 86 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/SLPVectorizer/NVPTX/vectorizable-intrinsic.ll

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,17 @@
 target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx--nvidiacl"
 
-; Test that CTLZ can be vectorized currently even though the second argument is a scalar
-
+; Vector versions of the intrinsics are scalarized, so keep them scalar
 define <2 x i8> @cltz_test(<2 x i8> %x) #0 {
 ; CHECK-LABEL: define <2 x i8> @cltz_test(
 ; CHECK-SAME: <2 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[VEC:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[X]], i1 false)
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i8> [[X]], i32 0
+; CHECK-NEXT:    [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i8> zeroinitializer, i8 [[CALL_I]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT:    [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false)
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1
 ; CHECK-NEXT:    ret <2 x i8> [[VEC]]
 ;
 entry:
@@ -28,7 +32,12 @@ define <2 x i8> @cltz_test_poison(<2 x i8> %x) #0 {
 ; CHECK-LABEL: define <2 x i8> @cltz_test_poison(
 ; CHECK-SAME: <2 x i8> [[X:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[VEC:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[X]], i1 false)
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i8> [[X]], i32 0
+; CHECK-NEXT:    [[CALL_I:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <2 x i8> poison, i8 [[CALL_I]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i8> [[X]], i32 1
+; CHECK-NEXT:    [[CALL_I4:%.*]] = call i8 @llvm.ctlz.i8(i8 [[TMP1]], i1 false)
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <2 x i8> [[VECINIT]], i8 [[CALL_I4]], i32 1
 ; CHECK-NEXT:    ret <2 x i8> [[VEC]]
 ;
 entry:

0 commit comments

Comments
 (0)