Skip to content

Commit d578840

Browse files
[SLP]Add support for commutative intrinsics.
Implemented the long-standing TODO to support commutative intrinsics. Reviewers: RKSimon. Reviewed By: RKSimon. Pull Request: #86316.
1 parent 5bbce06 commit d578840

File tree

6 files changed

+54
-27
lines changed

6 files changed

+54
-27
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 36 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -306,10 +306,7 @@ static bool isCommutative(Instruction *I) {
306306
return Cmp->isCommutative();
307307
if (auto *BO = dyn_cast<BinaryOperator>(I))
308308
return BO->isCommutative();
309-
// TODO: This should check for generic Instruction::isCommutative(), but
310-
// we need to confirm that the caller code correctly handles Intrinsics
311-
// for example (does not have 2 operands).
312-
return false;
309+
return I->isCommutative();
313310
}
314311

315312
/// \returns inserting index of InsertElement or InsertValue instruction,
@@ -1975,6 +1972,9 @@ class BoUpSLP {
19751972
"Expected same number of lanes");
19761973
assert(isa<Instruction>(VL[0]) && "Expected instruction");
19771974
unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
1975+
constexpr unsigned IntrinsicNumOperands = 2;
1976+
if (auto *CI = dyn_cast<IntrinsicInst>(VL[0]))
1977+
NumOperands = IntrinsicNumOperands;
19781978
OpsVec.resize(NumOperands);
19791979
unsigned NumLanes = VL.size();
19801980
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
@@ -3420,10 +3420,11 @@ class BoUpSLP {
34203420
// immediates do not affect scheduler behavior this is considered
34213421
// okay.
34223422
auto *In = BundleMember->Inst;
3423-
assert(In &&
3424-
(isa<ExtractValueInst, ExtractElementInst>(In) ||
3425-
In->getNumOperands() == TE->getNumOperands()) &&
3426-
"Missed TreeEntry operands?");
3423+
assert(
3424+
In &&
3425+
(isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3426+
In->getNumOperands() == TE->getNumOperands()) &&
3427+
"Missed TreeEntry operands?");
34273428
(void)In; // fake use to avoid build failure when assertions disabled
34283429

34293430
for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
@@ -6798,6 +6799,33 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
67986799

67996800
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
68006801
ReuseShuffleIndicies);
6802+
// Sort operands of the instructions so that each side is more likely to
6803+
// have the same opcode.
6804+
if (isCommutative(VL0)) {
6805+
ValueList Left, Right;
6806+
reorderInputsAccordingToOpcode(VL, Left, Right, *this);
6807+
TE->setOperand(0, Left);
6808+
TE->setOperand(1, Right);
6809+
SmallVector<ValueList> Operands;
6810+
for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
6811+
Operands.emplace_back();
6812+
if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
6813+
continue;
6814+
for (Value *V : VL) {
6815+
auto *CI2 = cast<CallInst>(V);
6816+
Operands.back().push_back(CI2->getArgOperand(I));
6817+
}
6818+
TE->setOperand(I, Operands.back());
6819+
}
6820+
buildTree_rec(Left, Depth + 1, {TE, 0});
6821+
buildTree_rec(Right, Depth + 1, {TE, 1});
6822+
for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
6823+
if (Operands[I - 2].empty())
6824+
continue;
6825+
buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
6826+
}
6827+
return;
6828+
}
68016829
TE->setOperandsInOrder();
68026830
for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
68036831
// For scalar operands no need to create an entry since no need to

llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -142,17 +142,16 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
142142
; CHECK-LABEL: define void @gather_2(
143143
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
144144
; CHECK-NEXT: entry:
145-
; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
146-
; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00)
145+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
146+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
147+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
148+
; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP6]], <2 x float> zeroinitializer)
147149
; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
148-
; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00
149-
; CHECK-NEXT: [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00
150150
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00
151151
; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
152-
; CHECK-NEXT: [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1
153152
; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
154-
; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX163]], align 4
155-
; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4
153+
; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
154+
; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4
156155
; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
157156
; CHECK-NEXT: ret void
158157
;
@@ -358,12 +357,12 @@ define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) {
358357
; CHECK-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
359358
; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
360359
; CHECK-NEXT: store float [[TMP1]], ptr [[BEZT]], align 4
361-
; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
362360
; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr float, ptr [[BEZT]], i64 1
363-
; CHECK-NEXT: store float [[TMP2]], ptr [[ARRAYIDX5_I]], align 4
364-
; CHECK-NEXT: [[TMP3:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00)
365-
; CHECK-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2
366-
; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX8_I831]], align 4
361+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
362+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
363+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
364+
; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
365+
; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[ARRAYIDX5_I]], align 4
367366
; CHECK-NEXT: ret void
368367
;
369368
entry:

llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1295,7 +1295,7 @@ define i8 @umin_intrinsic_rdx_v16i8(ptr %p0) {
12951295

12961296
define void @PR49730() {
12971297
; CHECK-LABEL: @PR49730(
1298-
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>)
1298+
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 2, i32 undef, i32 1>)
12991299
; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
13001300
; CHECK-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
13011301
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])

llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,10 +12,10 @@ define void @test() {
1212
; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
1313
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
1414
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4
15-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
16-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
15+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
16+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
1717
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1
18-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
18+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
1919
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
2020
; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
2121
; CHECK: bb2:

llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -142,8 +142,8 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
142142
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
143143
; CHECK-NEXT: entry:
144144
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
145-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
146-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
145+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
146+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
147147
; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
148148
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
149149
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00

llvm/test/Transforms/SLPVectorizer/slp-umax-rdx-matcher-crash.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ declare i32 @llvm.umin.i32(i32, i32)
4343
define void @test2() {
4444
; CHECK-LABEL: @test2(
4545
; CHECK-NEXT: entry:
46-
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>)
46+
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 3, i32 undef, i32 0>)
4747
; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> undef, [[TMP0]]
4848
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]])
4949
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)

0 commit comments

Comments
 (0)