Skip to content

[SLP]Add support for commutative intrinsics. #86316

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 36 additions & 8 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -306,10 +306,7 @@ static bool isCommutative(Instruction *I) {
return Cmp->isCommutative();
if (auto *BO = dyn_cast<BinaryOperator>(I))
return BO->isCommutative();
// TODO: This should check for generic Instruction::isCommutative(), but
// we need to confirm that the caller code correctly handles Intrinsics
// for example (does not have 2 operands).
return false;
return I->isCommutative();
}

/// \returns inserting index of InsertElement or InsertValue instruction,
Expand Down Expand Up @@ -1975,6 +1972,9 @@ class BoUpSLP {
"Expected same number of lanes");
assert(isa<Instruction>(VL[0]) && "Expected instruction");
unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
constexpr unsigned IntrinsicNumOperands = 2;
if (auto *CI = dyn_cast<IntrinsicInst>(VL[0]))
NumOperands = IntrinsicNumOperands;
OpsVec.resize(NumOperands);
unsigned NumLanes = VL.size();
for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
Expand Down Expand Up @@ -3420,10 +3420,11 @@ class BoUpSLP {
// immediates do not affect scheduler behavior this is considered
// okay.
auto *In = BundleMember->Inst;
assert(In &&
(isa<ExtractValueInst, ExtractElementInst>(In) ||
In->getNumOperands() == TE->getNumOperands()) &&
"Missed TreeEntry operands?");
assert(
In &&
(isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
In->getNumOperands() == TE->getNumOperands()) &&
"Missed TreeEntry operands?");
(void)In; // fake use to avoid build failure when assertions disabled

for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
Expand Down Expand Up @@ -6798,6 +6799,33 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,

TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
// Sort operands of the instructions so that each side is more likely to
// have the same opcode.
if (isCommutative(VL0)) {
ValueList Left, Right;
reorderInputsAccordingToOpcode(VL, Left, Right, *this);
TE->setOperand(0, Left);
TE->setOperand(1, Right);
SmallVector<ValueList> Operands;
for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
Operands.emplace_back();
if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
continue;
for (Value *V : VL) {
auto *CI2 = cast<CallInst>(V);
Operands.back().push_back(CI2->getArgOperand(I));
}
TE->setOperand(I, Operands.back());
}
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
if (Operands[I - 2].empty())
continue;
buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
}
return;
}
TE->setOperandsInOrder();
for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
// For scalar operands no need to create an entry since no need to
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,17 +142,16 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
; CHECK-LABEL: define void @gather_2(
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float 0.000000e+00, float 0.000000e+00)
; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float [[TMP0]], float 0.000000e+00)
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP6]], <2 x float> zeroinitializer)
; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[TMP2]], 0.000000e+00
; CHECK-NEXT: [[TMP6:%.*]] = fmul float [[TMP3]], 0.000000e+00
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP4]], 0.000000e+00
; CHECK-NEXT: [[ARRAYIDX163:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1
; CHECK-NEXT: [[ARRAYIDX2_I_I_I278:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 1
; CHECK-NEXT: [[ARRAYIDX5_I_I_I280:%.*]] = getelementptr [4 x [4 x float]], ptr [[MAT1]], i64 0, i64 1, i64 2
; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX163]], align 4
; CHECK-NEXT: store float [[TMP6]], ptr [[ARRAYIDX2_I_I_I278]], align 4
; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[ARRAYIDX163]], align 4
; CHECK-NEXT: store float [[TMP7]], ptr [[ARRAYIDX5_I_I_I280]], align 4
; CHECK-NEXT: ret void
;
Expand Down Expand Up @@ -358,12 +357,12 @@ define void @reuse_shuffle_indices_cost_crash_2(ptr %bezt, float %0) {
; CHECK-NEXT: [[FNEG:%.*]] = fmul float [[TMP0]], 0.000000e+00
; CHECK-NEXT: [[TMP1:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
; CHECK-NEXT: store float [[TMP1]], ptr [[BEZT]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[FNEG]], float 0.000000e+00)
; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr float, ptr [[BEZT]], i64 1
; CHECK-NEXT: store float [[TMP2]], ptr [[ARRAYIDX5_I]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = tail call float @llvm.fmuladd.f32(float [[FNEG]], float 0.000000e+00, float 0.000000e+00)
; CHECK-NEXT: [[ARRAYIDX8_I831:%.*]] = getelementptr float, ptr [[BEZT]], i64 2
; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX8_I831]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[FNEG]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP2]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[ARRAYIDX5_I]], align 4
; CHECK-NEXT: ret void
;
entry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1295,7 +1295,7 @@ define i8 @umin_intrinsic_rdx_v16i8(ptr %p0) {

define void @PR49730() {
; CHECK-LABEL: @PR49730(
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 1, i32 1>)
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 2, i32 undef, i32 1>)
; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> undef, [[TMP1]]
; CHECK-NEXT: [[T12:%.*]] = sub nsw i32 undef, undef
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP2]])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ define void @test() {
; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP2]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]])
; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]]
; CHECK: bb2:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ define void @gather_2(ptr %mat1, float %0, float %1) {
; CHECK-SAME: ptr [[MAT1:%.*]], float [[TMP0:%.*]], float [[TMP1:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP1]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x float> zeroinitializer)
; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float 0.000000e+00)
; CHECK-NEXT: [[TMP7:%.*]] = fmul float [[TMP6]], 0.000000e+00
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ declare i32 @llvm.umin.i32(i32, i32)
define void @test2() {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>)
; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> <i32 2, i32 undef, i32 1, i32 undef>, <4 x i32> <i32 undef, i32 3, i32 undef, i32 0>)
; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <4 x i32> undef, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 77)
Expand Down