Skip to content

Commit af28c9c

Browse files
committed
[SLP] Do not reorder split node operand with reuses if reordering is not possible
We need to check whether the operand node of a split vectorize node has reuses and, if it does, whether it is possible to build an order for this node so that it can be reordered correctly. Fixes #135912
1 parent d3153ad commit af28c9c

File tree

2 files changed

+153
-2
lines changed

2 files changed

+153
-2
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7479,8 +7479,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
74797479
for (const auto &P : Data.first->CombinedEntriesWithIndices) {
74807480
TreeEntry &OpTE = *VectorizableTree[P.first].get();
74817481
OrdersType Order = OpTE.ReorderIndices;
7482-
if (Order.empty()) {
7483-
if (!OpTE.isGather())
7482+
if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
7483+
if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
74847484
continue;
74857485
const auto BestOrder =
74867486
getReorderingData(OpTE, /*TopToBottom=*/false, IgnoreReorder);
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
3+
4+
define void @test(ptr %p) {
5+
; CHECK-LABEL: define void @test(
6+
; CHECK-SAME: ptr [[P:%.*]]) {
7+
; CHECK-NEXT: [[ENTRY:.*]]:
8+
; CHECK-NEXT: [[ARRAYIDX7_US_I_841:%.*]] = getelementptr i8, ptr [[P]], i64 36
9+
; CHECK-NEXT: [[ARRAYIDX7_US_I_1261:%.*]] = getelementptr i8, ptr [[P]], i64 52
10+
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4
11+
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_1261]], align 4
12+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
13+
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <4 x i32> [[TMP2]], i64 4)
14+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
15+
; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX7_US_I_841]], align 4
16+
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
17+
; CHECK-NEXT: [[TMP7:%.*]] = call <12 x i32> @llvm.vector.insert.v12i32.v4i32(<12 x i32> [[TMP6]], <4 x i32> [[TMP5]], i64 8)
18+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> <i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
19+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
20+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> [[TMP9]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
21+
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 6
22+
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
23+
; CHECK-NEXT: [[TMP13:%.*]] = add <16 x i32> [[TMP3]], [[TMP12]]
24+
; CHECK-NEXT: [[TMP14:%.*]] = srem <16 x i32> [[TMP13]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
25+
; CHECK-NEXT: [[TMP15:%.*]] = or <12 x i32> [[TMP7]], zeroinitializer
26+
; CHECK-NEXT: [[TMP16:%.*]] = srem <12 x i32> [[TMP15]], <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
27+
; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US_I:.*]]
28+
; CHECK: [[FOR_COND1_PREHEADER_US_I]]:
29+
; CHECK-NEXT: [[A_PROMOTED253537_US_I:%.*]] = phi i32 [ [[OP_RDX8:%.*]], %[[FOR_COND1_PREHEADER_US_I]] ], [ 0, %[[ENTRY]] ]
30+
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP14]])
31+
; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v12i32(<12 x i32> [[TMP16]])
32+
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP18]], [[TMP17]]
33+
; CHECK-NEXT: [[OP_RDX8]] = add i32 [[OP_RDX]], 0
34+
; CHECK-NEXT: br label %[[FOR_COND1_PREHEADER_US_I]]
35+
;
36+
entry:
37+
%arrayidx7.us.i.841 = getelementptr i8, ptr %p, i64 36
38+
%arrayidx7.us.i.946 = getelementptr i8, ptr %p, i64 40
39+
%arrayidx7.us.i.1051 = getelementptr i8, ptr %p, i64 44
40+
%arrayidx7.us.i.1156 = getelementptr i8, ptr %p, i64 48
41+
%arrayidx7.us.i.1261 = getelementptr i8, ptr %p, i64 52
42+
%arrayidx7.us.i.1366 = getelementptr i8, ptr %p, i64 56
43+
%arrayidx7.us.i.1471 = getelementptr i8, ptr %p, i64 60
44+
%arrayidx7.us.i.1576 = getelementptr i8, ptr %p, i64 64
45+
%add8.us.i.1.4 = add i32 0, 0
46+
%rem.us.i.1.4 = srem i32 %add8.us.i.1.4, 1
47+
%add8.us.i.1.5 = add i32 0, 0
48+
%rem.us.i.1.5 = srem i32 %add8.us.i.1.5, 1
49+
%invariant.op91 = add i32 %rem.us.i.1.4, %rem.us.i.1.5
50+
%add8.us.i.1.6 = add i32 0, 0
51+
%rem.us.i.1.6 = srem i32 %add8.us.i.1.6, 1
52+
%invariant.op92 = add i32 %invariant.op91, %rem.us.i.1.6
53+
%0 = load i32, ptr %arrayidx7.us.i.841, align 4
54+
%1 = load i32, ptr %arrayidx7.us.i.946, align 4
55+
%2 = load i32, ptr %arrayidx7.us.i.1051, align 4
56+
%3 = load i32, ptr %arrayidx7.us.i.1156, align 4
57+
%4 = load i32, ptr %arrayidx7.us.i.1261, align 4
58+
%5 = load i32, ptr %arrayidx7.us.i.1366, align 4
59+
%add8.us.i.7.6 = or i32 %5, 0
60+
%rem.us.i.7.6 = srem i32 %add8.us.i.7.6, 1
61+
%6 = load i32, ptr %arrayidx7.us.i.1471, align 4
62+
%add8.us.i.7.7 = or i32 %6, 0
63+
%rem.us.i.7.7 = srem i32 %add8.us.i.7.7, 1
64+
%invariant.op165 = add i32 %rem.us.i.7.6, %rem.us.i.7.7
65+
%7 = load i32, ptr %arrayidx7.us.i.1576, align 4
66+
%add8.us.i.7.8 = or i32 %7, 0
67+
%rem.us.i.7.8 = srem i32 %add8.us.i.7.8, 1
68+
%invariant.op166 = add i32 %invariant.op165, %rem.us.i.7.8
69+
%add8.us.i.8 = or i32 %0, 0
70+
%rem.us.i.8 = srem i32 %add8.us.i.8, 1
71+
%invariant.op167 = add i32 %invariant.op166, %rem.us.i.8
72+
%add8.us.i.8.1 = or i32 %1, 0
73+
%rem.us.i.8.1 = srem i32 %add8.us.i.8.1, 1
74+
%invariant.op168 = add i32 %invariant.op167, %rem.us.i.8.1
75+
%add8.us.i.8.2 = or i32 %2, 0
76+
%rem.us.i.8.2 = srem i32 %add8.us.i.8.2, 1
77+
%invariant.op169 = add i32 %invariant.op168, %rem.us.i.8.2
78+
%add8.us.i.8.3 = or i32 %3, 0
79+
%rem.us.i.8.3 = srem i32 %add8.us.i.8.3, 1
80+
%invariant.op170 = add i32 %invariant.op169, %rem.us.i.8.3
81+
%add8.us.i.8.4 = or i32 %4, 0
82+
%rem.us.i.8.4 = srem i32 %add8.us.i.8.4, 1
83+
%invariant.op171 = add i32 %invariant.op170, %rem.us.i.8.4
84+
%add8.us.i.8.5 = or i32 %5, 0
85+
%rem.us.i.8.5 = srem i32 %add8.us.i.8.5, 1
86+
%invariant.op172 = add i32 %invariant.op171, %rem.us.i.8.5
87+
%add8.us.i.8.6 = or i32 %6, 0
88+
%rem.us.i.8.6 = srem i32 %add8.us.i.8.6, 0
89+
%invariant.op173 = add i32 %invariant.op172, %rem.us.i.8.6
90+
%add8.us.i.8.7 = or i32 %7, 0
91+
%rem.us.i.8.7 = srem i32 %add8.us.i.8.7, 0
92+
%invariant.op174 = add i32 %invariant.op173, %rem.us.i.8.7
93+
%invariant.op181 = add i32 %invariant.op174, 0
94+
%invariant.op182 = add i32 %invariant.op181, 0
95+
%invariant.op183 = add i32 %invariant.op182, 0
96+
%invariant.op184 = add i32 %invariant.op183, 0
97+
%invariant.op185 = add i32 %invariant.op184, 0
98+
%invariant.op186 = add i32 %invariant.op185, 0
99+
%invariant.op187 = add i32 %invariant.op186, 0
100+
%invariant.op188 = add i32 %invariant.op187, 0
101+
%add8.us.i.11.1 = or i32 %4, 0
102+
%rem.us.i.11.1 = srem i32 %add8.us.i.11.1, 1
103+
%invariant.op189 = add i32 %invariant.op188, %rem.us.i.11.1
104+
%add8.us.i.11.2 = add i32 0, 0
105+
%rem.us.i.11.2 = srem i32 %add8.us.i.11.2, 1
106+
%invariant.op190 = add i32 %invariant.op189, %rem.us.i.11.2
107+
%add8.us.i.11.3 = add i32 %6, %2
108+
%rem.us.i.11.3 = srem i32 %add8.us.i.11.3, 1
109+
%invariant.op191 = add i32 %invariant.op190, %rem.us.i.11.3
110+
%add8.us.i.11.4 = add i32 %7, %2
111+
%rem.us.i.11.4 = srem i32 %add8.us.i.11.4, 1
112+
%invariant.op192 = add i32 %invariant.op191, %rem.us.i.11.4
113+
%8 = load i32, ptr %p, align 4
114+
%add8.us.i.12 = add i32 %4, %8
115+
%rem.us.i.12 = srem i32 %add8.us.i.12, 1
116+
%invariant.op193 = add i32 %invariant.op192, %rem.us.i.12
117+
%add8.us.i.12.1 = add i32 %5, %8
118+
%rem.us.i.12.1 = srem i32 %add8.us.i.12.1, 1
119+
%invariant.op194 = add i32 %invariant.op193, %rem.us.i.12.1
120+
%add8.us.i.12.2 = add i32 0, 0
121+
%rem.us.i.12.2 = srem i32 %add8.us.i.12.2, 1
122+
%invariant.op195 = add i32 %invariant.op194, %rem.us.i.12.2
123+
%add8.us.i.12.3 = add i32 0, 0
124+
%rem.us.i.12.3 = srem i32 %add8.us.i.12.3, 1
125+
%invariant.op196 = add i32 %invariant.op195, %rem.us.i.12.3
126+
%add8.us.i.13 = add i32 0, 0
127+
%rem.us.i.13 = srem i32 %add8.us.i.13, 0
128+
%invariant.op197 = add i32 %invariant.op196, %rem.us.i.13
129+
%add8.us.i.13.1 = add i32 0, 0
130+
%rem.us.i.13.1 = srem i32 %add8.us.i.13.1, 0
131+
%invariant.op198 = add i32 %invariant.op197, %rem.us.i.13.1
132+
%add8.us.i.13.2 = add i32 0, 0
133+
%rem.us.i.13.2 = srem i32 %add8.us.i.13.2, 1
134+
%invariant.op199 = add i32 %invariant.op198, %rem.us.i.13.2
135+
%add8.us.i.14 = add i32 0, 0
136+
%rem.us.i.14 = srem i32 %add8.us.i.14, 1
137+
%invariant.op200 = add i32 %invariant.op199, %rem.us.i.14
138+
%add8.us.i.14.1 = add i32 0, 0
139+
%rem.us.i.14.1 = srem i32 %add8.us.i.14.1, 1
140+
%invariant.op201 = add i32 %invariant.op200, %rem.us.i.14.1
141+
%add8.us.i.15 = add i32 0, 0
142+
%rem.us.i.15 = srem i32 %add8.us.i.15, 1
143+
%invariant.op202 = add i32 %invariant.op201, %rem.us.i.15
144+
br label %for.cond1.preheader.us.i
145+
146+
for.cond1.preheader.us.i:
147+
%a.promoted253537.us.i = phi i32 [ %add9.us.i.15.reass, %for.cond1.preheader.us.i ], [ 0, %entry ]
148+
%add9.us.i.15.reass = add i32 %invariant.op92, %invariant.op202
149+
br label %for.cond1.preheader.us.i
150+
}
151+

0 commit comments

Comments
 (0)