Skip to content

Commit b9702bb

Browse files
committed
[LV] Consider insts feeding interleave group pointers free.
For interleave groups, we only generate a pointer for the start of the interleave group (the instruction at the insert position). The addresses of the other members are already considered free, but so are their operands if they are only used in address computations for other interleave group members.
1 parent 04cd069 commit b9702bb

File tree

3 files changed

+34
-26
lines changed

3 files changed

+34
-26
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7048,6 +7048,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
70487048
// Ignore ephemeral values.
70497049
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
70507050

7051+
SmallSetVector<Value *, 4> DeadInterleavePointerOps;
70517052
for (BasicBlock *BB : TheLoop->blocks())
70527053
for (Instruction &I : *BB) {
70537054
// Find all stores to invariant variables. Since they are going to sink
@@ -7058,25 +7059,32 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
70587059
ValuesToIgnore.insert(&I);
70597060

70607061
// For interleave groups, we only create a pointer for the start of the
7061-
// interleave group. Mark single-use ops feeding interleave group mem ops
7062-
// as free when vectorizing, expect the insert-pos memory op.
7062+
// interleave group. Queue up addresses of group members except the insert
7063+
// position for further processing.
70637064
if (isAccessInterleaved(&I)) {
70647065
auto *Group = getInterleavedAccessGroup(&I);
70657066
if (Group->getInsertPos() == &I)
70667067
continue;
70677068
Value *PointerOp = getLoadStorePointerOperand(&I);
7068-
SmallSetVector<Value *, 4> Worklist;
7069-
Worklist.insert(PointerOp);
7070-
for (unsigned I = 0; I != Worklist.size(); ++I) {
7071-
auto *Op = dyn_cast<Instruction>(Worklist[I]);
7072-
if (!Op || !TheLoop->contains(Op) || !Op->hasOneUse())
7073-
continue;
7074-
VecValuesToIgnore.insert(Op);
7075-
Worklist.insert(Op->op_begin(), Op->op_end());
7076-
}
7069+
DeadInterleavePointerOps.insert(PointerOp);
70777070
}
70787071
}
70797072

7073+
// Mark ops feeding interleave group members as free, if they are only used
7074+
// by other dead computations.
7075+
for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
7076+
auto *Op = dyn_cast<Instruction>(DeadInterleavePointerOps[I]);
7077+
if (!Op || !TheLoop->contains(Op) || any_of(Op->users(), [this](User *U) {
7078+
Instruction *UI = cast<Instruction>(U);
7079+
return !VecValuesToIgnore.contains(U) &&
7080+
(!isAccessInterleaved(UI) ||
7081+
getInterleavedAccessGroup(UI)->getInsertPos() == UI);
7082+
}))
7083+
continue;
7084+
VecValuesToIgnore.insert(Op);
7085+
DeadInterleavePointerOps.insert(Op->op_begin(), Op->op_end());
7086+
}
7087+
70807088
// Ignore type-promoting instructions we identified during reduction
70817089
// detection.
70827090
for (const auto &Reduction : Legal->getReductionVars()) {

llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,29 +86,29 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali
8686
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
8787
; CHECK-NEXT: [[TMP39:%.*]] = add i64 [[INDEX]], 0
8888
; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[P_INVAR]], align 4
89-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP40]], i64 0
90-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
89+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP40]], i64 0
90+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
9191
; CHECK-NEXT: [[TMP41:%.*]] = shl i64 [[TMP39]], 2
9292
; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[P_INVAR]], align 4
93-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <4 x float> poison, float [[TMP42]], i64 0
94-
; CHECK-NEXT: [[BROADCAST_SPLAT34:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT33]], <4 x float> poison, <4 x i32> zeroinitializer
93+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <2 x float> poison, float [[TMP42]], i64 0
94+
; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT27]], <2 x float> poison, <2 x i32> zeroinitializer
9595
; CHECK-NEXT: [[TMP43:%.*]] = or disjoint i64 [[TMP41]], 3
9696
; CHECK-NEXT: [[TMP44:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[TMP43]]
9797
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr float, ptr [[TMP44]], i32 -3
98-
; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT]], <4 x float> [[BROADCAST_SPLAT34]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
99-
; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <8 x float> [[TMP46]], <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
100-
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x float> [[TMP47]], <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
101-
; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP45]], align 4
98+
; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT]], <2 x float> [[BROADCAST_SPLAT28]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
99+
; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <4 x float> [[TMP46]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
100+
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP47]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
101+
; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP45]], align 4
102102
; CHECK-NEXT: [[TMP48:%.*]] = load float, ptr [[P_INVAR]], align 4
103-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <4 x float> poison, float [[TMP48]], i64 0
104-
; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT35]], <4 x float> poison, <4 x i32> zeroinitializer
103+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <2 x float> poison, float [[TMP48]], i64 0
104+
; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT29]], <2 x float> poison, <2 x i32> zeroinitializer
105105
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr float, ptr [[DST_2]], i64 [[TMP43]]
106106
; CHECK-NEXT: [[TMP50:%.*]] = getelementptr float, ptr [[TMP49]], i32 -3
107+
; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLAT30]], <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
107108
; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLAT36]], <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
108-
; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
109-
; CHECK-NEXT: [[INTERLEAVED_VEC37:%.*]] = shufflevector <16 x float> [[TMP52]], <16 x float> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
110-
; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC37]], ptr [[TMP50]], align 4
111-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
109+
; CHECK-NEXT: [[INTERLEAVED_VEC31:%.*]] = shufflevector <8 x float> [[TMP51]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
110+
; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC31]], ptr [[TMP50]], align 4
111+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
112112
; CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
113113
; CHECK-NEXT: br i1 [[TMP53]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
114114
; CHECK: [[MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/X86/pr47437.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
44
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=AVX1
55
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
6-
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefix=SSE2
6+
; RUN: opt < %s -S -passes=loop-vectorize -mtriple=x86_64-- -mcpu=slm | FileCheck %s --check-prefix=SSE41
77

88
define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readonly %s1, ptr noalias nocapture readonly %s2, i32 %n) {
99
; SSE2-LABEL: @test_muladd(

0 commit comments

Comments
 (0)