
Commit 50dfc9e

[LoopLoadElimination] Add support for stride equal to -1
This patch allows descending loops to gain all the benefits provided by the LoopLoadElimination pass.

Differential Revision: https://reviews.llvm.org/D151448
Parent: a26bd95
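For illustration, this is the kind of descending loop the pass can now handle; it mirrors the C-level comment in the new test added below (the function name and types are only illustrative):

// Descending loop with a store-to-load dependence of absolute distance one:
// the value stored to A[i-1] in one iteration is exactly the value the next
// iteration loads as A[i], so the load can be forwarded instead of re-read.
void g(int *__restrict A, const int *__restrict B, unsigned long N) {
  for (unsigned long i = N; i > 0; i--)
    A[i - 1] = A[i] + B[i];
}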

2 files changed, +48 −8 lines changed

llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp: +17 −8
@@ -88,8 +88,9 @@ struct StoreToLoadForwardingCandidate {
   StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
       : Load(Load), Store(Store) {}

-  /// Return true if the dependence from the store to the load has a
-  /// distance of one. E.g. A[i+1] = A[i]
+  /// Return true if the dependence from the store to the load has an
+  /// absolute distance of one.
+  /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop)
   bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
                                  Loop *L) const {
     Value *LoadPtr = Load->getPointerOperand();
@@ -103,11 +104,19 @@ struct StoreToLoadForwardingCandidate {
                DL.getTypeSizeInBits(getLoadStoreType(Store)) &&
            "Should be a known dependence");

-    // Currently we only support accesses with unit stride. FIXME: we should be
-    // able to handle non unit stirde as well as long as the stride is equal to
-    // the dependence distance.
-    if (getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0) != 1 ||
-        getPtrStride(PSE, LoadType, StorePtr, L).value_or(0) != 1)
+    int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0);
+    int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0);
+    if (!StrideLoad || !StrideStore || StrideLoad != StrideStore)
+      return false;
+
+    // TODO: This check for stride values other than 1 and -1 can be eliminated.
+    // However, doing so may cause the LoopAccessAnalysis to overcompensate,
+    // generating numerous non-wrap runtime checks that may undermine the
+    // benefits of load elimination. To safely implement support for non-unit
+    // strides, we would need to ensure either that the processed case does not
+    // require these additional checks, or improve the LAA to handle them more
+    // efficiently, or potentially both.
+    if (std::abs(StrideLoad) != 1)
       return false;

     unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
@@ -120,7 +129,7 @@ struct StoreToLoadForwardingCandidate {
     auto *Dist = cast<SCEVConstant>(
         PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
     const APInt &Val = Dist->getAPInt();
-    return Val == TypeByteSize;
+    return Val == TypeByteSize * StrideLoad;
   }

   Value *getLoadPtr() const { return Load->getPointerOperand(); }
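
As a rough worked example of the updated check (a standalone sketch; the function below is not the LLVM API, and the concrete byte values are assumed for an i32 array): for A[i-1] = A[i] the pointer distance Store − Load is −4 bytes and both strides are −1, so the new condition Val == TypeByteSize * StrideLoad accepts the candidate, just as it accepts the ascending +4 / +1 case.

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Standalone mock of the patched condition (illustrative names, not LLVM's):
// both accesses must have the same unit stride (+1 or -1), and the byte
// distance from load to store must equal stride * element size.
static bool isDependenceDistanceOfOne(int64_t DistBytes, int64_t StrideLoad,
                                      int64_t StrideStore,
                                      int64_t TypeByteSize) {
  if (!StrideLoad || !StrideStore || StrideLoad != StrideStore)
    return false;
  if (std::abs(StrideLoad) != 1) // non-unit strides still rejected, as in the patch
    return false;
  return DistBytes == TypeByteSize * StrideLoad;
}

int main() {
  // Ascending A[i+1] = A[i] over i32: distance +4 bytes, stride +1.
  assert(isDependenceDistanceOfOne(+4, +1, +1, 4));
  // Descending A[i-1] = A[i] over i32: distance -4 bytes, stride -1.
  assert(isDependenceDistanceOfOne(-4, -1, -1, 4));
  // Mismatched or non-unit strides are rejected.
  assert(!isDependenceDistanceOfOne(-4, -1, +1, 4));
  assert(!isDependenceDistanceOfOne(+8, +2, +2, 4));
  return 0;
}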

llvm/test/Transforms/LoopLoadElim/backward.ll: +31
@@ -30,3 +30,34 @@ for.body: ; preds = %for.body, %entry
 for.end:                                          ; preds = %for.body
   ret void
 }
+
+; Same but loop is descending.
+;
+;   for (unsigned i = N; i > 0; i--)
+;     A[i-1] = A[i] + B[i];
+define void @g(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %N) {
+entry:
+; CHECK: %0 = shl i64 %N, 2
+; CHECK: %scevgep = getelementptr i8, ptr %A, i64 %0
+; CHECK: %load_initial = load i32, ptr %scevgep, align 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; CHECK: %store_forwarded = phi i32 [ %load_initial, %entry ], [ %add, %for.body ]
+  %i.09 = phi i64 [ %sub, %for.body ], [ %N, %entry ]
+  %arrayidx = getelementptr inbounds i32, ptr %A, i64 %i.09
+  %load = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %B, i64 %i.09
+  %load_1 = load i32, ptr %arrayidx1, align 4
+; CHECK: %add = add i32 %load_1, %store_forwarded
+  %add = add i32 %load_1, %load
+  %sub = add i64 %i.09, -1
+  %arrayidx2 = getelementptr inbounds i32, ptr %A, i64 %sub
+  store i32 %add, ptr %arrayidx2, align 4
+  %cmp.not = icmp eq i64 %sub, 0
+  br i1 %cmp.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
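In source terms, the transformation the CHECK lines above verify amounts to roughly the following rewrite of the descending loop (a sketch of the expected effect, not compiler output; names follow the IR above):

void g_after_forwarding(int *__restrict A, const int *__restrict B,
                        unsigned long N) {
  int store_forwarded = A[N];            // %load_initial, loaded from %scevgep = A + 4*N
  for (unsigned long i = N; i > 0; i--) {
    int add = B[i] + store_forwarded;    // reuses the value stored last iteration
    A[i - 1] = add;
    store_forwarded = add;               // carried by the %store_forwarded PHI
  }
}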
