Skip to content

Commit 019c2e1

Browse files
committed
LAA: generalize strides over unequal type sizes
getDepdenceDistanceStrideAndSize currently returns a non-zero TypeByteSize only if the type-sizes of the source and sink are equal. The non-zero TypeByteSize is then used by isDependent to scale the strides, the distance between the accesses, and the VF. This restriction is very artificial, as the stride sizes can be scaled by the respective type-sizes in advance, freeing isDependent of this responsibility, and removing the ugly special-case of zero-TypeByteSize. The patch also has the side-effect of fixing the long-standing TODO of requesting runtime-checks when the strides are unequal. The test impact of this patch is that several false depdendencies are eliminated, and several unknown depdendencies now come with runtime-checks instead.
1 parent bdf241c commit 019c2e1

File tree

6 files changed

+143
-127
lines changed

6 files changed

+143
-127
lines changed

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -366,16 +366,20 @@ class MemoryDepChecker {
366366

367367
struct DepDistanceStrideAndSizeInfo {
368368
const SCEV *Dist;
369-
uint64_t StrideA;
370-
uint64_t StrideB;
369+
uint64_t MaxStride;
370+
std::optional<uint64_t> CommonStride;
371+
bool ShouldRetryWithRuntimeCheck;
371372
uint64_t TypeByteSize;
372373
bool AIsWrite;
373374
bool BIsWrite;
374375

375-
DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t StrideA,
376-
uint64_t StrideB, uint64_t TypeByteSize,
377-
bool AIsWrite, bool BIsWrite)
378-
: Dist(Dist), StrideA(StrideA), StrideB(StrideB),
376+
DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t MaxStride,
377+
std::optional<uint64_t> CommonStride,
378+
bool ShouldRetryWithRuntimeCheck,
379+
uint64_t TypeByteSize, bool AIsWrite,
380+
bool BIsWrite)
381+
: Dist(Dist), MaxStride(MaxStride), CommonStride(CommonStride),
382+
ShouldRetryWithRuntimeCheck(ShouldRetryWithRuntimeCheck),
379383
TypeByteSize(TypeByteSize), AIsWrite(AIsWrite), BIsWrite(BIsWrite) {}
380384
};
381385

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 88 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1803,8 +1803,7 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
18031803
/// }
18041804
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
18051805
const SCEV &MaxBTC, const SCEV &Dist,
1806-
uint64_t MaxStride,
1807-
uint64_t TypeByteSize) {
1806+
uint64_t MaxStride) {
18081807

18091808
// If we can prove that
18101809
// (**) |Dist| > MaxBTC * Step
@@ -1823,8 +1822,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
18231822
// will be executed only if LoopCount >= VF, proving distance >= LoopCount
18241823
// also guarantees that distance >= VF.
18251824
//
1826-
const uint64_t ByteStride = MaxStride * TypeByteSize;
1827-
const SCEV *Step = SE.getConstant(MaxBTC.getType(), ByteStride);
1825+
const SCEV *Step = SE.getConstant(MaxBTC.getType(), MaxStride);
18281826
const SCEV *Product = SE.getMulExpr(&MaxBTC, Step);
18291827

18301828
const SCEV *CastedDist = &Dist;
@@ -1868,9 +1866,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
18681866
if (Distance % TypeByteSize)
18691867
return false;
18701868

1871-
uint64_t ScaledDist = Distance / TypeByteSize;
1872-
1873-
// No dependence if the scaled distance is not multiple of the stride.
1869+
// No dependence if the distance is not multiple of the stride.
18741870
// E.g.
18751871
// for (i = 0; i < 1024 ; i += 4)
18761872
// A[i+2] = A[i] + 1;
@@ -1886,7 +1882,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
18861882
// Two accesses in memory (scaled distance is 4, stride is 3):
18871883
// | A[0] | | | A[3] | | | A[6] | | |
18881884
// | | | | | A[4] | | | A[7] | |
1889-
return ScaledDist % Stride;
1885+
return Distance % Stride;
18901886
}
18911887

18921888
std::variant<MemoryDepChecker::Dependence::DepType,
@@ -1925,6 +1921,7 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
19251921
if (StrideAPtr && *StrideAPtr < 0) {
19261922
std::swap(Src, Sink);
19271923
std::swap(AInst, BInst);
1924+
std::swap(ATy, BTy);
19281925
std::swap(StrideAPtr, StrideBPtr);
19291926
}
19301927

@@ -1976,31 +1973,68 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
19761973
return MemoryDepChecker::Dependence::IndirectUnsafe;
19771974
}
19781975

1979-
int64_t StrideAPtrInt = *StrideAPtr;
1980-
int64_t StrideBPtrInt = *StrideBPtr;
1981-
LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
1982-
<< " Sink induction step: " << StrideBPtrInt << "\n");
1976+
LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << *StrideAPtr
1977+
<< " Sink induction step: " << *StrideBPtr << "\n");
1978+
1979+
// Note that store size is different from alloc size, which is dependent on
1980+
// store size. We use the former for checking illegal cases, and the latter
1981+
// for scaling strides.
1982+
TypeSize AStoreSz = DL.getTypeStoreSize(ATy),
1983+
BStoreSz = DL.getTypeStoreSize(BTy);
1984+
1985+
// When the distance is zero, we're reading/writing the same memory location:
1986+
// check that the store sizes are equal. Otherwise, fail with an unknown
1987+
// dependence for which we should not generate runtime checks.
1988+
if (Dist->isZero() && AStoreSz != BStoreSz)
1989+
return MemoryDepChecker::Dependence::Unknown;
1990+
1991+
// We can't get get a uint64_t for the AllocSize if either of the store sizes
1992+
// are scalable.
1993+
if (AStoreSz.isScalable() || BStoreSz.isScalable())
1994+
return MemoryDepChecker::Dependence::Unknown;
1995+
1996+
// The TypeByteSize is used to scale Distance and VF. In these contexts, the
1997+
// only size that matters is the size of the Sink.
1998+
uint64_t ASz = alignTo(AStoreSz, DL.getABITypeAlign(ATy).value()),
1999+
TypeByteSize = alignTo(BStoreSz, DL.getABITypeAlign(BTy).value());
2000+
2001+
// We scale the strides by the alloc-type-sizes, so we can check that the
2002+
// common distance is equal when ASz != BSz.
2003+
int64_t StrideAScaled = *StrideAPtr * ASz;
2004+
int64_t StrideBScaled = *StrideBPtr * TypeByteSize;
2005+
19832006
// At least Src or Sink are loop invariant and the other is strided or
19842007
// invariant. We can generate a runtime check to disambiguate the accesses.
1985-
if (StrideAPtrInt == 0 || StrideBPtrInt == 0)
2008+
if (!StrideAScaled || !StrideBScaled)
19862009
return MemoryDepChecker::Dependence::Unknown;
19872010

19882011
// Both Src and Sink have a constant stride, check if they are in the same
19892012
// direction.
1990-
if ((StrideAPtrInt > 0 && StrideBPtrInt < 0) ||
1991-
(StrideAPtrInt < 0 && StrideBPtrInt > 0)) {
2013+
if (StrideAScaled > 0 != StrideBScaled > 0) {
19922014
LLVM_DEBUG(
19932015
dbgs() << "Pointer access with strides in different directions\n");
19942016
return MemoryDepChecker::Dependence::Unknown;
19952017
}
19962018

1997-
uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
1998-
bool HasSameSize =
1999-
DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
2000-
if (!HasSameSize)
2001-
TypeByteSize = 0;
2002-
return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtrInt),
2003-
std::abs(StrideBPtrInt), TypeByteSize,
2019+
StrideAScaled = std::abs(StrideAScaled);
2020+
StrideBScaled = std::abs(StrideBScaled);
2021+
2022+
// MaxStride is the max of the scaled strides, as expected.
2023+
uint64_t MaxStride = std::max(StrideAScaled, StrideBScaled);
2024+
2025+
// CommonStride is set if both scaled strides are equal.
2026+
std::optional<uint64_t> CommonStride;
2027+
if (StrideAScaled == StrideBScaled)
2028+
CommonStride = StrideAScaled;
2029+
2030+
// TODO: Historically, we don't retry with runtime checks unless the unscaled
2031+
// strides are the same, but this doesn't make sense. Fix this once the
2032+
// condition for runtime checks in isDependent is fixed.
2033+
bool ShouldRetryWithRuntimeCheck =
2034+
std::abs(*StrideAPtr) == std::abs(*StrideBPtr);
2035+
2036+
return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
2037+
ShouldRetryWithRuntimeCheck, TypeByteSize,
20042038
AIsWrite, BIsWrite);
20052039
}
20062040

@@ -2016,47 +2050,40 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20162050
if (std::holds_alternative<Dependence::DepType>(Res))
20172051
return std::get<Dependence::DepType>(Res);
20182052

2019-
auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] =
2053+
auto &[Dist, MaxStride, CommonStride, ShouldRetryWithRuntimeCheck,
2054+
TypeByteSize, AIsWrite, BIsWrite] =
20202055
std::get<DepDistanceStrideAndSizeInfo>(Res);
2021-
bool HasSameSize = TypeByteSize > 0;
20222056

2023-
std::optional<uint64_t> CommonStride =
2024-
StrideA == StrideB ? std::make_optional(StrideA) : std::nullopt;
20252057
if (isa<SCEVCouldNotCompute>(Dist)) {
2026-
// TODO: Relax requirement that there is a common stride to retry with
2027-
// non-constant distance dependencies.
2028-
FoundNonConstantDistanceDependence |= CommonStride.has_value();
2058+
// TODO: Relax requirement that there is a common unscaled stride to retry
2059+
// with non-constant distance dependencies.
2060+
FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
20292061
LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n");
20302062
return Dependence::Unknown;
20312063
}
20322064

20332065
ScalarEvolution &SE = *PSE.getSE();
20342066
auto &DL = InnermostLoop->getHeader()->getDataLayout();
2035-
uint64_t MaxStride = std::max(StrideA, StrideB);
20362067

20372068
// If the distance between the acecsses is larger than their maximum absolute
20382069
// stride multiplied by the symbolic maximum backedge taken count (which is an
20392070
// upper bound of the number of iterations), the accesses are independet, i.e.
20402071
// they are far enough appart that accesses won't access the same location
20412072
// across all loop ierations.
2042-
if (HasSameSize && isSafeDependenceDistance(
2043-
DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
2044-
*Dist, MaxStride, TypeByteSize))
2073+
if (isSafeDependenceDistance(
2074+
DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist, MaxStride))
20452075
return Dependence::NoDep;
20462076

2047-
const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
2077+
const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
20482078

20492079
// Attempt to prove strided accesses independent.
2050-
if (C) {
2051-
const APInt &Val = C->getAPInt();
2052-
int64_t Distance = Val.getSExtValue();
2080+
if (ConstDist) {
2081+
int64_t Distance = std::abs(ConstDist->getAPInt().getSExtValue());
20532082

20542083
// If the distance between accesses and their strides are known constants,
20552084
// check whether the accesses interlace each other.
2056-
if (std::abs(Distance) > 0 && CommonStride && *CommonStride > 1 &&
2057-
HasSameSize &&
2058-
areStridedAccessesIndependent(std::abs(Distance), *CommonStride,
2059-
TypeByteSize)) {
2085+
if (Distance > 0 && CommonStride && CommonStride > 1 &&
2086+
areStridedAccessesIndependent(Distance, *CommonStride, TypeByteSize)) {
20602087
LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
20612088
return Dependence::NoDep;
20622089
}
@@ -2069,15 +2096,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20692096

20702097
// Negative distances are not plausible dependencies.
20712098
if (SE.isKnownNonPositive(Dist)) {
2072-
if (SE.isKnownNonNegative(Dist)) {
2073-
if (HasSameSize) {
2074-
// Write to the same location with the same size.
2075-
return Dependence::Forward;
2076-
}
2077-
LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
2078-
"different type sizes\n");
2079-
return Dependence::Unknown;
2080-
}
2099+
if (SE.isKnownNonNegative(Dist))
2100+
// Write to the same location.
2101+
return Dependence::Forward;
20812102

20822103
bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
20832104
// Check if the first access writes to a location that is read in a later
@@ -2089,17 +2110,16 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
20892110
// forward dependency will allow vectorization using any width.
20902111

20912112
if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2092-
if (!C) {
2113+
if (!ConstDist) {
20932114
// TODO: FoundNonConstantDistanceDependence is used as a necessary
20942115
// condition to consider retrying with runtime checks. Historically, we
2095-
// did not set it when strides were different but there is no inherent
2096-
// reason to.
2097-
FoundNonConstantDistanceDependence |= CommonStride.has_value();
2116+
// did not set it when unscaled strides were different but there is no
2117+
// inherent reason to.
2118+
FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
20982119
return Dependence::Unknown;
20992120
}
2100-
if (!HasSameSize ||
2101-
couldPreventStoreLoadForward(C->getAPInt().abs().getZExtValue(),
2102-
TypeByteSize)) {
2121+
if (couldPreventStoreLoadForward(
2122+
ConstDist->getAPInt().abs().getZExtValue(), TypeByteSize)) {
21032123
LLVM_DEBUG(
21042124
dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
21052125
return Dependence::ForwardButPreventsForwarding;
@@ -2113,27 +2133,20 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21132133
int64_t MinDistance = SE.getSignedRangeMin(Dist).getSExtValue();
21142134
// Below we only handle strictly positive distances.
21152135
if (MinDistance <= 0) {
2116-
FoundNonConstantDistanceDependence |= CommonStride.has_value();
2136+
FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
21172137
return Dependence::Unknown;
21182138
}
21192139

2120-
if (!isa<SCEVConstant>(Dist)) {
2140+
if (!ConstDist)
21212141
// Previously this case would be treated as Unknown, possibly setting
21222142
// FoundNonConstantDistanceDependence to force re-trying with runtime
21232143
// checks. Until the TODO below is addressed, set it here to preserve
21242144
// original behavior w.r.t. re-trying with runtime checks.
21252145
// TODO: FoundNonConstantDistanceDependence is used as a necessary
21262146
// condition to consider retrying with runtime checks. Historically, we
2127-
// did not set it when strides were different but there is no inherent
2128-
// reason to.
2129-
FoundNonConstantDistanceDependence |= CommonStride.has_value();
2130-
}
2131-
2132-
if (!HasSameSize) {
2133-
LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
2134-
"different type sizes\n");
2135-
return Dependence::Unknown;
2136-
}
2147+
// did not set it when unscaled strides were different but there is no
2148+
// inherent reason to.
2149+
FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
21372150

21382151
if (!CommonStride)
21392152
return Dependence::Unknown;
@@ -2148,8 +2161,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21482161

21492162
// It's not vectorizable if the distance is smaller than the minimum distance
21502163
// needed for a vectroized/unrolled version. Vectorizing one iteration in
2151-
// front needs TypeByteSize * Stride. Vectorizing the last iteration needs
2152-
// TypeByteSize (No need to plus the last gap distance).
2164+
// front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
2165+
// (No need to plus the last gap distance).
21532166
//
21542167
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
21552168
// foo(int *A) {
@@ -2176,10 +2189,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
21762189
// We know that Dist is positive, but it may not be constant. Use the signed
21772190
// minimum for computations below, as this ensures we compute the closest
21782191
// possible dependence distance.
2179-
uint64_t MinDistanceNeeded =
2180-
TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
2192+
uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
21812193
if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
2182-
if (!isa<SCEVConstant>(Dist)) {
2194+
if (!ConstDist) {
21832195
// For non-constant distances, we checked the lower bound of the
21842196
// dependence distance and the distance may be larger at runtime (and safe
21852197
// for vectorization). Classify it as Unknown, so we re-try with runtime
@@ -2234,12 +2246,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
22342246

22352247
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
22362248
// since there is a backwards dependency.
2237-
uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
2249+
uint64_t MaxVF = MinDepDistBytes / *CommonStride;
22382250
LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
22392251
<< " with max VF = " << MaxVF << '\n');
22402252

22412253
uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
2242-
if (!isa<SCEVConstant>(Dist) && MaxVFInBits < MaxTargetVectorWidthInBits) {
2254+
if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) {
22432255
// For non-constant distances, we checked the lower bound of the dependence
22442256
// distance and the distance may be larger at runtime (and safe for
22452257
// vectorization). Classify it as Unknown, so we re-try with runtime checks.

llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -130,16 +130,8 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
130130
; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence'
131131
; CHECK-NEXT: loop:
132132
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
133-
; CHECK-NEXT: Unknown data dependence.
133+
; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
134134
; CHECK-NEXT: Dependences:
135-
; CHECK-NEXT: Unknown:
136-
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
137-
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
138-
; CHECK-EMPTY:
139-
; CHECK-NEXT: Unknown:
140-
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 ->
141-
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
142-
; CHECK-EMPTY:
143135
; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
144136
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
145137
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8

llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,6 @@ define void @forward_different_access_sizes(ptr readnone %end, ptr %start) {
7070
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
7171
; CHECK-NEXT: %l = load i24, ptr %gep.1, align 1
7272
; CHECK-EMPTY:
73-
; CHECK-NEXT: Forward:
74-
; CHECK-NEXT: store i32 0, ptr %gep.2, align 4 ->
75-
; CHECK-NEXT: store i24 %l, ptr %ptr.iv, align 1
76-
; CHECK-EMPTY:
7773
; CHECK-NEXT: Run-time memory checks:
7874
; CHECK-NEXT: Grouped accesses:
7975
; CHECK-EMPTY:

0 commit comments

Comments
 (0)