@@ -1803,8 +1803,7 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
1803
1803
// / }
1804
1804
static bool isSafeDependenceDistance (const DataLayout &DL, ScalarEvolution &SE,
1805
1805
const SCEV &MaxBTC, const SCEV &Dist,
1806
- uint64_t MaxStride,
1807
- uint64_t TypeByteSize) {
1806
+ uint64_t MaxStride) {
1808
1807
1809
1808
// If we can prove that
1810
1809
// (**) |Dist| > MaxBTC * Step
@@ -1823,8 +1822,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
1823
1822
// will be executed only if LoopCount >= VF, proving distance >= LoopCount
1824
1823
// also guarantees that distance >= VF.
1825
1824
//
1826
- const uint64_t ByteStride = MaxStride * TypeByteSize;
1827
- const SCEV *Step = SE.getConstant (MaxBTC.getType (), ByteStride);
1825
+ const SCEV *Step = SE.getConstant (MaxBTC.getType (), MaxStride);
1828
1826
const SCEV *Product = SE.getMulExpr (&MaxBTC, Step);
1829
1827
1830
1828
const SCEV *CastedDist = &Dist;
@@ -1868,9 +1866,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1868
1866
if (Distance % TypeByteSize)
1869
1867
return false ;
1870
1868
1871
- uint64_t ScaledDist = Distance / TypeByteSize;
1872
-
1873
- // No dependence if the scaled distance is not multiple of the stride.
1869
+ // No dependence if the distance is not multiple of the stride.
1874
1870
// E.g.
1875
1871
// for (i = 0; i < 1024 ; i += 4)
1876
1872
// A[i+2] = A[i] + 1;
@@ -1886,7 +1882,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
1886
1882
// Two accesses in memory (scaled distance is 4, stride is 3):
1887
1883
// | A[0] | | | A[3] | | | A[6] | | |
1888
1884
// | | | | | A[4] | | | A[7] | |
1889
- return ScaledDist % Stride;
1885
+ return Distance % Stride;
1890
1886
}
1891
1887
1892
1888
std::variant<MemoryDepChecker::Dependence::DepType,
@@ -1925,6 +1921,7 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
1925
1921
if (StrideAPtr && *StrideAPtr < 0 ) {
1926
1922
std::swap (Src, Sink);
1927
1923
std::swap (AInst, BInst);
1924
+ std::swap (ATy, BTy);
1928
1925
std::swap (StrideAPtr, StrideBPtr);
1929
1926
}
1930
1927
@@ -1976,31 +1973,68 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
1976
1973
return MemoryDepChecker::Dependence::IndirectUnsafe;
1977
1974
}
1978
1975
1979
- int64_t StrideAPtrInt = *StrideAPtr;
1980
- int64_t StrideBPtrInt = *StrideBPtr;
1981
- LLVM_DEBUG (dbgs () << " LAA: Src induction step: " << StrideAPtrInt
1982
- << " Sink induction step: " << StrideBPtrInt << " \n " );
1976
+ LLVM_DEBUG (dbgs () << " LAA: Src induction step: " << *StrideAPtr
1977
+ << " Sink induction step: " << *StrideBPtr << " \n " );
1978
+
1979
+ // Note that store size is different from alloc size, which is dependent on
1980
+ // store size. We use the former for checking illegal cases, and the latter
1981
+ // for scaling strides.
1982
+ TypeSize AStoreSz = DL.getTypeStoreSize (ATy),
1983
+ BStoreSz = DL.getTypeStoreSize (BTy);
1984
+
1985
+ // When the distance is zero, we're reading/writing the same memory location:
1986
+ // check that the store sizes are equal. Otherwise, fail with an unknown
1987
+ // dependence for which we should not generate runtime checks.
1988
+ if (Dist->isZero () && AStoreSz != BStoreSz)
1989
+ return MemoryDepChecker::Dependence::Unknown;
1990
+
1991
+ // We can't get a uint64_t for the AllocSize if either of the store sizes
1992
+ // are scalable.
1993
+ if (AStoreSz.isScalable () || BStoreSz.isScalable ())
1994
+ return MemoryDepChecker::Dependence::Unknown;
1995
+
1996
+ // The TypeByteSize is used to scale Distance and VF. In these contexts, the
1997
+ // only size that matters is the size of the Sink.
1998
+ uint64_t ASz = alignTo (AStoreSz, DL.getABITypeAlign (ATy).value ()),
1999
+ TypeByteSize = alignTo (BStoreSz, DL.getABITypeAlign (BTy).value ());
2000
+
2001
+ // We scale the strides by the alloc-type-sizes, so we can check that the
2002
+ // common distance is equal when ASz != BSz.
2003
+ int64_t StrideAScaled = *StrideAPtr * ASz;
2004
+ int64_t StrideBScaled = *StrideBPtr * TypeByteSize;
2005
+
1983
2006
// At least Src or Sink are loop invariant and the other is strided or
1984
2007
// invariant. We can generate a runtime check to disambiguate the accesses.
1985
- if (StrideAPtrInt == 0 || StrideBPtrInt == 0 )
2008
+ if (!StrideAScaled || !StrideBScaled )
1986
2009
return MemoryDepChecker::Dependence::Unknown;
1987
2010
1988
2011
// Both Src and Sink have a constant stride, check if they are in the same
1989
2012
// direction.
1990
- if ((StrideAPtrInt > 0 && StrideBPtrInt < 0 ) ||
1991
- (StrideAPtrInt < 0 && StrideBPtrInt > 0 )) {
2013
+ if (StrideAScaled > 0 != StrideBScaled > 0 ) {
1992
2014
LLVM_DEBUG (
1993
2015
dbgs () << " Pointer access with strides in different directions\n " );
1994
2016
return MemoryDepChecker::Dependence::Unknown;
1995
2017
}
1996
2018
1997
- uint64_t TypeByteSize = DL.getTypeAllocSize (ATy);
1998
- bool HasSameSize =
1999
- DL.getTypeStoreSizeInBits (ATy) == DL.getTypeStoreSizeInBits (BTy);
2000
- if (!HasSameSize)
2001
- TypeByteSize = 0 ;
2002
- return DepDistanceStrideAndSizeInfo (Dist, std::abs (StrideAPtrInt),
2003
- std::abs (StrideBPtrInt), TypeByteSize,
2019
+ StrideAScaled = std::abs (StrideAScaled);
2020
+ StrideBScaled = std::abs (StrideBScaled);
2021
+
2022
+ // MaxStride is the max of the scaled strides, as expected.
2023
+ uint64_t MaxStride = std::max (StrideAScaled, StrideBScaled);
2024
+
2025
+ // CommonStride is set if both scaled strides are equal.
2026
+ std::optional<uint64_t > CommonStride;
2027
+ if (StrideAScaled == StrideBScaled)
2028
+ CommonStride = StrideAScaled;
2029
+
2030
+ // TODO: Historically, we don't retry with runtime checks unless the unscaled
2031
+ // strides are the same, but this doesn't make sense. Fix this once the
2032
+ // condition for runtime checks in isDependent is fixed.
2033
+ bool ShouldRetryWithRuntimeCheck =
2034
+ std::abs (*StrideAPtr) == std::abs (*StrideBPtr);
2035
+
2036
+ return DepDistanceStrideAndSizeInfo (Dist, MaxStride, CommonStride,
2037
+ ShouldRetryWithRuntimeCheck, TypeByteSize,
2004
2038
AIsWrite, BIsWrite);
2005
2039
}
2006
2040
@@ -2016,47 +2050,40 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2016
2050
if (std::holds_alternative<Dependence::DepType>(Res))
2017
2051
return std::get<Dependence::DepType>(Res);
2018
2052
2019
- auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] =
2053
+ auto &[Dist, MaxStride, CommonStride, ShouldRetryWithRuntimeCheck,
2054
+ TypeByteSize, AIsWrite, BIsWrite] =
2020
2055
std::get<DepDistanceStrideAndSizeInfo>(Res);
2021
- bool HasSameSize = TypeByteSize > 0 ;
2022
2056
2023
- std::optional<uint64_t > CommonStride =
2024
- StrideA == StrideB ? std::make_optional (StrideA) : std::nullopt;
2025
2057
if (isa<SCEVCouldNotCompute>(Dist)) {
2026
- // TODO: Relax requirement that there is a common stride to retry with
2027
- // non-constant distance dependencies.
2028
- FoundNonConstantDistanceDependence |= CommonStride. has_value () ;
2058
+ // TODO: Relax requirement that there is a common unscaled stride to retry
2059
+ // with non-constant distance dependencies.
2060
+ FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck ;
2029
2061
LLVM_DEBUG (dbgs () << " LAA: Dependence because of uncomputable distance.\n " );
2030
2062
return Dependence::Unknown;
2031
2063
}
2032
2064
2033
2065
ScalarEvolution &SE = *PSE.getSE ();
2034
2066
auto &DL = InnermostLoop->getHeader ()->getDataLayout ();
2035
- uint64_t MaxStride = std::max (StrideA, StrideB);
2036
2067
2037
2068
// If the distance between the accesses is larger than their maximum absolute
2038
2069
// stride multiplied by the symbolic maximum backedge taken count (which is an
2039
2070
// upper bound of the number of iterations), the accesses are independent, i.e.
2040
2071
// they are far enough apart that accesses won't access the same location
2041
2072
// across all loop iterations.
2042
- if (HasSameSize && isSafeDependenceDistance (
2043
- DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount ()),
2044
- *Dist, MaxStride, TypeByteSize))
2073
+ if (isSafeDependenceDistance (
2074
+ DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount ()), *Dist, MaxStride))
2045
2075
return Dependence::NoDep;
2046
2076
2047
- const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
2077
+ const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
2048
2078
2049
2079
// Attempt to prove strided accesses independent.
2050
- if (C) {
2051
- const APInt &Val = C->getAPInt ();
2052
- int64_t Distance = Val.getSExtValue ();
2080
+ if (ConstDist) {
2081
+ int64_t Distance = std::abs (ConstDist->getAPInt ().getSExtValue ());
2053
2082
2054
2083
// If the distance between accesses and their strides are known constants,
2055
2084
// check whether the accesses interlace each other.
2056
- if (std::abs (Distance) > 0 && CommonStride && *CommonStride > 1 &&
2057
- HasSameSize &&
2058
- areStridedAccessesIndependent (std::abs (Distance), *CommonStride,
2059
- TypeByteSize)) {
2085
+ if (Distance > 0 && CommonStride && CommonStride > 1 &&
2086
+ areStridedAccessesIndependent (Distance, *CommonStride, TypeByteSize)) {
2060
2087
LLVM_DEBUG (dbgs () << " LAA: Strided accesses are independent\n " );
2061
2088
return Dependence::NoDep;
2062
2089
}
@@ -2069,15 +2096,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2069
2096
2070
2097
// Negative distances are not plausible dependencies.
2071
2098
if (SE.isKnownNonPositive (Dist)) {
2072
- if (SE.isKnownNonNegative (Dist)) {
2073
- if (HasSameSize) {
2074
- // Write to the same location with the same size.
2075
- return Dependence::Forward;
2076
- }
2077
- LLVM_DEBUG (dbgs () << " LAA: possibly zero dependence difference but "
2078
- " different type sizes\n " );
2079
- return Dependence::Unknown;
2080
- }
2099
+ if (SE.isKnownNonNegative (Dist))
2100
+ // Write to the same location.
2101
+ return Dependence::Forward;
2081
2102
2082
2103
bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
2083
2104
// Check if the first access writes to a location that is read in a later
@@ -2089,17 +2110,16 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2089
2110
// forward dependency will allow vectorization using any width.
2090
2111
2091
2112
if (IsTrueDataDependence && EnableForwardingConflictDetection) {
2092
- if (!C ) {
2113
+ if (!ConstDist ) {
2093
2114
// TODO: FoundNonConstantDistanceDependence is used as a necessary
2094
2115
// condition to consider retrying with runtime checks. Historically, we
2095
- // did not set it when strides were different but there is no inherent
2096
- // reason to.
2097
- FoundNonConstantDistanceDependence |= CommonStride. has_value () ;
2116
+ // did not set it when unscaled strides were different but there is no
2117
+ // inherent reason to.
2118
+ FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck ;
2098
2119
return Dependence::Unknown;
2099
2120
}
2100
- if (!HasSameSize ||
2101
- couldPreventStoreLoadForward (C->getAPInt ().abs ().getZExtValue (),
2102
- TypeByteSize)) {
2121
+ if (couldPreventStoreLoadForward (
2122
+ ConstDist->getAPInt ().abs ().getZExtValue (), TypeByteSize)) {
2103
2123
LLVM_DEBUG (
2104
2124
dbgs () << " LAA: Forward but may prevent st->ld forwarding\n " );
2105
2125
return Dependence::ForwardButPreventsForwarding;
@@ -2113,27 +2133,20 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2113
2133
int64_t MinDistance = SE.getSignedRangeMin (Dist).getSExtValue ();
2114
2134
// Below we only handle strictly positive distances.
2115
2135
if (MinDistance <= 0 ) {
2116
- FoundNonConstantDistanceDependence |= CommonStride. has_value () ;
2136
+ FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck ;
2117
2137
return Dependence::Unknown;
2118
2138
}
2119
2139
2120
- if (!isa<SCEVConstant>(Dist)) {
2140
+ if (!ConstDist)
2121
2141
// Previously this case would be treated as Unknown, possibly setting
2122
2142
// FoundNonConstantDistanceDependence to force re-trying with runtime
2123
2143
// checks. Until the TODO below is addressed, set it here to preserve
2124
2144
// original behavior w.r.t. re-trying with runtime checks.
2125
2145
// TODO: FoundNonConstantDistanceDependence is used as a necessary
2126
2146
// condition to consider retrying with runtime checks. Historically, we
2127
- // did not set it when strides were different but there is no inherent
2128
- // reason to.
2129
- FoundNonConstantDistanceDependence |= CommonStride.has_value ();
2130
- }
2131
-
2132
- if (!HasSameSize) {
2133
- LLVM_DEBUG (dbgs () << " LAA: ReadWrite-Write positive dependency with "
2134
- " different type sizes\n " );
2135
- return Dependence::Unknown;
2136
- }
2147
+ // did not set it when unscaled strides were different but there is no
2148
+ // inherent reason to.
2149
+ FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
2137
2150
2138
2151
if (!CommonStride)
2139
2152
return Dependence::Unknown;
@@ -2148,8 +2161,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2148
2161
2149
2162
// It's not vectorizable if the distance is smaller than the minimum distance
2150
2163
// needed for a vectorized/unrolled version. Vectorizing one iteration in
2151
- // front needs TypeByteSize * Stride . Vectorizing the last iteration needs
2152
- // TypeByteSize (No need to plus the last gap distance).
2164
+ // front needs CommonStride . Vectorizing the last iteration needs TypeByteSize
2165
+ // (No need to plus the last gap distance).
2153
2166
//
2154
2167
// E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
2155
2168
// foo(int *A) {
@@ -2176,10 +2189,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2176
2189
// We know that Dist is positive, but it may not be constant. Use the signed
2177
2190
// minimum for computations below, as this ensures we compute the closest
2178
2191
// possible dependence distance.
2179
- uint64_t MinDistanceNeeded =
2180
- TypeByteSize * *CommonStride * (MinNumIter - 1 ) + TypeByteSize;
2192
+ uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1 ) + TypeByteSize;
2181
2193
if (MinDistanceNeeded > static_cast <uint64_t >(MinDistance)) {
2182
- if (!isa<SCEVConstant>(Dist) ) {
2194
+ if (!ConstDist ) {
2183
2195
// For non-constant distances, we checked the lower bound of the
2184
2196
// dependence distance and the distance may be larger at runtime (and safe
2185
2197
// for vectorization). Classify it as Unknown, so we re-try with runtime
@@ -2234,12 +2246,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
2234
2246
2235
2247
// An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
2236
2248
// since there is a backwards dependency.
2237
- uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * * CommonStride) ;
2249
+ uint64_t MaxVF = MinDepDistBytes / * CommonStride;
2238
2250
LLVM_DEBUG (dbgs () << " LAA: Positive min distance " << MinDistance
2239
2251
<< " with max VF = " << MaxVF << ' \n ' );
2240
2252
2241
2253
uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8 ;
2242
- if (!isa<SCEVConstant>(Dist) && MaxVFInBits < MaxTargetVectorWidthInBits) {
2254
+ if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) {
2243
2255
// For non-constant distances, we checked the lower bound of the dependence
2244
2256
// distance and the distance may be larger at runtime (and safe for
2245
2257
// vectorization). Classify it as Unknown, so we re-try with runtime checks.
0 commit comments