Skip to content

Commit ab90d27

Browse files
authored
[llvm][ARM]Add widen global arrays pass (#107120)
- The pass optimizes memcpy calls by padding destinations and sources out to a full word, so that the backend generates full-word loads instead of single-byte (ldrb) and/or half-word (ldrh) loads. It only pads the destination when it is a stack-allocated, constant-size array, and the source when it is a constant array. The heuristic that decides whether to pad is very basic and could be improved to allow more cases to be padded. - The pass runs within GlobalOpt but is disabled by default on all targets except ARM.
1 parent 2954d1f commit ab90d27

18 files changed

+589
-0
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

+11
Original file line numberDiff line numberDiff line change
@@ -1819,6 +1819,10 @@ class TargetTransformInfo {
18191819
/// \return The maximum number of function arguments the target supports.
18201820
unsigned getMaxNumArgs() const;
18211821

1822+
/// \return The number of padding bytes to add to a global array of \p Size
1823+
/// bytes whose type is \p ArrayType. 0 (the default) means no padding.
1824+
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
1825+
18221826
/// @}
18231827

18241828
private:
@@ -2225,6 +2229,8 @@ class TargetTransformInfo::Concept {
22252229
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
22262230
virtual bool hasArmWideBranch(bool Thumb) const = 0;
22272231
virtual unsigned getMaxNumArgs() const = 0;
2232+
// Pure-virtual counterpart of
// TargetTransformInfo::getNumBytesToPadGlobalArray; overridden in Model.
virtual unsigned getNumBytesToPadGlobalArray(unsigned Size,
2233+
Type *ArrayType) const = 0;
22282234
};
22292235

22302236
template <typename T>
@@ -3026,6 +3032,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
30263032
unsigned getMaxNumArgs() const override {
30273033
return Impl.getMaxNumArgs();
30283034
}
3035+
3036+
// Forwards the query, with its arguments unchanged, to Impl.
unsigned getNumBytesToPadGlobalArray(unsigned Size,
3037+
Type *ArrayType) const override {
3038+
return Impl.getNumBytesToPadGlobalArray(Size, ArrayType);
3039+
}
30293040
};
30303041

30313042
template <typename T>

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

+4
Original file line numberDiff line numberDiff line change
@@ -1006,6 +1006,10 @@ class TargetTransformInfoImplBase {
10061006

10071007
unsigned getMaxNumArgs() const { return UINT_MAX; }
10081008

1009+
// Default answer: 0 padding bytes, i.e. global arrays are never padded.
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
1010+
return 0;
1011+
}
1012+
10091013
protected:
10101014
// Obtain the minimum required size to hold the value (without the sign)
10111015
// In case of a vector it returns the min required size for one element.

llvm/lib/Analysis/TargetTransformInfo.cpp

+6
Original file line numberDiff line numberDiff line change
@@ -1383,6 +1383,12 @@ bool TargetTransformInfo::isVectorShiftByScalarCheap(Type *Ty) const {
13831383
return TTIImpl->isVectorShiftByScalarCheap(Ty);
13841384
}
13851385

1386+
// Delegates the padding query to TTIImpl and returns its answer unchanged.
unsigned
1387+
TargetTransformInfo::getNumBytesToPadGlobalArray(unsigned Size,
1388+
Type *ArrayType) const {
1389+
return TTIImpl->getNumBytesToPadGlobalArray(Size, ArrayType);
1390+
}
1391+
13861392
TargetTransformInfo::Concept::~Concept() = default;
13871393

13881394
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

+33
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,10 @@ static cl::opt<bool>
5656
AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
5757
cl::desc("Enable the generation of WLS loops"));
5858

59+
// Hidden switch, on by default, that ARMTTIImpl::getNumBytesToPadGlobalArray
// checks before requesting any padding. Note that the option itself is
// spelled "widen-global-strings".
static cl::opt<bool> UseWidenGlobalArrays(
60+
"widen-global-strings", cl::Hidden, cl::init(true),
61+
cl::desc("Enable the widening of global strings to alignment boundaries"));
62+
5963
extern cl::opt<TailPredication::Mode> EnableTailPredication;
6064

6165
extern cl::opt<bool> EnableMaskedGatherScatters;
@@ -2805,3 +2809,32 @@ bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
28052809
}
28062810
return true;
28072811
}
2812+
2813+
unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
2814+
Type *ArrayType) const {
2815+
if (!UseWidenGlobalArrays) {
2816+
LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
2817+
return false;
2818+
}
2819+
2820+
// Don't modify none integer array types
2821+
if (!ArrayType || !ArrayType->isArrayTy() ||
2822+
!ArrayType->getArrayElementType()->isIntegerTy())
2823+
return 0;
2824+
2825+
// We pad to 4 byte boundaries
2826+
if (Size % 4 == 0)
2827+
return 0;
2828+
2829+
unsigned NumBytesToPad = 4 - (Size % 4);
2830+
unsigned NewSize = Size + NumBytesToPad;
2831+
2832+
// Max number of bytes that memcpy allows for lowering to load/stores before
2833+
// it uses library function (__aeabi_memcpy).
2834+
unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
2835+
2836+
if (NewSize > MaxMemIntrinsicSize)
2837+
return 0;
2838+
2839+
return NumBytesToPad;
2840+
}

llvm/lib/Target/ARM/ARMTargetTransformInfo.h

+3
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,9 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
337337

338338
bool isProfitableToSinkOperands(Instruction *I,
339339
SmallVectorImpl<Use *> &Ops) const;
340+
341+
/// ARM-specific implementation: returns the number of bytes needed to round
/// \p Size up to a multiple of four, or 0 when padding is declined.
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;
342+
340343
/// @}
341344
};
342345

llvm/lib/Transforms/IPO/GlobalOpt.cpp

+165
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ STATISTIC(NumInternalFunc, "Number of internal functions");
9292
STATISTIC(NumColdCC, "Number of functions marked coldcc");
9393
STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
9494
STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed");
95+
// Incremented by tryWidenGlobalArrayAndDests each time a source global is
// replaced by a padded copy.
STATISTIC(NumGlobalArraysPadded,
96+
"Number of global arrays padded to alignment boundary");
9597

9698
static cl::opt<bool>
9799
EnableColdCCStressTest("enable-coldcc-stress-test",
@@ -2029,6 +2031,165 @@ OptimizeFunctions(Module &M,
20292031
return Changed;
20302032
}
20312033

2034+
/// Returns true when \p CI is non-null and its called function is the memcpy
/// intrinsic; returns false in every other case.
static bool callInstIsMemcpy(CallInst *CI) {
  if (CI == nullptr)
    return false;

  Function *Callee = CI->getCalledFunction();
  return Callee && Callee->isIntrinsic() &&
         Callee->getIntrinsicID() == Intrinsic::memcpy;
}
2044+
2045+
/// Decides whether the destination of the memcpy call \p CI may be replaced by
/// a wider array. This requires that operand 3 (the volatile flag) is a
/// constant other than one, and that operand 0 is a static alloca whose
/// allocated type is an array.
static bool destArrayCanBeWidened(CallInst *CI) {
  auto *Dest = dyn_cast<AllocaInst>(CI->getArgOperand(0));
  auto *VolatileFlag = dyn_cast<ConstantInt>(CI->getArgOperand(3));

  // Both operands must have the expected kind before anything else is asked.
  if (!Dest || !VolatileFlag)
    return false;

  // Volatile copies are left alone.
  if (VolatileFlag->isOne())
    return false;

  return Dest->isStaticAlloca() && Dest->getAllocatedType()->isArrayTy();
}
2060+
2061+
/// Builds a padded replacement for the constant-data-array global \p OldVar.
///
/// The new initializer holds the first NumBytesToCopy + NumBytesToPad bytes of
/// the old raw data followed by \p NumBytesToPad zero bytes. The new global is
/// created in the module of \p F, copies the attributes of \p OldVar and takes
/// over its name.
///
/// \return the new global, or nullptr when \p OldVar has no initializer or its
/// initializer is not a ConstantDataArray.
static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F,
                                           unsigned NumBytesToPad,
                                           unsigned NumBytesToCopy) {
  if (!OldVar->hasInitializer())
    return nullptr;

  auto *OldInit = dyn_cast<ConstantDataArray>(OldVar->getInitializer());
  if (!OldInit)
    return nullptr;

  // Copy the raw initializer bytes and append the zero padding after them.
  StringRef RawBytes = OldInit->getRawDataValues();
  std::vector<uint8_t> PaddedBytes(RawBytes.begin(), RawBytes.end());
  PaddedBytes.resize(PaddedBytes.size() + NumBytesToPad, '\0');

  // The replacement covers exactly the copied bytes plus the padding.
  auto PaddedRef = ArrayRef(PaddedBytes.data(), NumBytesToCopy + NumBytesToPad);
  Constant *PaddedInit = ConstantDataArray::get(F->getContext(), PaddedRef);

  GlobalVariable *PaddedVar = new GlobalVariable(
      *(F->getParent()), PaddedInit->getType(), true, OldVar->getLinkage(),
      PaddedInit, PaddedInit->getName());

  // Carry over the remaining properties of the original (e.g. unnamed_addr)
  // and then its name.
  PaddedVar->copyAttributesFrom(OldVar);
  PaddedVar->takeName(OldVar);
  return PaddedVar;
}
2090+
2091+
/// Replaces the alloca that is the destination (operand 0) of the memcpy call
/// \p CI with a new array alloca large enough for the padded copy.
///
/// The new array keeps the old element type. Its element count is
/// (NumBytesToCopy + NumBytesToPad) divided by the element byte size of
/// \p SourceDataArray, rounded up. The old alloca's name and alignment are
/// transferred, all its uses are redirected, and it is erased. Nothing happens
/// when operand 0 is not an alloca.
static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad,
                           const unsigned NumBytesToCopy,
                           ConstantDataArray *SourceDataArray) {
  auto *OldAlloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
  if (!OldAlloca)
    return;

  unsigned SourceElemBytes = SourceDataArray->getElementByteSize();
  unsigned PaddedBytes = NumBytesToCopy + NumBytesToPad;
  unsigned NewNumElements = divideCeil(PaddedBytes, SourceElemBytes);

  // Insert the replacement right before the alloca it supersedes.
  Type *ElemTy = OldAlloca->getAllocatedType()->getArrayElementType();
  IRBuilder<> Builder(OldAlloca);
  AllocaInst *WideAlloca =
      Builder.CreateAlloca(ArrayType::get(ElemTy, NewNumElements));

  WideAlloca->takeName(OldAlloca);
  WideAlloca->setAlignment(OldAlloca->getAlign());
  OldAlloca->replaceAllUsesWith(WideAlloca);
  OldAlloca->eraseFromParent();
}
2110+
2111+
/// Replaces \p SourceVar with a copy padded by \p NumBytesToPad bytes and
/// widens the matching memcpy destinations.
///
/// A memcpy user of \p SourceVar is rewritten (destination alloca widened,
/// length raised to NumBytesToCopy + NumBytesToPad) only when it has the same
/// shape as the copy that triggered the transformation: a widenable
/// destination, \p SourceVar as its source, a constant length equal to
/// \p NumBytesToCopy, and a destination array with as many elements as
/// \p SourceDataArray. Every other user is left as is and simply reads from
/// the padded global afterwards.
///
/// \param F               function whose module and context host the new global.
/// \param SourceVar       constant global being widened.
/// \param NumBytesToPad   number of padding bytes to add.
/// \param NumBytesToCopy  length in bytes of the triggering memcpy.
/// \param BytesToCopyOp   length operand of the triggering memcpy; kept for
///                        interface compatibility.
/// \param SourceDataArray initializer of \p SourceVar.
/// \return true when \p SourceVar was replaced, false when no padded global
/// could be created.
static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
                                        const unsigned NumBytesToPad,
                                        const unsigned NumBytesToCopy,
                                        ConstantInt *BytesToCopyOp,
                                        ConstantDataArray *SourceDataArray) {
  // Each rewritten call uses the integer type of its own length operand.
  (void)BytesToCopyOp;

  auto *NewSourceGV =
      widenGlobalVariable(SourceVar, F, NumBytesToPad, NumBytesToCopy);
  if (!NewSourceGV)
    return false;

  const uint64_t SourceNumElements =
      SourceDataArray->getType()->getNumElements();

  // Update arguments of remaining uses that
  // are memcpys.
  for (auto *User : SourceVar->users()) {
    auto *CI = dyn_cast<CallInst>(User);
    if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI))
      continue;

    if (CI->getArgOperand(1) != SourceVar)
      continue;

    // Only touch copies of exactly NumBytesToCopy bytes. A copy with another
    // length was never checked against the padding heuristic.
    auto *Length = dyn_cast<ConstantInt>(CI->getArgOperand(2));
    if (!Length || Length->getZExtValue() != NumBytesToCopy)
      continue;

    // The destination must have as many elements as the source. Otherwise
    // widenDestArray could replace a larger alloca with a smaller one. A
    // destination already widened by an earlier iteration no longer matches
    // and keeps its original, still valid, copy length.
    auto *Alloca = cast<AllocaInst>(CI->getArgOperand(0));
    if (Alloca->getAllocatedType()->getArrayNumElements() != SourceNumElements)
      continue;

    widenDestArray(CI, NumBytesToPad, NumBytesToCopy, SourceDataArray);

    // Use this call's own length type so the operand type always matches the
    // intrinsic signature (e.g. i32 vs. i64 variants of memcpy).
    CI->setArgOperand(2, ConstantInt::get(Length->getType(),
                                          NumBytesToCopy + NumBytesToPad));
  }
  SourceVar->replaceAllUsesWith(NewSourceGV);

  NumGlobalArraysPadded++;
  return true;
}
2141+
2142+
/// Looks for a memcpy that copies the constant global array \p GV into a
/// stack array of the same element count and, when the target asks for
/// padding, widens both the global and the destination.
///
/// Only globals that have an initializer, are constant, have local linkage and
/// are unnamed_addr are considered, and the initializer must be a
/// ConstantDataArray. The first qualifying memcpy for which the target returns
/// a non-zero padding decides the outcome.
///
/// \param GV     candidate source global.
/// \param GetTTI callback returning the TargetTransformInfo for a function.
/// \return true when the global was widened.
static bool tryWidenGlobalArraysUsedByMemcpy(
    GlobalVariable *GV,
    function_ref<TargetTransformInfo &(Function &)> GetTTI) {

  if (!GV->hasInitializer() || !GV->isConstant() || !GV->hasLocalLinkage() ||
      !GV->hasGlobalUnnamedAddr())
    return false;

  // The initializer does not depend on the user being inspected, so check it
  // once up front.
  auto *SourceDataArray = dyn_cast<ConstantDataArray>(GV->getInitializer());
  if (!SourceDataArray)
    return false;

  for (auto *User : GV->users()) {
    CallInst *CI = dyn_cast<CallInst>(User);
    if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI))
      continue;

    // The function that contains the call. The target must be queried for
    // this function, not for the memcpy intrinsic declaration returned by
    // getCalledFunction().
    Function *F = CI->getFunction();

    auto *BytesToCopyOp = dyn_cast<ConstantInt>(CI->getArgOperand(2));
    if (!BytesToCopyOp)
      continue;

    unsigned NumBytesToCopy = BytesToCopyOp->getZExtValue();

    // destArrayCanBeWidened() has established that operand 0 is an alloca of
    // array type, so an asserting cast is appropriate here.
    auto *Alloca = cast<AllocaInst>(CI->getArgOperand(0));
    uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
    uint64_t SZSize = SourceDataArray->getType()->getNumElements();
    unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
    // Calculate the number of elements to copy while avoiding floored
    // division of integers returning wrong values, i.e. copying one byte
    // from an array of i16 would yield 0 elements to copy as opposed to 1.
    unsigned NumElementsToCopy = divideCeil(NumBytesToCopy, ElementByteWidth);

    // For safety purposes let's add a constraint and only pad when
    // NumElementsToCopy == destination array size ==
    // source which is a constant
    if (NumElementsToCopy != DZSize || DZSize != SZSize)
      continue;

    unsigned NumBytesToPad = GetTTI(*F).getNumBytesToPadGlobalArray(
        NumBytesToCopy, SourceDataArray->getType());
    if (NumBytesToPad) {
      return tryWidenGlobalArrayAndDests(F, GV, NumBytesToPad, NumBytesToCopy,
                                         BytesToCopyOp, SourceDataArray);
    }
  }
  return false;
}
2192+
20322193
static bool
20332194
OptimizeGlobalVars(Module &M,
20342195
function_ref<TargetTransformInfo &(Function &)> GetTTI,
@@ -2058,6 +2219,10 @@ OptimizeGlobalVars(Module &M,
20582219
continue;
20592220
}
20602221

2222+
// For global variable arrays called in a memcpy
2223+
// we try to pad to nearest valid alignment boundary
2224+
Changed |= tryWidenGlobalArraysUsedByMemcpy(&GV, GetTTI);
2225+
20612226
Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree);
20622227
}
20632228
return Changed;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
3+
4+
@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
5+
6+
define void @memcpy_struct() {
7+
; CHECK-LABEL: define void @memcpy_struct() local_unnamed_addr {
8+
; CHECK-NEXT: [[ENTRY:.*:]]
9+
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca { i8, i8, i8 }, align 1
10+
; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
11+
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
12+
; CHECK-NEXT: ret void
13+
;
14+
entry:
15+
%something = alloca {i8, i8, i8}, align 1
16+
%call1 = call i32 @bar(ptr nonnull %something)
17+
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
18+
ret void
19+
}
20+
21+
22+
@.i8_multi = private unnamed_addr constant [2 x [3 x i8]] [[3 x i8] [i8 1, i8 2, i8 3], [3 x i8] [i8 4, i8 5, i8 6]] , align 1
23+
24+
define void @memcpy_array_multidimensional() {
25+
; CHECK-LABEL: define void @memcpy_array_multidimensional() local_unnamed_addr {
26+
; CHECK-NEXT: [[ENTRY:.*:]]
27+
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [2 x [3 x i8]], align 1
28+
; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
29+
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false)
30+
; CHECK-NEXT: ret void
31+
;
32+
entry:
33+
%something = alloca [2 x [3 x i8]], align 1
34+
%call1 = call i32 @bar(ptr nonnull %something)
35+
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false)
36+
ret void
37+
}
38+
39+
declare i32 @bar(...)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
3+
4+
; CHECK: [3 x i8]
5+
@other = private unnamed_addr global [3 x i8] [i8 1, i8 2, i8 3] , align 1
6+
; CHECK: [4 x i8]
7+
@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1
8+
9+
define void @memcpy_multiple() {
10+
; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr {
11+
; CHECK-NEXT: [[ENTRY:.*:]]
12+
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [4 x i8], align 1
13+
; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
14+
; CHECK-NEXT: [[CALL3:%.*]] = call i32 @bar(ptr nonnull @other)
15+
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
16+
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
17+
; CHECK-NEXT: ret void
18+
;
19+
entry:
20+
%something = alloca [3 x i8], align 1
21+
%call1 = call i32 @bar(ptr nonnull %something)
22+
%call2 = call i32 @bar(ptr nonnull @other)
23+
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
24+
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
25+
ret void
26+
}
27+
28+
declare i32 @bar(...)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s
3+
4+
@.i16 = private unnamed_addr constant [5 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5] , align 1
5+
6+
define void @memcpy_i16_array() {
7+
; CHECK-LABEL: define void @memcpy_i16_array() local_unnamed_addr {
8+
; CHECK-NEXT: [[ENTRY:.*:]]
9+
; CHECK-NEXT: [[SOMETHING1:%.*]] = alloca [6 x i16], align 1
10+
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 12, i1 false)
11+
; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]])
12+
; CHECK-NEXT: ret void
13+
;
14+
entry:
15+
%something = alloca [5 x i16], align 1
16+
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 10, i1 false)
17+
%call2 = call i32 @bar(ptr nonnull %something)
18+
ret void
19+
}
20+
21+
22+
declare i32 @bar(...)

0 commit comments

Comments
 (0)