Skip to content

Commit b3bca66

Browse files
committed
[llvm][ARM]Add ARM widen strings pass
- Pass optimizes memcpys by padding out destinations and sources to a full word so that the ARM backend generates full-word loads instead of loading a single byte (ldrb) and/or a half word (ldrh). It only pads the destination when it is a stack-allocated, constant-size array, and the source when it is a constant string. The heuristic that decides whether to pad is very basic and could be improved to allow more cases to be padded. - Pass works at the midend level instead of being added in the overridden method ARMPassConfig::addIRPasses(). This is because the addIRPasses passes are run right at the end, just before the LLVM midend IR is lowered into the SelectionDAG IR. This pass works better in the midend because other optimizations, such as dead code elimination, can run afterwards and delete the old unreferenced global string that has been replaced with the padded version. The other reason it is better in the midend is that it makes writing the tests easier, as opt is able to run midend-level passes. Nonetheless, the pass checks whether it is being run on code targeted with an ARM triple; if not, it doesn't run. Change-Id: I1c6371f0962e7ad3c166602b800d041ac1cc7b04
1 parent 2a9f93b commit b3bca66

13 files changed

+514
-0
lines changed
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
//===- ARMWidenStrings.h --------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file provides the interface for the ArmWidenStrings pass
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#ifndef LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
14+
#define LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H
15+
16+
#include "llvm/IR/PassManager.h"
17+
18+
namespace llvm {
19+
20+
class Module;
21+
22+
/// New-pass-manager function pass that widens constant-string memcpys to a
/// whole number of 4-byte words (see ARMWidenStrings.cpp for the conditions).
struct ARMWidenStringsPass : PassInfoMixin<ARMWidenStringsPass> {
  /// Runs the widening on \p F and reports which analyses remain valid.
  /// The analysis manager \p AM is accepted for interface conformance.
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
25+
26+
} // end namespace llvm
27+
28+
#endif // LLVM_TRANSFORMS_SCALAR_ARMWIDENSTRINGS_H

llvm/lib/Passes/PassBuilder.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@
207207
#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
208208
#include "llvm/Transforms/ObjCARC.h"
209209
#include "llvm/Transforms/Scalar/ADCE.h"
210+
#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
210211
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
211212
#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
212213
#include "llvm/Transforms/Scalar/BDCE.h"

llvm/lib/Passes/PassBuilderPipelines.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@
8080
#include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h"
8181
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
8282
#include "llvm/Transforms/Scalar/ADCE.h"
83+
#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
8384
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
8485
#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
8586
#include "llvm/Transforms/Scalar/BDCE.h"
@@ -1513,6 +1514,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
15131514
// from the TargetLibraryInfo.
15141515
OptimizePM.addPass(InjectTLIMappings());
15151516

1517+
bool IsARM = TM && TM->getTargetTriple().isARM();
1518+
// Optimizes memcpy by padding arrays to exploit alignment
1519+
if (IsARM && Level.getSizeLevel() == 0 && Level.getSpeedupLevel() > 1)
1520+
OptimizePM.addPass(ARMWidenStringsPass());
1521+
15161522
addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false);
15171523

15181524
// LoopSink pass sinks instructions hoisted by LICM, which serves as a

llvm/lib/Passes/PassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,7 @@ FUNCTION_PASS("view-dom-only", DomOnlyViewer())
489489
FUNCTION_PASS("view-post-dom", PostDomViewer())
490490
FUNCTION_PASS("view-post-dom-only", PostDomOnlyViewer())
491491
FUNCTION_PASS("wasm-eh-prepare", WasmEHPreparePass())
492+
FUNCTION_PASS("arm-widen-strings", ARMWidenStringsPass())
492493
#undef FUNCTION_PASS
493494

494495
#ifndef FUNCTION_PASS_WITH_PARAMS
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
// ARMWidenStrings.cpp - Widen strings to word boundaries to speed up
2+
// programs that use simple strcpy's with constant strings as source
3+
// and stack allocated array for destination.
4+
5+
#define DEBUG_TYPE "arm-widen-strings"
6+
7+
#include "llvm/Transforms/Scalar/ARMWidenStrings.h"
8+
#include "llvm/Analysis/LoopInfo.h"
9+
#include "llvm/IR/BasicBlock.h"
10+
#include "llvm/IR/Constants.h"
11+
#include "llvm/IR/Function.h"
12+
#include "llvm/IR/GlobalVariable.h"
13+
#include "llvm/IR/IRBuilder.h"
14+
#include "llvm/IR/Instructions.h"
15+
#include "llvm/IR/Intrinsics.h"
16+
#include "llvm/IR/Module.h"
17+
#include "llvm/IR/Operator.h"
18+
#include "llvm/IR/ValueSymbolTable.h"
19+
#include "llvm/Pass.h"
20+
#include "llvm/Support/CommandLine.h"
21+
#include "llvm/Support/Debug.h"
22+
#include "llvm/Support/raw_ostream.h"
23+
#include "llvm/TargetParser/Triple.h"
24+
#include "llvm/Transforms/Scalar.h"
25+
26+
using namespace llvm;
27+
28+
// Command-line switch that turns ARMWidenStrings::run into a no-op. It is
// file-local (static) so the symbol does not leak into the global namespace,
// and carries an explicit default and description for -help-hidden output.
static cl::opt<bool> DisableARMWidenStrings(
    "disable-arm-widen-strings", cl::Hidden, cl::init(false),
    cl::desc("Disable the ARM widen strings pass"));
29+
30+
namespace {
31+
32+
class ARMWidenStrings {
33+
public:
34+
/*
35+
Max number of bytes that memcpy allows for lowering to load/stores before it
36+
uses library function (__aeabi_memcpy). This is the same value returned by
37+
ARMSubtarget::getMaxInlineSizeThreshold which I would have called in place of
38+
the constant int but can't get access to the subtarget info class from the
39+
midend.
40+
*/
41+
const unsigned int MemcpyInliningLimit = 64;
42+
43+
bool run(Function &F);
44+
};
45+
46+
/// Returns true when \p t is a non-null array type whose element type is an
/// 8-bit integer, i.e. what a C char array lowers to.
static bool IsCharArray(Type *t) {
  const unsigned int CharBitWidth = 8;

  // Reject null and non-array types before touching the element type.
  if (!t || !t->isArrayTy())
    return false;

  Type *ElemTy = t->getArrayElementType();
  if (!ElemTy->isIntegerTy())
    return false;

  return ElemTy->getIntegerBitWidth() == CharBitWidth;
}
51+
52+
/// Pads eligible llvm.memcpy calls in \p F up to the next multiple of four
/// bytes.
///
/// A memcpy is widened only when all of the following hold:
///  * the destination operand is a static alloca of an i8 array,
///  * the source operand is a constant, local-linkage, unnamed_addr global
///    whose initializer is an i8 ConstantDataArray,
///  * the length is a constant that is not already a multiple of four and is
///    equal to both the destination and the source array sizes,
///  * the call is not volatile, and
///  * the padded length does not exceed MemcpyInliningLimit.
///
/// For such a call a larger alloca and a zero-padded copy of the global are
/// created, all uses of the old alloca are redirected to the new one, and the
/// memcpy source and length operands are updated. The original alloca and
/// global are left in place for later cleanup passes.
///
/// \returns true if at least one memcpy was widened, false otherwise.
bool ARMWidenStrings::run(Function &F) {
  if (DisableARMWidenStrings)
    return false;

  // Bail out on anything that is not an ARM-family target. The original
  // condition was inverted and skipped the pass exactly when the triple *was*
  // ARM. Thumb triples are accepted as well: the tests for this pass use a
  // thumbv6m triple.
  const Triple TT(F.getParent()->getTargetTriple());
  if (!TT.isARM() && !TT.isThumb()) {
    LLVM_DEBUG(
        dbgs() << "Pass only runs on ARM as hasn't been benchmarked on other "
                  "targets\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "Running ARMWidenStrings on module " << F.getName()
                    << "\n");

  // Tracks whether any IR was modified so the caller only invalidates
  // analyses when something really changed.
  bool Changed = false;

  // Inserting the replacement alloca below does not invalidate these
  // iterators: instruction lists are linked lists and nothing is erased.
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      auto *CI = dyn_cast<CallInst>(&I);
      if (!CI)
        continue;

      // find out if the current call instruction is a call to llvm memcpy
      // intrinsics
      Function *CallMemcpy = CI->getCalledFunction();
      if (CallMemcpy == nullptr || !CallMemcpy->isIntrinsic() ||
          CallMemcpy->getIntrinsicID() != Intrinsic::memcpy)
        continue;

      LLVM_DEBUG(dbgs() << "Found call to strcpy/memcpy:\n" << *CI << "\n");

      auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
      auto *SourceVar = dyn_cast<GlobalVariable>(CI->getArgOperand(1));
      auto *BytesToCopy = dyn_cast<ConstantInt>(CI->getArgOperand(2));
      auto *IsVolatile = dyn_cast<ConstantInt>(CI->getArgOperand(3));

      if (!BytesToCopy) {
        LLVM_DEBUG(dbgs() << "Number of bytes to copy is null\n");
        continue;
      }

      const uint64_t NumBytesToCopy = BytesToCopy->getZExtValue();

      if (!Alloca) {
        LLVM_DEBUG(dbgs() << "Destination isn't a Alloca\n");
        continue;
      }

      if (!SourceVar) {
        LLVM_DEBUG(dbgs() << "Source isn't a global constant variable\n");
        continue;
      }

      if (!IsVolatile || IsVolatile->isOne()) {
        LLVM_DEBUG(
            dbgs() << "Not widening strings for this memcpy because it's "
                      "a volatile operations\n");
        continue;
      }

      if (NumBytesToCopy % 4 == 0) {
        LLVM_DEBUG(dbgs() << "Bytes to copy in strcpy/memcpy is already word "
                             "aligned so nothing to do here.\n");
        continue;
      }

      if (!SourceVar->hasInitializer() || !SourceVar->isConstant() ||
          !SourceVar->hasLocalLinkage() || !SourceVar->hasGlobalUnnamedAddr()) {
        LLVM_DEBUG(dbgs() << "Source is not constant global, thus it's "
                             "mutable therefore it's not safe to pad\n");
        continue;
      }

      auto *SourceDataArray =
          dyn_cast<ConstantDataArray>(SourceVar->getInitializer());
      if (!SourceDataArray || !IsCharArray(SourceDataArray->getType())) {
        LLVM_DEBUG(dbgs() << "Source isn't a constant data array\n");
        continue;
      }

      if (!Alloca->isStaticAlloca()) {
        LLVM_DEBUG(dbgs() << "Destination allocation isn't a static "
                             "constant which is locally allocated in this "
                             "function, so skipping.\n");
        continue;
      }

      // Make sure destination is definitely a char array.
      if (!IsCharArray(Alloca->getAllocatedType())) {
        LLVM_DEBUG(dbgs() << "Destination doesn't look like a constant char (8 "
                             "bits) array\n");
        continue;
      }
      LLVM_DEBUG(dbgs() << "With Alloca: " << *Alloca << "\n");

      const uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
      const uint64_t SZSize = SourceDataArray->getType()->getNumElements();

      // For safety purposes add a constraint and only pad when
      // num bytes to copy == destination array size == source string size,
      // all of which are constants.
      LLVM_DEBUG(dbgs() << "Number of bytes to copy is: " << NumBytesToCopy
                        << "\n");
      LLVM_DEBUG(dbgs() << "Size of destination array is: " << DZSize << "\n");
      LLVM_DEBUG(dbgs() << "Size of source array is: " << SZSize << "\n");
      if (NumBytesToCopy != DZSize || DZSize != SZSize) {
        LLVM_DEBUG(dbgs() << "Size of number of bytes to copy, destination "
                             "array and source string don't match, so "
                             "skipping\n");
        continue;
      }

      LLVM_DEBUG(dbgs() << "Going to widen.\n");
      const uint64_t NumBytesToPad = 4 - (NumBytesToCopy % 4);
      LLVM_DEBUG(dbgs() << "Number of bytes to pad by is " << NumBytesToPad
                        << "\n");

      // Keep the arithmetic in 64 bits: the original narrowed the sum to
      // unsigned int, so a very large length could wrap below the limit while
      // the alloca was still sized with the un-truncated value. Testing
      // NumBytesToCopy first also guards against wrap-around of the sum.
      const uint64_t TotalBytes = NumBytesToCopy + NumBytesToPad;
      if (NumBytesToCopy > MemcpyInliningLimit ||
          TotalBytes > MemcpyInliningLimit) {
        LLVM_DEBUG(
            dbgs() << "Not going to pad because total number of bytes is "
                   << TotalBytes
                   << " which be greater than the inlining "
                      "limit for memcpy which is "
                   << MemcpyInliningLimit << "\n");
        continue;
      }

      // update destination char array to be word aligned (memcpy(X,...,...))
      IRBuilder<> BuildAlloca(Alloca);
      AllocaInst *NewAlloca = cast<AllocaInst>(BuildAlloca.CreateAlloca(
          ArrayType::get(Alloca->getAllocatedType()->getArrayElementType(),
                         TotalBytes)));
      NewAlloca->takeName(Alloca);
      NewAlloca->setAlignment(Alloca->getAlign());
      Alloca->replaceAllUsesWith(NewAlloca);

      LLVM_DEBUG(dbgs() << "Updating users of destination stack object to use "
                        << "new size\n");

      // update source to be word aligned (memcpy(...,X,...))
      // create replacement string with padded null bytes.
      StringRef Data = SourceDataArray->getRawDataValues();
      std::vector<uint8_t> StrData(Data.begin(), Data.end());
      StrData.resize(StrData.size() + NumBytesToPad, 0);
      ArrayRef<uint8_t> PaddedBytes(StrData);

      // create new padded version of global variable string.
      Constant *SourceReplace =
          ConstantDataArray::get(F.getContext(), PaddedBytes);
      GlobalVariable *NewGV = new GlobalVariable(
          *F.getParent(), SourceReplace->getType(), /*isConstant=*/true,
          SourceVar->getLinkage(), SourceReplace, /*Name=*/"");

      // copy any other attributes from original global variable string
      // e.g. unnamed_addr
      NewGV->copyAttributesFrom(SourceVar);
      NewGV->takeName(SourceVar);

      // replace intrinsic source.
      CI->setArgOperand(1, NewGV);

      // Update number of bytes to copy (memcpy(...,...,X))
      CI->setArgOperand(2,
                        ConstantInt::get(BytesToCopy->getType(), TotalBytes));
      LLVM_DEBUG(dbgs() << "Padded dest/source and increased number of bytes:\n"
                        << *CI << "\n"
                        << *NewAlloca << "\n");
      Changed = true;
    }
  }
  return Changed;
}
223+
224+
} // end of anonymous namespace
225+
226+
/// New-pass-manager entry point: delegates to ARMWidenStrings::run and
/// translates its "changed" flag into a PreservedAnalyses set. \p AM is not
/// consulted.
PreservedAnalyses ARMWidenStringsPass::run(Function &F,
                                           FunctionAnalysisManager &AM) {
  const bool Modified = ARMWidenStrings().run(F);

  if (Modified) {
    // Report the CFG analyses set and LoopAnalysis as still valid; everything
    // else is treated as invalidated.
    PreservedAnalyses PA;
    PA.preserveSet(CFGAnalyses::ID());
    PA.preserve<LoopAnalysis>();
    return PA;
  }

  return PreservedAnalyses::all();
}

llvm/lib/Transforms/Scalar/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ add_llvm_component_library(LLVMScalarOpts
22
ADCE.cpp
33
AlignmentFromAssumptions.cpp
44
AnnotationRemarks.cpp
5+
ARMWidenStrings.cpp
56
BDCE.cpp
67
CallSiteSplitting.cpp
78
ConstantHoisting.cpp
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
; RUN: opt < %s -mtriple=arm-arm-none-eabi -O2 -S | FileCheck %s
2+
; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O2>" -S | FileCheck %s
3+
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
4+
target triple = "thumbv6m-arm-none-eabi"
5+
6+
; CHECK: [17 x i8]
7+
@.str = private unnamed_addr constant [17 x i8] c"aaaaaaaaaaaaaaaa\00", align 1
8+
9+
; Function Attrs: nounwind
10+
; Copies the 17-byte constant @.str into the stack array %something and then
; passes that array to @bar.
; NOTE(review): %something is [20 x i8] while the copy length is 17, and
; ARMWidenStrings::run skips a memcpy unless the copy length, destination size
; and source size are all equal - so this input may not exercise the widening
; path at all; confirm the intended sizes.
define hidden void @foo() local_unnamed_addr #0 {
11+
entry:
12+
%something = alloca [20 x i8], align 1
13+
call void @llvm.lifetime.start(i64 20, ptr nonnull %something) #3
14+
call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 17, i1 false)
15+
%call2 = call i32 @bar(ptr nonnull %something) #3
16+
call void @llvm.lifetime.end(i64 20, ptr nonnull %something) #3
17+
ret void
18+
}
19+
20+
; Function Attrs: argmemonly nounwind
21+
declare void @llvm.lifetime.start(i64, ptr nocapture) #1
22+
23+
declare i32 @bar(...) local_unnamed_addr #2
24+
25+
; Function Attrs: argmemonly nounwind
26+
declare void @llvm.lifetime.end(i64, ptr nocapture) #1
27+
28+
; Function Attrs: argmemonly nounwind
29+
declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
; RUN: opt < %s -mtriple=arm-arm-none-eabi -O3 -S | FileCheck %s
2+
; RUN: opt < %s -mtriple=arm-arm-none-eabi -passes="default<O3>" -S | FileCheck %s
3+
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
4+
target triple = "thumbv6m-arm-none-eabi"
5+
6+
; CHECK: [65 x i8]
7+
; CHECK-NOT: [68 x i8]
8+
@.str = private unnamed_addr constant [65 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzz\00", align 1
9+
10+
; Function Attrs: nounwind
11+
; Copies the 65-byte constant @.str into a [65 x i8] stack array and passes it
; to @bar. Padding 65 bytes to the next multiple of four would give 68 bytes,
; which is above the pass's MemcpyInliningLimit of 64, so the file's
; CHECK-NOT line expects that no [68 x i8] type is produced.
define hidden void @foo() local_unnamed_addr #0 {
12+
entry:
13+
%something = alloca [65 x i8], align 1
14+
call void @llvm.lifetime.start(i64 65, ptr nonnull %something) #3
15+
call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 65, i1 false)
16+
%call2 = call i32 @bar(ptr nonnull %something) #3
17+
call void @llvm.lifetime.end(i64 65, ptr nonnull %something) #3
18+
ret void
19+
}
20+
21+
; Function Attrs: argmemonly nounwind
22+
declare void @llvm.lifetime.start(i64, ptr nocapture) #1
23+
24+
declare i32 @bar(...) local_unnamed_addr #2
25+
26+
; Function Attrs: argmemonly nounwind
27+
declare void @llvm.lifetime.end(i64, ptr nocapture) #1
28+
29+
; Function Attrs: argmemonly nounwind
30+
declare void @llvm.memcpy.p0i8.p0i8.i32(ptr nocapture writeonly, ptr nocapture readonly, i32, i1) #1

0 commit comments

Comments
 (0)