
Commit 437765e

LICM: split loads that are wider than the loop-stored value.
For combined load-store hoisting, split loads that contain the loop-stored
value into a single load from the same address as the loop-stores, and a set
of loads disjoint from the loop-stores. The single load will be hoisted while
sinking the stores to the same address. The disjoint loads will be hoisted
normally in a subsequent iteration on the same loop.

loop:
  load %outer
  store %inner1
exit:

Will be split into:

loop:
  load %inner1
  load %inner2
  store %inner1
exit:

Then, combined load/store hoisting will produce:

load %inner1
loop:
  load %inner2
exit:
store %inner1
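At the source level, the pattern this targets looks roughly like the following
Swift sketch (illustrative only; the type and function names are hypothetical,
and whether the transformation actually fires depends on the rest of the pass
pipeline):

  struct Inner { var x: Int; var y: Int }
  struct Outer { var a: Inner; var b: Int }

  var global = Outer(a: Inner(x: 0, y: 0), b: 0)

  @inline(never) func consume(_ value: Outer) {}

  func hot(_ n: Int) {
    for i in 0 ..< n {
      global.a.x = i      // narrow store to the loop-stored access path
      consume(global)     // forces a wide load of the whole aggregate
    }
  }

The wide load of global is split into a load of global.a.x, which is hoisted
while the store is sunk, plus loads of the disjoint fields, which are hoisted
normally.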
1 parent d86099f commit 437765e

2 files changed: +603 −14 lines

lib/SILOptimizer/LoopTransforms/LICM.cpp

Lines changed: 207 additions & 14 deletions
@@ -505,6 +505,7 @@ class LoopTreeOptimization {
   llvm::DenseMap<SILLoop *, std::unique_ptr<LoopNestSummary>>
       LoopNestSummaryMap;
   SmallVector<SILLoop *, 8> BotUpWorkList;
+  InstSet toDelete;
   SILLoopInfo *LoopInfo;
   AliasAnalysis *AA;
   SideEffectAnalysis *SEA;
@@ -525,6 +526,8 @@ class LoopTreeOptimization {
   InstVector SinkDown;
 
   /// Load and store instructions that we may be able to move out of the loop.
+  /// All loads and stores within a block must be in instruction order to
+  /// simplify replacement of values after SSA update.
   InstVector LoadsAndStores;
 
   /// All access paths of the \p LoadsAndStores instructions.
@@ -561,12 +564,22 @@ class LoopTreeOptimization {
   /// Collect a set of instructions that can be hoisted
   void analyzeCurrentLoop(std::unique_ptr<LoopNestSummary> &CurrSummary);
 
+  SingleValueInstruction *splitLoad(SILValue splitAddress,
+                                    ArrayRef<AccessPath::Index> remainingPath,
+                                    SILBuilder &builder,
+                                    SmallVectorImpl<LoadInst *> &Loads,
+                                    unsigned ldStIdx);
+
+  /// Given an \p accessPath that is only loaded and stored, split loads that
+  /// are wider than \p accessPath.
+  bool splitLoads(SmallVectorImpl<LoadInst *> &Loads, AccessPath accessPath,
+                  SILValue storeAddr);
+
   /// Optimize the current loop nest.
   bool optimizeLoop(std::unique_ptr<LoopNestSummary> &CurrSummary);
 
-  /// Move all loads and stores from/to \p access out of the \p loop.
-  void hoistLoadsAndStores(AccessPath accessPath, SILLoop *loop,
-                           InstVector &toDelete);
+  /// Move all loads and stores from/to \p accessPath out of the \p loop.
+  void hoistLoadsAndStores(AccessPath accessPath, SILLoop *loop);
 
   /// Move all loads and stores from all addresses in LoadAndStoreAddrs out of
   /// the \p loop.
@@ -799,6 +812,8 @@ static bool analyzeBeginAccess(BeginAccessInst *BI,
 // We *need* to discover all SideEffectInsts -
 // even if the loop is otherwise skipped!
 // This is because outer loops will depend on the inner loop's writes.
+//
+// This may split some loads into smaller loads.
 void LoopTreeOptimization::analyzeCurrentLoop(
     std::unique_ptr<LoopNestSummary> &CurrSummary) {
   InstSet &sideEffects = CurrSummary->SideEffectInsts;
@@ -915,15 +930,22 @@ void LoopTreeOptimization::analyzeCurrentLoop(
 
   // Collect memory locations for which we can move all loads and stores out
   // of the loop.
+  //
+  // Note: The Loads set and LoadsAndStores set may mutate during this loop.
   for (StoreInst *SI : Stores) {
     // Use AccessPathWithBase to recover a base address that can be used for
     // newly inserted memory operations. If we instead teach hoistLoadsAndStores
     // how to rematerialize global_addr, then we don't need this base.
     auto access = AccessPathWithBase::compute(SI->getDest());
-    if (access.accessPath.isValid() && isLoopInvariant(access.base, Loop)) {
+    auto accessPath = access.accessPath;
+    if (accessPath.isValid() && isLoopInvariant(access.base, Loop)) {
       if (isOnlyLoadedAndStored(AA, sideEffects, Loads, Stores, SI->getDest(),
-                                access.accessPath)) {
-        LoadAndStoreAddrs.insert(accessPath);
+                                accessPath)) {
+        if (!LoadAndStoreAddrs.count(accessPath)) {
+          if (splitLoads(Loads, accessPath, SI->getDest())) {
+            LoadAndStoreAddrs.insert(accessPath);
+          }
+        }
       }
     }
   }
@@ -950,6 +972,172 @@ void LoopTreeOptimization::analyzeCurrentLoop(
   }
 }
 
+// Recursively determine whether the innerAddress is a direct tuple or struct
+// projection chain from outerPath. Populate \p reversePathIndices with the path
+// difference.
+static bool
+computeInnerAccessPath(AccessPath::PathNode outerPath,
+                       AccessPath::PathNode innerPath, SILValue innerAddress,
+                       SmallVectorImpl<AccessPath::Index> &reversePathIndices) {
+  if (outerPath == innerPath)
+    return true;
+
+  if (!isa<StructElementAddrInst>(innerAddress)
+      && !isa<TupleElementAddrInst>(innerAddress)) {
+    return false;
+  }
+  assert(ProjectionIndex(innerAddress).Index
+         == innerPath.getIndex().getSubObjectIndex());
+
+  reversePathIndices.push_back(innerPath.getIndex());
+  SILValue srcAddr = cast<SingleValueInstruction>(innerAddress)->getOperand(0);
+  if (!computeInnerAccessPath(outerPath, innerPath.getParent(), srcAddr,
+                              reversePathIndices)) {
+    return false;
+  }
+  return true;
+}
+
+/// Split a load from \p outerAddress recursively following remainingPath.
+///
+/// Creates a load with identical \p accessPath and a set of
+/// non-overlapping loads. Add the new non-overlapping loads to HoistUp.
+///
+/// \p ldstIdx is the index into LoadsAndStores of the original outer load.
+///
+/// Return the aggregate produced by merging the loads.
+SingleValueInstruction *LoopTreeOptimization::splitLoad(
+    SILValue splitAddress, ArrayRef<AccessPath::Index> remainingPath,
+    SILBuilder &builder, SmallVectorImpl<LoadInst *> &Loads, unsigned ldstIdx) {
+  auto loc = LoadsAndStores[ldstIdx]->getLoc();
+  // Recurse until we have a load that matches accessPath.
+  if (remainingPath.empty()) {
+    // Create a load that matches the stored access path.
+    LoadInst *load = builder.createLoad(loc, splitAddress,
+                                        LoadOwnershipQualifier::Unqualified);
+    Loads.push_back(load);
+    // Replace the outer load in the list of loads and stores to hoist and
+    // sink. LoadsAndStores must remain in instruction order.
+    LoadsAndStores[ldstIdx] = load;
+    LLVM_DEBUG(llvm::dbgs() << "Created load from stored path: " << *load);
+    return load;
+  }
+  auto recordDisjointLoad = [&](LoadInst *newLoad) {
+    Loads.push_back(newLoad);
+    LoadsAndStores.insert(LoadsAndStores.begin() + ldstIdx + 1, newLoad);
+  };
+  auto subIndex = remainingPath.back().getSubObjectIndex();
+  SILType loadTy = splitAddress->getType();
+  if (CanTupleType tupleTy = loadTy.getAs<TupleType>()) {
+    SmallVector<SILValue, 4> elements;
+    for (int tupleIdx : range(tupleTy->getNumElements())) {
+      auto *projection = builder.createTupleElementAddr(
+          loc, splitAddress, tupleIdx, loadTy.getTupleElementType(tupleIdx));
+      SILValue elementVal;
+      if (tupleIdx == subIndex) {
+        elementVal = splitLoad(projection, remainingPath.drop_back(), builder,
+                               Loads, ldstIdx);
+      } else {
+        elementVal = builder.createLoad(loc, projection,
+                                        LoadOwnershipQualifier::Unqualified);
+        recordDisjointLoad(cast<LoadInst>(elementVal));
+      }
+      elements.push_back(elementVal);
+    }
+    return builder.createTuple(loc, elements);
+  }
+  auto structTy = loadTy.getStructOrBoundGenericStruct();
+  assert(structTy && "tuple and struct elements are checked earlier");
+  auto &module = builder.getModule();
+  auto expansionContext = builder.getFunction().getTypeExpansionContext();
+
+  SmallVector<SILValue, 4> elements;
+  int fieldIdx = 0;
+  for (auto *field : structTy->getStoredProperties()) {
+    SILType fieldTy = loadTy.getFieldType(field, module, expansionContext);
+    auto *projection =
+        builder.createStructElementAddr(loc, splitAddress, field, fieldTy);
+    SILValue fieldVal;
+    if (fieldIdx++ == subIndex)
+      fieldVal = splitLoad(projection, remainingPath.drop_back(), builder,
+                           Loads, ldstIdx);
+    else {
+      fieldVal = builder.createLoad(loc, projection,
+                                    LoadOwnershipQualifier::Unqualified);
+      recordDisjointLoad(cast<LoadInst>(fieldVal));
+    }
+    elements.push_back(fieldVal);
+  }
+  return builder.createStruct(loc, loadTy.getObjectType(), elements);
+}
+
+/// Find all loads that contain \p accessPath. Split them into a load with
+/// identical accessPath and a set of non-overlapping loads. Add the new
+/// non-overlapping loads to LoadsAndStores and HoistUp.
+///
+/// TODO: The \p storeAddr parameter is only needed until we have an
+/// AliasAnalysis interface that handles AccessPath.
+bool LoopTreeOptimization::splitLoads(SmallVectorImpl<LoadInst *> &Loads,
+                                      AccessPath accessPath,
+                                      SILValue storeAddr) {
+  // The Loads set may mutate during this loop, but we only want to visit the
+  // original set.
+  for (unsigned loadsIdx = 0, endIdx = Loads.size(); loadsIdx != endIdx;
+       ++loadsIdx) {
+    auto *load = Loads[loadsIdx];
+    if (toDelete.count(load))
+      continue;
+
+    if (!AA->mayReadFromMemory(load, storeAddr))
+      continue;
+
+    AccessPath loadAccessPath = AccessPath::compute(load->getOperand());
+    if (accessPath.contains(loadAccessPath))
+      continue;
+
+    assert(loadAccessPath.contains(accessPath));
+    LLVM_DEBUG(llvm::dbgs() << "Overlaps with loop stores: " << *load);
+    SmallVector<AccessPath::Index, 4> reversePathIndices;
+    if (!computeInnerAccessPath(loadAccessPath.getPathNode(),
+                                accessPath.getPathNode(), storeAddr,
+                                reversePathIndices)) {
+      return false;
+    }
+    // Found a load wider than the store to accessPath.
+    //
+    // SplitLoads is called for each unique access path in the loop that is
+    // only loaded from and stored to and this loop takes time proportional to:
+    //   num-wide-loads x num-fields x num-loop-memops
+    //
+    // For each load wider than the store, it creates a new load for each field
+    // in that type. Each new load is inserted in the LoadsAndStores vector. To
+    // avoid super-linear behavior for large types (e.g. giant tuples), limit
+    // growth of new loads to an arbitrary constant factor per access path.
+    if (Loads.size() >= endIdx + 6) {
+      LLVM_DEBUG(llvm::dbgs() << "...Refusing to split more loads\n");
+      return false;
+    }
+    LLVM_DEBUG(llvm::dbgs() << "...Splitting load\n");
+
+    unsigned ldstIdx = [this, load]() {
+      auto ldstIter = llvm::find(LoadsAndStores, load);
+      assert(ldstIter != LoadsAndStores.end() && "outerLoad missing");
+      return std::distance(LoadsAndStores.begin(), ldstIter);
+    }();
+
+    SILBuilderWithScope builder(load);
+
+    SILValue aggregateVal = splitLoad(load->getOperand(), reversePathIndices,
+                                      builder, Loads, ldstIdx);
+
+    load->replaceAllUsesWith(aggregateVal);
+    auto iterAndInserted = toDelete.insert(load);
+    (void)iterAndInserted;
+    assert(iterAndInserted.second && "the same load should only be split once");
+  }
+  return true;
+}
+
 bool LoopTreeOptimization::optimizeLoop(
     std::unique_ptr<LoopNestSummary> &CurrSummary) {
   auto *CurrentLoop = CurrSummary->Loop;
@@ -964,6 +1152,8 @@ bool LoopTreeOptimization::optimizeLoop(
   currChanged |= sinkInstructions(CurrSummary, DomTree, LoopInfo, SinkDown);
   currChanged |=
       hoistSpecialInstruction(CurrSummary, DomTree, LoopInfo, SpecialHoist);
+
+  assert(toDelete.empty() && "only hostAllLoadsAndStores deletes");
   return currChanged;
 }
 
@@ -1089,8 +1279,8 @@ storesCommonlyDominateLoopExits(AccessPath accessPath,
   return true;
 }
 
-void LoopTreeOptimization::hoistLoadsAndStores(
-    AccessPath accessPath, SILLoop *loop, InstVector &toDelete) {
+void LoopTreeOptimization::
+hoistLoadsAndStores(AccessPath accessPath, SILLoop *loop) {
   SmallVector<SILBasicBlock *, 4> exitingAndLatchBlocks;
   loop->getExitingAndLatchBlocks(exitingAndLatchBlocks);
 
@@ -1171,7 +1361,7 @@ void LoopTreeOptimization::hoistLoadsAndStores(
     if (auto *SI = isStoreToAccess(I, accessPath)) {
       LLVM_DEBUG(llvm::dbgs() << "Deleting reloaded store " << *SI);
       currentVal = SI->getSrc();
-      toDelete.push_back(SI);
+      toDelete.insert(SI);
       continue;
     }
     auto loadWithAccess = isLoadWithinAccess(I, accessPath);
@@ -1190,7 +1380,7 @@ void LoopTreeOptimization::hoistLoadsAndStores(
     LLVM_DEBUG(llvm::dbgs() << "Replacing stored load " << *load << " with "
                             << projectedValue);
     load->replaceAllUsesWith(projectedValue);
-    toDelete.push_back(load);
+    toDelete.insert(load);
   }
 
   // Store back the value at all loop exits.
@@ -1215,17 +1405,20 @@ void LoopTreeOptimization::hoistLoadsAndStores(
 }
 
 bool LoopTreeOptimization::hoistAllLoadsAndStores(SILLoop *loop) {
-  InstVector toDelete;
   for (AccessPath accessPath : LoadAndStoreAddrs) {
-    hoistLoadsAndStores(accessPath, loop, toDelete);
+    hoistLoadsAndStores(accessPath, loop);
   }
   LoadsAndStores.clear();
   LoadAndStoreAddrs.clear();
 
+  if (toDelete.empty())
+    return false;
+
   for (SILInstruction *I : toDelete) {
-    I->eraseFromParent();
+    recursivelyDeleteTriviallyDeadInstructions(I, /*force*/ true);
   }
-  return !toDelete.empty();
+  toDelete.clear();
+  return true;
 }
 
 namespace {

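To make the splitting step concrete, here is a value-level Swift sketch
(hypothetical types, not compiler code) of what splitLoad produces for one wide
load: each element is loaded separately and the original aggregate is rebuilt,
so existing users of the wide load are unaffected, while the element that
matches the stored access path can be paired with the loop stores:

  struct Inner { var x: Int; var y: Int }
  struct Outer { var a: Inner; var b: Int }

  func splitEquivalent(_ p: UnsafeMutablePointer<Outer>) -> Outer {
    // Before the split this would be a single wide load: let whole = p.pointee
    let ax = p.pointee.a.x   // load matching the stored access path
    let ay = p.pointee.a.y   // disjoint load, hoisted normally
    let b  = p.pointee.b     // disjoint load, hoisted normally
    return Outer(a: Inner(x: ax, y: ay), b: b)  // re-aggregate for existing uses
  }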