Skip to content

Commit 92ddd3c

Browse files
committed
[loop-idiom] Hoist loop memcpys to loop preheader
For a simple loop like: ``` struct S { int x; int y; char b; }; unsigned foo(S* __restrict__ a, S* b, int n) { for (int i = 0; i < n; i++) a[i] = b[i]; return sizeof(a[0]); } ``` We could eliminate the loop and convert it to a large memcpy of 12*n bytes. Currently this is not handled. Output of `opt -loop-idiom -S < memcpy_before.ll` ``` %struct.S = type { i32, i32, i8 } define dso_local i32 @_Z3fooP1SS0_i(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr { entry: %cmp7 = icmp sgt i32 %n, 0 br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup for.body.preheader: ; preds = %entry br label %for.body for.cond.cleanup.loopexit: ; preds = %for.body br label %for.cond.cleanup for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry ret i32 12 for.body: ; preds = %for.body, %for.body.preheader %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] %idxprom = zext i32 %i.08 to i64 %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom %0 = bitcast %struct.S* %arrayidx2 to i8* %1 = bitcast %struct.S* %arrayidx to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false) %inc = add nuw nsw i32 %i.08, 1 %cmp = icmp slt i32 %inc, %n br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit } ; Function Attrs: argmemonly nofree nosync nounwind willreturn declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0 attributes #0 = { argmemonly nofree nosync nounwind willreturn } ``` The loop idiom pass currently only handles load and store instructions. Since struct S is too big to fit in a register, the loop body contains a memcpy intrinsic. With this change, re-run `opt -loop-idiom -S < memcpy_before.ll`. The loop memcpy is promoted to loop preheader. 
For this trivial case, the loop is dead and will be removed by another pass. ``` %struct.S = type { i32, i32, i8 } define dso_local i32 @_Z3fooP1SS0_i(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr { entry: %a1 = bitcast %struct.S* %a to i8* %b2 = bitcast %struct.S* %b to i8* %cmp7 = icmp sgt i32 %n, 0 br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup for.body.preheader: ; preds = %entry %0 = zext i32 %n to i64 %1 = mul nuw nsw i64 %0, 12 call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %a1, i8* align 4 %b2, i64 %1, i1 false) br label %for.body for.cond.cleanup.loopexit: ; preds = %for.body br label %for.cond.cleanup for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry ret i32 12 for.body: ; preds = %for.body, %for.body.preheader %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] %idxprom = zext i32 %i.08 to i64 %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom %2 = bitcast %struct.S* %arrayidx2 to i8* %3 = bitcast %struct.S* %arrayidx to i8* %inc = add nuw nsw i32 %i.08, 1 %cmp = icmp slt i32 %inc, %n br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit } ; Function Attrs: argmemonly nofree nosync nounwind willreturn declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0 attributes #0 = { argmemonly nofree nosync nounwind willreturn } ``` Reviewed By: zino Differential Revision: https://reviews.llvm.org/D97667
1 parent 2bd4049 commit 92ddd3c

File tree

4 files changed

+468
-45
lines changed

4 files changed

+468
-45
lines changed

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

Lines changed: 157 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,13 @@ class LoopIdiomRecognize {
205205
enum class ForMemset { No, Yes };
206206
bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
207207
ForMemset For);
208+
209+
template <typename MemInst>
210+
bool processLoopMemIntrinsic(
211+
BasicBlock *BB,
212+
bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
213+
const SCEV *BECount);
214+
bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
208215
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
209216

210217
bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
@@ -214,6 +221,13 @@ class LoopIdiomRecognize {
214221
const SCEVAddRecExpr *Ev, const SCEV *BECount,
215222
bool NegStride, bool IsLoopMemset = false);
216223
bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
224+
bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
225+
unsigned StoreSize, MaybeAlign StoreAlign,
226+
MaybeAlign LoadAlign, Instruction *TheStore,
227+
Instruction *TheLoad,
228+
const SCEVAddRecExpr *StoreEv,
229+
const SCEVAddRecExpr *LoadEv,
230+
const SCEV *BECount);
217231
bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
218232
bool IsLoopMemset = false);
219233

@@ -628,22 +642,10 @@ bool LoopIdiomRecognize::runOnLoopBlock(
628642
for (auto &SI : StoreRefsForMemcpy)
629643
MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
630644

631-
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
632-
Instruction *Inst = &*I++;
633-
// Look for memset instructions, which may be optimized to a larger memset.
634-
if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
635-
WeakTrackingVH InstPtr(&*I);
636-
if (!processLoopMemSet(MSI, BECount))
637-
continue;
638-
MadeChange = true;
639-
640-
// If processing the memset invalidated our iterator, start over from the
641-
// top of the block.
642-
if (!InstPtr)
643-
I = BB->begin();
644-
continue;
645-
}
646-
}
645+
MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
646+
BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
647+
MadeChange |= processLoopMemIntrinsic<MemSetInst>(
648+
BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
647649

648650
return MadeChange;
649651
}
@@ -792,6 +794,80 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
792794
return Changed;
793795
}
794796

797+
/// processLoopMemIntrinsic - Template function for calling different processor
798+
/// functions based on mem intrinsic type.
799+
template <typename MemInst>
800+
bool LoopIdiomRecognize::processLoopMemIntrinsic(
801+
BasicBlock *BB,
802+
bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
803+
const SCEV *BECount) {
804+
bool MadeChange = false;
805+
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
806+
Instruction *Inst = &*I++;
807+
// Look for memory instructions, which may be optimized to a larger one.
808+
if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
809+
WeakTrackingVH InstPtr(&*I);
810+
if (!(this->*Processor)(MI, BECount))
811+
continue;
812+
MadeChange = true;
813+
814+
// If processing the instruction invalidated our iterator, start over from
815+
// the top of the block.
816+
if (!InstPtr)
817+
I = BB->begin();
818+
continue;
819+
}
820+
}
821+
return MadeChange;
822+
}
823+
824+
/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
825+
bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
826+
const SCEV *BECount) {
827+
// We can only handle non-volatile memcpys with a constant size.
828+
if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
829+
return false;
830+
831+
// If we're not allowed to hack on memcpy, we fail.
832+
if (!HasMemcpy || DisableLIRP::Memcpy)
833+
return false;
834+
835+
Value *Dest = MCI->getDest();
836+
Value *Source = MCI->getSource();
837+
if (!Dest || !Source)
838+
return false;
839+
840+
// See if the load and store pointer expressions are AddRec like {base,+,1} on
841+
// the current loop, which indicates a strided load and store. If we have
842+
// something else, it's a random load or store we can't handle.
843+
const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
844+
if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
845+
return false;
846+
const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
847+
if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
848+
return false;
849+
850+
// Reject memcpys that are so large that they overflow an unsigned.
851+
uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
852+
if ((SizeInBytes >> 32) != 0)
853+
return false;
854+
855+
// Check if the stride matches the size of the memcpy. If so, then we know
856+
// that every byte is touched in the loop.
857+
const SCEVConstant *ConstStride =
858+
dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
859+
if (!ConstStride)
860+
return false;
861+
862+
APInt Stride = ConstStride->getAPInt();
863+
if (SizeInBytes != Stride && SizeInBytes != -Stride)
864+
return false;
865+
866+
return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes,
867+
MCI->getDestAlign(), MCI->getSourceAlign(),
868+
MCI, MCI, StoreEv, LoadEv, BECount);
869+
}
870+
795871
/// processLoopMemSet - See if this memset can be promoted to a large memset.
796872
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
797873
const SCEV *BECount) {
@@ -800,7 +876,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
800876
return false;
801877

802878
// If we're not allowed to hack on memset, we fail.
803-
if (!HasMemset)
879+
if (!HasMemset || DisableLIRP::Memset)
804880
return false;
805881

806882
Value *Pointer = MSI->getDest();
@@ -1040,9 +1116,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
10401116
ORE.emit([&]() {
10411117
return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
10421118
NewCall->getDebugLoc(), Preheader)
1043-
<< "Transformed loop-strided store into a call to "
1119+
<< "Transformed loop-strided store in "
1120+
<< ore::NV("Function", TheStore->getFunction())
1121+
<< " function into a call to "
10441122
<< ore::NV("NewFunction", NewCall->getCalledFunction())
1045-
<< "() function";
1123+
<< "() intrinsic";
10461124
});
10471125

10481126
// Okay, the memset has been formed. Zap the original store and anything that
@@ -1068,20 +1146,25 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
10681146

10691147
Value *StorePtr = SI->getPointerOperand();
10701148
const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
1071-
APInt Stride = getStoreStride(StoreEv);
10721149
unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
1073-
bool NegStride = StoreSize == -Stride;
10741150

10751151
// The store must be feeding a non-volatile load.
10761152
LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
10771153
assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
10781154

1079-
// See if the pointer expression is an AddRec like {base,+,1} on the current
1080-
// loop, which indicates a strided load. If we have something else, it's a
1081-
// random load we can't handle.
10821155
const SCEVAddRecExpr *LoadEv =
10831156
cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
1157+
Value *LoadPtr = LI->getPointerOperand();
1158+
return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSize,
1159+
SI->getAlign(), LI->getAlign(), SI, LI,
1160+
StoreEv, LoadEv, BECount);
1161+
}
10841162

1163+
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1164+
Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign,
1165+
MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
1166+
const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv,
1167+
const SCEV *BECount) {
10851168
// The trip count of the loop and the base pointer of the addrec SCEV is
10861169
// guaranteed to be loop invariant, which means that it should dominate the
10871170
// header. This allows us to insert code for it in the preheader.
@@ -1093,9 +1176,12 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
10931176

10941177
bool Changed = false;
10951178
const SCEV *StrStart = StoreEv->getStart();
1096-
unsigned StrAS = SI->getPointerAddressSpace();
1179+
unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
10971180
Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
10981181

1182+
APInt Stride = getStoreStride(StoreEv);
1183+
bool NegStride = StoreSize == -Stride;
1184+
10991185
// Handle negative strided loops.
11001186
if (NegStride)
11011187
StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
@@ -1119,13 +1205,26 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
11191205
Changed = true;
11201206

11211207
SmallPtrSet<Instruction *, 1> Stores;
1122-
Stores.insert(SI);
1208+
Stores.insert(TheStore);
1209+
1210+
bool IsMemCpy = isa<MemCpyInst>(TheStore);
1211+
const std::string InstRemark = IsMemCpy ? "memcpy" : "load and store";
1212+
11231213
if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
1124-
StoreSize, *AA, Stores))
1214+
StoreSize, *AA, Stores)) {
1215+
ORE.emit([&]() {
1216+
return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
1217+
TheStore)
1218+
<< ore::NV("Inst", InstRemark) << " in "
1219+
<< ore::NV("Function", TheStore->getFunction())
1220+
<< " function will not be hoisted: "
1221+
<< ore::NV("Reason", "The loop may access store location");
1222+
});
11251223
return Changed;
1224+
}
11261225

11271226
const SCEV *LdStart = LoadEv->getStart();
1128-
unsigned LdAS = LI->getPointerAddressSpace();
1227+
unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
11291228

11301229
// Handle negative strided loops.
11311230
if (NegStride)
@@ -1136,9 +1235,21 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
11361235
Value *LoadBasePtr = Expander.expandCodeFor(
11371236
LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
11381237

1238+
// If the store is a memcpy instruction, we must check if it will write to
1239+
// the load memory locations. So remove it from the ignored stores.
1240+
if (IsMemCpy)
1241+
Stores.erase(TheStore);
11391242
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
1140-
StoreSize, *AA, Stores))
1243+
StoreSize, *AA, Stores)) {
1244+
ORE.emit([&]() {
1245+
return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1246+
<< ore::NV("Inst", InstRemark) << " in "
1247+
<< ore::NV("Function", TheStore->getFunction())
1248+
<< " function will not be hoisted: "
1249+
<< ore::NV("Reason", "The loop may access load location");
1250+
});
11411251
return Changed;
1252+
}
11421253

11431254
if (avoidLIRForMultiBlockLoop())
11441255
return Changed;
@@ -1155,15 +1266,15 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
11551266
// Check whether to generate an unordered atomic memcpy:
11561267
// If the load or store are atomic, then they must necessarily be unordered
11571268
// by previous checks.
1158-
if (!SI->isAtomic() && !LI->isAtomic())
1159-
NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
1160-
LI->getAlign(), NumBytes);
1269+
if (!TheStore->isAtomic() && !TheLoad->isAtomic())
1270+
NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
1271+
LoadAlign, NumBytes);
11611272
else {
11621273
// We cannot allow unaligned ops for unordered load/store, so reject
11631274
// anything where the alignment isn't at least the element size.
1164-
const Align StoreAlign = SI->getAlign();
1165-
const Align LoadAlign = LI->getAlign();
1166-
if (StoreAlign < StoreSize || LoadAlign < StoreSize)
1275+
assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
1276+
"Expect unordered load/store to have align.");
1277+
if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
11671278
return Changed;
11681279

11691280
// If the element.atomic memcpy is not lowered into explicit
@@ -1177,10 +1288,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
11771288
// Note that unordered atomic loads/stores are *required* by the spec to
11781289
// have an alignment but non-atomic loads/stores may not.
11791290
NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1180-
StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
1181-
StoreSize);
1291+
StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
1292+
NumBytes, StoreSize);
11821293
}
1183-
NewCall->setDebugLoc(SI->getDebugLoc());
1294+
NewCall->setDebugLoc(TheStore->getDebugLoc());
11841295

11851296
if (MSSAU) {
11861297
MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
@@ -1189,23 +1300,26 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
11891300
}
11901301

11911302
LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
1192-
<< " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
1193-
<< " from store ptr=" << *StoreEv << " at: " << *SI
1303+
<< " from load ptr=" << *LoadEv << " at: " << *TheLoad
1304+
<< "\n"
1305+
<< " from store ptr=" << *StoreEv << " at: " << *TheStore
11941306
<< "\n");
11951307

11961308
ORE.emit([&]() {
11971309
return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
11981310
NewCall->getDebugLoc(), Preheader)
11991311
<< "Formed a call to "
12001312
<< ore::NV("NewFunction", NewCall->getCalledFunction())
1201-
<< "() function";
1313+
<< "() intrinsic from " << ore::NV("Inst", InstRemark)
1314+
<< " instruction in " << ore::NV("Function", TheStore->getFunction())
1315+
<< " function";
12021316
});
12031317

12041318
// Okay, the memcpy has been formed. Zap the original store and anything that
12051319
// feeds into it.
12061320
if (MSSAU)
1207-
MSSAU->removeMemoryAccess(SI, true);
1208-
deleteDeadInstruction(SI);
1321+
MSSAU->removeMemoryAccess(TheStore, true);
1322+
deleteDeadInstruction(TheStore);
12091323
if (MSSAU && VerifyMemorySSA)
12101324
MSSAU->getMemorySSA()->verifyMemorySSA();
12111325
++NumMemCpy;

llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
66

77
; Check that everything still works when debuginfo is present, and that it is reasonably propagated.
88

9-
; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() function
9+
; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() intrinsic from load and store instruction in test6_dest_align function
1010

1111
define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp {
1212
; CHECK-LABEL: @test6_dest_align(

0 commit comments

Comments
 (0)