@@ -205,6 +205,13 @@ class LoopIdiomRecognize {
   enum class ForMemset { No, Yes };
   bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                          ForMemset For);
+
+  template <typename MemInst>
+  bool processLoopMemIntrinsic(
+      BasicBlock *BB,
+      bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
+      const SCEV *BECount);
+  bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
   bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

   bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
@@ -214,6 +221,13 @@ class LoopIdiomRecognize {
                                const SCEVAddRecExpr *Ev, const SCEV *BECount,
                                bool NegStride, bool IsLoopMemset = false);
   bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
+  bool processLoopStoreOfLoopLoad(Value *DestPtr, Value *SourcePtr,
+                                  unsigned StoreSize, MaybeAlign StoreAlign,
+                                  MaybeAlign LoadAlign, Instruction *TheStore,
+                                  Instruction *TheLoad,
+                                  const SCEVAddRecExpr *StoreEv,
+                                  const SCEVAddRecExpr *LoadEv,
+                                  const SCEV *BECount);
   bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                  bool IsLoopMemset = false);

@@ -628,22 +642,10 @@ bool LoopIdiomRecognize::runOnLoopBlock(
   for (auto &SI : StoreRefsForMemcpy)
     MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
-    Instruction *Inst = &*I++;
-    // Look for memset instructions, which may be optimized to a larger memset.
-    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
-      WeakTrackingVH InstPtr(&*I);
-      if (!processLoopMemSet(MSI, BECount))
-        continue;
-      MadeChange = true;
-
-      // If processing the memset invalidated our iterator, start over from the
-      // top of the block.
-      if (!InstPtr)
-        I = BB->begin();
-      continue;
-    }
-  }
+  MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
+      BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
+  MadeChange |= processLoopMemIntrinsic<MemSetInst>(
+      BB, &LoopIdiomRecognize::processLoopMemSet, BECount);

   return MadeChange;
 }
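The two calls above hand runOnLoopBlock's old scanning loop off to the new processLoopMemIntrinsic template, which dispatches through a pointer to member function. As a minimal standalone C++ sketch of that dispatch idiom (Scanner, processInt and processLong are invented names for illustration, not LLVM APIs):

#include <iostream>

struct Scanner {
  bool processInt(int *P) { std::cout << "int " << *P << "\n"; return true; }
  bool processLong(long *P) { std::cout << "long " << *P << "\n"; return true; }

  // The template parameter fixes the operand type; Processor is a pointer to
  // a member function taking that type, mirroring processLoopMemIntrinsic.
  template <typename T> bool process(bool (Scanner::*Processor)(T *), T *Obj) {
    return (this->*Processor)(Obj); // same syntax as (this->*Processor)(MI, BECount)
  }
};

int main() {
  Scanner S;
  int I = 1;
  long L = 2;
  S.process(&Scanner::processInt, &I);  // T deduced as int
  S.process(&Scanner::processLong, &L); // T deduced as long
  return 0;
}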
@@ -792,6 +794,80 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
   return Changed;
 }

+/// processLoopMemIntrinsic - Template function for calling different processor
+/// functions based on mem intrinsic type.
+template <typename MemInst>
+bool LoopIdiomRecognize::processLoopMemIntrinsic(
+    BasicBlock *BB,
+    bool (LoopIdiomRecognize::*Processor)(MemInst *, const SCEV *),
+    const SCEV *BECount) {
+  bool MadeChange = false;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+    Instruction *Inst = &*I++;
+    // Look for memory instructions, which may be optimized to a larger one.
+    if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
+      WeakTrackingVH InstPtr(&*I);
+      if (!(this->*Processor)(MI, BECount))
+        continue;
+      MadeChange = true;
+
+      // If processing the instruction invalidated our iterator, start over from
+      // the top of the block.
+      if (!InstPtr)
+        I = BB->begin();
+      continue;
+    }
+  }
+  return MadeChange;
+}
+
+/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy.
+bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
+                                           const SCEV *BECount) {
+  // We can only handle non-volatile memcpys with a constant size.
+  if (MCI->isVolatile() || !isa<ConstantInt>(MCI->getLength()))
+    return false;
+
+  // If we're not allowed to hack on memcpy, we fail.
+  if (!HasMemcpy || DisableLIRP::Memcpy)
+    return false;
+
+  Value *Dest = MCI->getDest();
+  Value *Source = MCI->getSource();
+  if (!Dest || !Source)
+    return false;
+
+  // See if the load and store pointer expressions are AddRec like {base,+,1} on
+  // the current loop, which indicates a strided load and store.  If we have
+  // something else, it's a random load or store we can't handle.
+  const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
+  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+    return false;
+  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
+  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+    return false;
+
+  // Reject memcpys that are so large that they overflow an unsigned.
+  uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
+  if ((SizeInBytes >> 32) != 0)
+    return false;
+
+  // Check if the stride matches the size of the memcpy. If so, then we know
+  // that every byte is touched in the loop.
+  const SCEVConstant *ConstStride =
+      dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
+  if (!ConstStride)
+    return false;
+
+  APInt Stride = ConstStride->getAPInt();
+  if (SizeInBytes != Stride && SizeInBytes != -Stride)
+    return false;
+
+  return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes,
+                                    MCI->getDestAlign(), MCI->getSourceAlign(),
+                                    MCI, MCI, StoreEv, LoadEv, BECount);
+}
+
 /// processLoopMemSet - See if this memset can be promoted to a large memset.
 bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                            const SCEV *BECount) {
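For orientation, a hedged example (invented, not taken from the patch or its tests) of a source loop that the new processLoopMemCpy path is designed to collapse: both pointers are affine AddRecs on the loop and the constant stride equals the memcpy length, so every byte in the range is written exactly once.

#include <cstring>

struct Elem { int A, B; };

// Each iteration copies one element, so the pointer stride (sizeof(Elem))
// matches the memcpy size; the whole loop can become a single memcpy of
// N * sizeof(Elem) bytes emitted in the preheader.
void copyElems(Elem *Dst, const Elem *Src, unsigned long N) {
  for (unsigned long I = 0; I != N; ++I)
    std::memcpy(&Dst[I], &Src[I], sizeof(Elem));
}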
@@ -800,7 +876,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
     return false;

   // If we're not allowed to hack on memset, we fail.
-  if (!HasMemset)
+  if (!HasMemset || DisableLIRP::Memset)
     return false;

   Value *Pointer = MSI->getDest();
@@ -1040,9 +1116,11 @@ bool LoopIdiomRecognize::processLoopStridedStore(
   ORE.emit([&]() {
     return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
                               NewCall->getDebugLoc(), Preheader)
-           << "Transformed loop-strided store into a call to "
+           << "Transformed loop-strided store in "
+           << ore::NV("Function", TheStore->getFunction())
+           << " function into a call to "
            << ore::NV("NewFunction", NewCall->getCalledFunction())
-           << "() function";
+           << "() intrinsic";
   });

   // Okay, the memset has been formed.  Zap the original store and anything that
@@ -1068,20 +1146,25 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,

   Value *StorePtr = SI->getPointerOperand();
   const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
-  APInt Stride = getStoreStride(StoreEv);
   unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
-  bool NegStride = StoreSize == -Stride;

   // The store must be feeding a non-volatile load.
   LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
   assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

-  // See if the pointer expression is an AddRec like {base,+,1} on the current
-  // loop, which indicates a strided load.  If we have something else, it's a
-  // random load we can't handle.
   const SCEVAddRecExpr *LoadEv =
       cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+  Value *LoadPtr = LI->getPointerOperand();
+  return processLoopStoreOfLoopLoad(StorePtr, LoadPtr, StoreSize,
+                                    SI->getAlign(), LI->getAlign(), SI, LI,
+                                    StoreEv, LoadEv, BECount);
+}

+bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
+    Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign,
+    MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
+    const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv,
+    const SCEV *BECount) {
   // The trip count of the loop and the base pointer of the addrec SCEV is
   // guaranteed to be loop invariant, which means that it should dominate the
   // header.  This allows us to insert code for it in the preheader.
@@ -1093,9 +1176,12 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,

   bool Changed = false;
   const SCEV *StrStart = StoreEv->getStart();
-  unsigned StrAS = SI->getPointerAddressSpace();
+  unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
   Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));

+  APInt Stride = getStoreStride(StoreEv);
+  bool NegStride = StoreSize == -Stride;
+
   // Handle negative strided loops.
   if (NegStride)
     StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
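The Stride and NegStride computation now lives in the shared worker so both the store-of-load and memcpy callers get it. A hedged illustration (invented example) of a negative-stride loop this logic covers: the store address steps by -sizeof(int) each iteration, so StoreSize == -Stride and the start SCEVs are rebased by getStartForNegStride before one forward memcpy is emitted.

void reverseOrderCopy(int *Dst, const int *Src, unsigned long N) {
  // Walks downward from the end; still one contiguous copy overall.
  for (unsigned long I = N; I != 0; --I)
    Dst[I - 1] = Src[I - 1];
}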
@@ -1119,13 +1205,26 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   Changed = true;

   SmallPtrSet<Instruction *, 1> Stores;
-  Stores.insert(SI);
+  Stores.insert(TheStore);
+
+  bool IsMemCpy = isa<MemCpyInst>(TheStore);
+  const std::string InstRemark = IsMemCpy ? "memcpy" : "load and store";
+
   if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
-                            StoreSize, *AA, Stores))
+                            StoreSize, *AA, Stores)) {
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
+                                      TheStore)
+             << ore::NV("Inst", InstRemark) << " in "
+             << ore::NV("Function", TheStore->getFunction())
+             << " function will not be hoisted: "
+             << ore::NV("Reason", "The loop may access store location");
+    });
     return Changed;
+  }

   const SCEV *LdStart = LoadEv->getStart();
-  unsigned LdAS = LI->getPointerAddressSpace();
+  unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

   // Handle negative strided loops.
   if (NegStride)
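Assuming DEBUG_TYPE for this pass is still "loop-idiom", the new OptimizationRemarkMissed messages should be visible through opt's existing remark options, for example -pass-remarks-missed=loop-idiom (and the success remarks via -pass-remarks=loop-idiom); that is an expectation about existing flags, not something this patch adds.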
@@ -1136,9 +1235,21 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   Value *LoadBasePtr = Expander.expandCodeFor(
       LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

+  // If the store is a memcpy instruction, we must check if it will write to
+  // the load memory locations. So remove it from the ignored stores.
+  if (IsMemCpy)
+    Stores.erase(TheStore);
   if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
-                            StoreSize, *AA, Stores))
+                            StoreSize, *AA, Stores)) {
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
+             << ore::NV("Inst", InstRemark) << " in "
+             << ore::NV("Function", TheStore->getFunction())
+             << " function will not be hoisted: "
+             << ore::NV("Reason", "The loop may access load location");
+    });
     return Changed;
+  }

   if (avoidLIRForMultiBlockLoop())
     return Changed;
@@ -1155,15 +1266,15 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   // Check whether to generate an unordered atomic memcpy:
   //  If the load or store are atomic, then they must necessarily be unordered
   //  by previous checks.
-  if (!SI->isAtomic() && !LI->isAtomic())
-    NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
-                                   LI->getAlign(), NumBytes);
+  if (!TheStore->isAtomic() && !TheLoad->isAtomic())
+    NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
+                                   LoadAlign, NumBytes);
   else {
     // We cannot allow unaligned ops for unordered load/store, so reject
     // anything where the alignment isn't at least the element size.
-    const Align StoreAlign = SI->getAlign();
-    const Align LoadAlign = LI->getAlign();
-    if (StoreAlign < StoreSize || LoadAlign < StoreSize)
+    assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
+           "Expect unordered load/store to have align.");
+    if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
       return Changed;

     // If the element.atomic memcpy is not lowered into explicit
@@ -1177,10 +1288,10 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
     // Note that unordered atomic loads/stores are *required* by the spec to
     // have an alignment but non-atomic loads/stores may not.
     NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
-        StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
-        StoreSize);
+        StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
+        NumBytes, StoreSize);
   }
-  NewCall->setDebugLoc(SI->getDebugLoc());
+  NewCall->setDebugLoc(TheStore->getDebugLoc());

   if (MSSAU) {
     MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
@@ -1189,23 +1300,26 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
   }

   LLVM_DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
-                    << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
-                    << "    from store ptr=" << *StoreEv << " at: " << *SI
+                    << "    from load ptr=" << *LoadEv << " at: " << *TheLoad
+                    << "\n"
+                    << "    from store ptr=" << *StoreEv << " at: " << *TheStore
                     << "\n");

   ORE.emit([&]() {
     return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                               NewCall->getDebugLoc(), Preheader)
            << "Formed a call to "
            << ore::NV("NewFunction", NewCall->getCalledFunction())
-           << "() function";
+           << "() intrinsic from " << ore::NV("Inst", InstRemark)
+           << " instruction in " << ore::NV("Function", TheStore->getFunction())
+           << " function";
   });

   // Okay, the memcpy has been formed.  Zap the original store and anything that
   // feeds into it.
   if (MSSAU)
-    MSSAU->removeMemoryAccess(SI, true);
-  deleteDeadInstruction(SI);
+    MSSAU->removeMemoryAccess(TheStore, true);
+  deleteDeadInstruction(TheStore);
   if (MSSAU && VerifyMemorySSA)
     MSSAU->getMemorySSA()->verifyMemorySSA();
   ++NumMemCpy;