Skip to content

Commit 82f99fa

Browse files
committed
[LoopIdiom] Initial support for generating memset_pattern intrinsic (disabled by default)
When enabled via -loop-idiom-enable-memset-pattern-intrinsic, produce the memset_pattern intrinsic that was introduced in llvm#97583.
1 parent 627f1ef commit 82f99fa

File tree

2 files changed

+230
-21
lines changed

2 files changed

+230
-21
lines changed

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

Lines changed: 89 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ static cl::opt<bool> UseLIRCodeSizeHeurs(
133133
"with -Os/-Oz"),
134134
cl::init(true), cl::Hidden);
135135

136+
// Hidden opt-in switch (initialised to false) that lets this pass emit the
// memset_pattern intrinsic. It is consulted next to the memset_pattern16
// libcall availability checks, so the pattern idiom can be formed even when
// that libcall is not available.
static cl::opt<bool> EnableMemsetPatternIntrinsic(
137+
"loop-idiom-enable-memset-pattern-intrinsic",
138+
cl::desc("Enable use of the memset_pattern intrinsic."), cl::init(false),
139+
cl::Hidden);
140+
136141
namespace {
137142

138143
class LoopIdiomRecognize {
@@ -300,7 +305,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
300305
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
301306
HasMemcpy = TLI->has(LibFunc_memcpy);
302307

303-
if (HasMemset || HasMemsetPattern || HasMemcpy)
308+
if (HasMemset || HasMemsetPattern || EnableMemsetPatternIntrinsic ||
309+
HasMemcpy)
304310
if (SE->hasLoopInvariantBackedgeTakenCount(L))
305311
return runOnCountableLoop();
306312

@@ -457,7 +463,8 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
457463
// It looks like we can use SplatValue.
458464
return LegalStoreKind::Memset;
459465
}
460-
if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
466+
if (!UnorderedAtomic && (HasMemsetPattern || EnableMemsetPatternIntrinsic) &&
467+
!DisableLIRP::Memset &&
461468
// Don't create memset_pattern16s with address spaces.
462469
StorePtr->getType()->getPointerAddressSpace() == 0 &&
463470
getMemSetPatternValue(StoredVal, DL)) {
@@ -993,6 +1000,46 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
9931000
SCEV::FlagNUW);
9941001
}
9951002

1003+
/// Convert the pattern constant of a memset_pattern idiom into the ConstantInt
/// operand passed to the memset_pattern intrinsic.
///
/// \param Context            Context in which the resulting constant is made.
/// \param MemSetPatternValue Either a ConstantInt, which is returned unchanged,
///                           or a ConstantDataArray of integer or
///                           floating-point elements totalling at most 128
///                           bits.
/// \returns The pattern as a ConstantInt. For an array input the result is
///          128 bits wide, with element 0 ending up in the most significant
///          occupied position and the last element in the low bits.
///
/// Calls report_fatal_error if the value is neither of the two accepted kinds
/// or if the array elements add up to more than 128 bits.
///
/// NOTE(review): a ConstantInt input is passed through without a width check,
/// so it is presumably already i128 when it reaches this point -- confirm
/// against the code that builds the pattern value.
/// NOTE(review): the packing order only matters when the elements differ;
/// confirm it against the intrinsic's expected byte order if non-splat arrays
/// can ever reach this function.
static ConstantInt *
memSetPatternValueToI128ConstantInt(LLVMContext &Context,
                                    Value *MemSetPatternValue) {
  if (auto *CI = dyn_cast<ConstantInt>(MemSetPatternValue))
    return CI;

  auto *Array = dyn_cast<ConstantDataArray>(MemSetPatternValue);
  if (!Array)
    report_fatal_error("Encountered unrecognised type");

  Type *ElementType = Array->getElementType();
  const unsigned ElementSize = Array->getElementByteSize() * 8;
  const unsigned NumElements = Array->getNumElements();

  APInt Result(128, 0);
  unsigned TotalBits = 0;
  for (unsigned I = 0; I != NumElements; ++I) {
    // Refuse to silently drop bits shifted out of the 128-bit accumulator.
    if (TotalBits + ElementSize > 128)
      report_fatal_error("Pattern value unexpectedly greater than 128 bits");

    // Obtain the raw bit image of the element, whatever its scalar type.
    APInt ElementBits;
    if (ElementType->isIntegerTy())
      ElementBits = Array->getElementAsAPInt(I);
    else if (ElementType->isFloatingPointTy())
      ElementBits = Array->getElementAsAPFloat(I).bitcastToAPInt();
    else
      llvm_unreachable("Unexpected element type");

    // Make room for this element below the bits gathered so far, then OR it
    // into the freed low bits.
    Result = (Result << ElementSize) | ElementBits.zextOrTrunc(128);
    TotalBits += ElementSize;
  }

  return ConstantInt::get(Context, Result);
}
1042+
9961043
/// processLoopStridedStore - We see a strided store of some value. If we can
9971044
/// transform this into a memset or memset_pattern in the loop preheader, do so.
9981045
bool LoopIdiomRecognize::processLoopStridedStore(
@@ -1070,7 +1117,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
10701117
Value *NumBytes =
10711118
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
10721119

1073-
if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))
1120+
if (!SplatValue && !(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16) ||
1121+
EnableMemsetPatternIntrinsic))
10741122
return Changed;
10751123

10761124
AAMDNodes AATags = TheStore->getAAMetadata();
@@ -1087,24 +1135,44 @@ bool LoopIdiomRecognize::processLoopStridedStore(
10871135
BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
10881136
/*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
10891137
} else {
1090-
assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
1091-
// Everything is emitted in default address space
1092-
Type *Int8PtrTy = DestInt8PtrTy;
1093-
1094-
StringRef FuncName = "memset_pattern16";
1095-
FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
1096-
Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
1097-
inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
1098-
1099-
// Otherwise we should form a memset_pattern16. PatternValue is known to be
1100-
// an constant array of 16-bytes. Plop the value into a mergable global.
1101-
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1102-
GlobalValue::PrivateLinkage,
1103-
PatternValue, ".memset_pattern");
1104-
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1105-
GV->setAlignment(Align(16));
1106-
Value *PatternPtr = GV;
1107-
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1138+
assert(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16) ||
1139+
EnableMemsetPatternIntrinsic);
1140+
if (EnableMemsetPatternIntrinsic) {
1141+
// Everything is emitted in default address space
1142+
1143+
// Get or insert the intrinsic declaration
1144+
Function *MemsetPatternIntrinsic = Intrinsic::getDeclaration(
1145+
M, Intrinsic::memset_pattern,
1146+
{DestInt8PtrTy, Builder.getInt128Ty(), Builder.getInt64Ty()});
1147+
1148+
// Create the call to the intrinsic
1149+
NewCall = Builder.CreateCall(
1150+
MemsetPatternIntrinsic,
1151+
{BasePtr,
1152+
memSetPatternValueToI128ConstantInt(M->getContext(), PatternValue),
1153+
NumBytes, ConstantInt::getFalse(M->getContext())});
1154+
} else {
1155+
// Everything is emitted in default address space
1156+
Type *Int8PtrTy = DestInt8PtrTy;
1157+
1158+
StringRef FuncName = "memset_pattern16";
1159+
FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
1160+
Builder.getVoidTy(), Int8PtrTy,
1161+
Int8PtrTy, IntIdxTy);
1162+
inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
1163+
1164+
// Otherwise we should form a memset_pattern16. PatternValue is known to
1165+
// be an constant array of 16-bytes. Plop the value into a mergable
1166+
// global.
1167+
GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
1168+
GlobalValue::PrivateLinkage,
1169+
PatternValue, ".memset_pattern");
1170+
GV->setUnnamedAddr(
1171+
GlobalValue::UnnamedAddr::Global); // Ok to merge these.
1172+
GV->setAlignment(Align(16));
1173+
Value *PatternPtr = GV;
1174+
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
1175+
}
11081176

11091177
// Set the TBAA info if present.
11101178
if (AATags.TBAA)
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -passes="loop-idiom" -loop-idiom-enable-memset-pattern-intrinsic < %s -S | FileCheck %s
3+
4+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
5+
6+
target triple = "x86_64-apple-darwin10.0.0"
7+
8+
9+
; A 16-iteration loop storing the constant double 3.14159 (scalar TBAA tag !5).
; The CHECK lines expect one llvm.memset_pattern call of 128 bytes, carrying a
; !tbaa attachment, placed in the entry block ahead of the loop.
define dso_local void @double_memset(ptr nocapture %p) {
10+
; CHECK-LABEL: @double_memset(
11+
; CHECK-NEXT: entry:
12+
; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 85118011523600494056561698149391631982, i64 128, i1 false), !tbaa [[TBAA0:![0-9]+]]
13+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
14+
; CHECK: for.cond.cleanup:
15+
; CHECK-NEXT: ret void
16+
; CHECK: for.body:
17+
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
18+
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]]
19+
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
20+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
21+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
22+
;
23+
entry:
24+
br label %for.body
25+
26+
for.cond.cleanup:
27+
ret void
28+
29+
for.body:
30+
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
31+
%ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07
32+
store double 3.14159e+00, ptr %ptr1, align 1, !tbaa !5
33+
%inc = add nuw nsw i64 %i.07, 1
34+
%exitcond.not = icmp eq i64 %inc, 16
35+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
36+
}
37+
38+
39+
; Same 16-iteration double store as @double_memset, but the store carries the
; struct-path style TBAA tag !10. The CHECK lines expect the generated
; llvm.memset_pattern call (128 bytes) to carry a !tbaa attachment.
define dso_local void @struct_memset(ptr nocapture %p) {
40+
; CHECK-LABEL: @struct_memset(
41+
; CHECK-NEXT: entry:
42+
; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 85118011523600494056561698149391631982, i64 128, i1 false), !tbaa [[TBAA4:![0-9]+]]
43+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
44+
; CHECK: for.cond.cleanup:
45+
; CHECK-NEXT: ret void
46+
; CHECK: for.body:
47+
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
48+
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]]
49+
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
50+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
51+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
52+
;
53+
entry:
54+
br label %for.body
55+
56+
for.cond.cleanup:
57+
ret void
58+
59+
for.body:
60+
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
61+
%ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07
62+
store double 3.14159e+00, ptr %ptr1, align 1, !tbaa !10
63+
%inc = add nuw nsw i64 %i.07, 1
64+
%exitcond.not = icmp eq i64 %inc, 16
65+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
66+
}
67+
68+
; The loop exits when the counter equals the runtime value %len, so the byte
; count handed to llvm.memset_pattern is computed in the entry block as
; %len << 3. The autogenerated CHECK line for the call records no !tbaa
; attachment.
define dso_local void @var_memset(ptr nocapture %p, i64 %len) {
69+
; CHECK-LABEL: @var_memset(
70+
; CHECK-NEXT: entry:
71+
; CHECK-NEXT: [[TMP0:%.*]] = shl nuw i64 [[LEN:%.*]], 3
72+
; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 85118011523600494056561698149391631982, i64 [[TMP0]], i1 false)
73+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
74+
; CHECK: for.cond.cleanup:
75+
; CHECK-NEXT: ret void
76+
; CHECK: for.body:
77+
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
78+
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]]
79+
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
80+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[LEN]]
81+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
82+
;
83+
entry:
84+
br label %for.body
85+
86+
for.cond.cleanup:
87+
ret void
88+
89+
for.body:
90+
%i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
91+
%ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07
92+
store double 3.14159e+00, ptr %ptr1, align 1, !tbaa !10
93+
%inc = add nuw nsw i64 %i.07, 1
94+
%exitcond.not = icmp eq i64 %inc, %len
95+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
96+
}
97+
98+
; Stores i32 1 into 10000 consecutive elements. The CHECK lines expect one
; llvm.memset_pattern call covering 40000 bytes whose i128 pattern is the
; value 1 repeated in each of its four 32-bit lanes.
define void @test11_pattern(ptr nocapture %P) nounwind ssp {
99+
; CHECK-LABEL: @test11_pattern(
100+
; CHECK-NEXT: entry:
101+
; CHECK-NEXT: call void @llvm.memset_pattern.p0.i128.i64(ptr [[P:%.*]], i128 79228162532711081671548469249, i64 40000, i1 false)
102+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
103+
; CHECK: for.body:
104+
; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
105+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[P]], i64 [[INDVAR]]
106+
; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
107+
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 10000
108+
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
109+
; CHECK: for.end:
110+
; CHECK-NEXT: ret void
111+
;
112+
entry:
113+
br label %for.body
114+
115+
for.body: ; preds = %entry, %for.body
116+
%indvar = phi i64 [ 0, %entry ], [ %indvar.next, %for.body ]
117+
%arrayidx = getelementptr i32, ptr %P, i64 %indvar
118+
store i32 1, ptr %arrayidx, align 4
119+
%indvar.next = add i64 %indvar, 1
120+
%exitcond = icmp eq i64 %indvar.next, 10000
121+
br i1 %exitcond, label %for.end, label %for.body
122+
123+
for.end: ; preds = %for.body
124+
ret void
125+
}
126+
127+
!5 = !{!6, !6, i64 0}
128+
!6 = !{!"double", !7, i64 0}
129+
!7 = !{!"omnipotent char", !8, i64 0}
130+
!8 = !{!"Simple C++ TBAA"}
131+
132+
!15 = !{!8, i64 0, !"omnipotent char"}
133+
!17 = !{!15, i64 8, !"double"}
134+
!9 = !{!15, i64 32, !"_ZTS1A", !17, i64 0, i64 8, !17, i64 8, i64 8, !17, i64 16, i64 8, !17, i64 24, i64 8}
135+
!10 = !{!9, !17, i64 0, i64 1}
136+
137+
!18 = !{!19, !20, i64 0}
138+
!19 = !{!"A", !20, i64 0, !22, i64 8}
139+
!20 = !{!"any pointer", !7, i64 0}
140+
!21 = !{!22, !20, i64 0}
141+
!22 = !{!"B", !20, i64 0}

0 commit comments

Comments
 (0)