Skip to content

Commit 6c64c8a

Browse files
authored
[NVPTX] add an optional early copy of byval arguments (#113384)
byval arguments in NVPTX are special. We're only allowed to read from them using a special instruction, and if we ever need to write to them or take an address, we must make a local copy and use it, instead. The problem is that local copies are very expensive, and we create them very late in the compilation pipeline, so LLVM does not have much of a chance to eliminate them, if they turn out to be unnecessary. One way around that is to create such copies early on, and let them percolate through the optimizations. The copying itself will never trigger creation of another copy later on, as the reads are allowed. If LLVM can eliminate it, it's a win. If the full optimization pipeline can't remove the copy, that's as good as it gets in terms of the effort we could've made, and it's certainly a much better effort than what we do now. This early injection of the copies has the potential to create undesirable side-effects, so it's disabled by default, for now, until it sees more testing.
1 parent 789fdd5 commit 6c64c8a

File tree

5 files changed

+569
-173
lines changed

5 files changed

+569
-173
lines changed

llvm/lib/Target/NVPTX/NVPTX.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ struct GenericToNVVMPass : PassInfoMixin<GenericToNVVMPass> {
7070
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
7171
};
7272

73+
/// Function pass for the new pass manager that creates local copies of byval
/// arguments. The pass is registered under the name "nvptx-copy-byval-args".
struct NVPTXCopyByValArgsPass : PassInfoMixin<NVPTXCopyByValArgsPass> {
  /// Process \p F and report which analyses remain valid afterwards.
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
76+
7377
namespace NVPTX {
7478
enum DrvInterface {
7579
NVCL,

llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,33 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
543543
PI.setAborted(&II);
544544
}
545545
}; // struct ArgUseChecker
546+
547+
/// Replace all uses of the byval argument \p Arg of \p F with a local copy.
///
/// An alloca of the argument's byval type is inserted at the top of the entry
/// block, every existing use of \p Arg is redirected to that alloca, and a
/// memcpy that fills the alloca from the argument (accessed through a cast to
/// the param address space) is emitted in front of the instruction that was
/// first in the entry block on entry to this function.
void copyByValParam(Function &F, Argument &Arg) {
  LLVM_DEBUG(dbgs() << "Creating a local copy of " << Arg << "\n");
  BasicBlock::iterator FirstInst = F.getEntryBlock().begin();
  Type *StructType = Arg.getParamByValType();
  const DataLayout &DL = F.getDataLayout();
  AllocaInst *AllocA = new AllocaInst(StructType, DL.getAllocaAddrSpace(),
                                      Arg.getName(), FirstInst);
  // Set the alignment to alignment of the byval parameter. This is because,
  // later load/stores assume that alignment, and we are going to replace
  // the use of the byval parameter with this alloca instruction.
  AllocA->setAlignment(F.getParamAlign(Arg.getArgNo())
                           .value_or(DL.getPrefTypeAlign(StructType)));
  // Redirect the existing users before the cast below is created, so that the
  // cast (the source of the copy) keeps referring to the argument itself.
  Arg.replaceAllUsesWith(AllocA);

  Value *ArgInParam = new AddrSpaceCastInst(
      &Arg, PointerType::get(Arg.getContext(), ADDRESS_SPACE_PARAM),
      Arg.getName(), FirstInst);
  // Be sure to propagate the alignment to both sides of the copy; LLVM doesn't
  // know that NVPTX addrspacecast preserves alignment.
  // The alloca holds a single object of the byval type, so its size is the
  // alloc size of that type; query it directly instead of dereferencing the
  // optional returned by AllocaInst::getAllocationSize().
  const auto ArgSize = DL.getTypeAllocSize(StructType);
  IRBuilder<> IRB(&*FirstInst);
  IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(),
                   ArgSize);
}
546573
} // namespace
547574

548575
void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
@@ -558,7 +585,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
558585

559586
ArgUseChecker AUC(DL, IsGridConstant);
560587
ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
561-
bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());
588+
bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted());
562589
// Easy case, accessing parameter directly is fine.
563590
if (ArgUseIsReadOnly && AUC.Conditionals.empty()) {
564591
// Convert all loads and intermediate operations to use parameter AS and
@@ -587,7 +614,6 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
587614
// However, we're still not allowed to write to it. If the user specified
588615
// `__grid_constant__` for the argument, we'll consider escaped pointer as
589616
// read-only.
590-
unsigned AS = DL.getAllocaAddrSpace();
591617
if (HasCvtaParam && (ArgUseIsReadOnly || IsGridConstant)) {
592618
LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
593619
// Replace all argument pointer uses (which might include a device function
@@ -612,29 +638,8 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
612638

613639
// Do not replace Arg in the cast to param space
614640
CastToParam->setOperand(0, Arg);
615-
} else {
616-
LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n");
617-
// Otherwise we have to create a temporary copy.
618-
AllocaInst *AllocA =
619-
new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
620-
// Set the alignment to alignment of the byval parameter. This is because,
621-
// later load/stores assume that alignment, and we are going to replace
622-
// the use of the byval parameter with this alloca instruction.
623-
AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo())
624-
.value_or(DL.getPrefTypeAlign(StructType)));
625-
Arg->replaceAllUsesWith(AllocA);
626-
627-
Value *ArgInParam = new AddrSpaceCastInst(
628-
Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM),
629-
Arg->getName(), FirstInst);
630-
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
631-
// addrspacecast preserves alignment. Since params are constant, this load
632-
// is definitely not volatile.
633-
const auto ArgSize = *AllocA->getAllocationSize(DL);
634-
IRBuilder<> IRB(&*FirstInst);
635-
IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(),
636-
ArgSize);
637-
}
641+
} else
642+
copyByValParam(*Func, *Arg);
638643
}
639644

640645
void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
@@ -734,3 +739,22 @@ bool NVPTXLowerArgs::runOnFunction(Function &F) {
734739
}
735740

736741
FunctionPass *llvm::createNVPTXLowerArgsPass() { return new NVPTXLowerArgs(); }
742+
743+
/// Create a local copy for each byval pointer argument of \p F, except for
/// arguments that are grid-constant parameters of a kernel function.
///
/// \returns true if at least one argument was copied.
static bool copyFunctionByValArgs(Function &F) {
  LLVM_DEBUG(dbgs() << "Creating a copy of byval args of " << F.getName()
                    << "\n");
  bool MadeCopy = false;
  for (Argument &Param : F.args()) {
    // Only pointer arguments carrying the byval attribute are candidates.
    if (!Param.getType()->isPointerTy() || !Param.hasByValAttr())
      continue;
    // Grid-constant parameters of kernels are left untouched.
    if (isParamGridConstant(Param) && isKernelFunction(F))
      continue;
    copyByValParam(F, Param);
    MadeCopy = true;
  }
  return MadeCopy;
}
755+
756+
/// New pass manager entry point: copy the byval arguments of \p F and report
/// that no analyses are preserved if anything was changed, or that all of them
/// are preserved otherwise.
PreservedAnalyses NVPTXCopyByValArgsPass::run(Function &F,
                                              FunctionAnalysisManager &AM) {
  if (!copyFunctionByValArgs(F))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}

llvm/lib/Target/NVPTX/NVPTXPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,5 @@ FUNCTION_ALIAS_ANALYSIS("nvptx-aa", NVPTXAA())
3737
#endif
3838
FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
3939
FUNCTION_PASS("nvvm-reflect", NVVMReflectPass())
40+
FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
4041
#undef FUNCTION_PASS

llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,29 @@ static cl::opt<bool> UseShortPointersOpt(
6464
"Use 32-bit pointers for accessing const/local/shared address spaces."),
6565
cl::init(false), cl::Hidden);
6666

67+
// byval arguments in NVPTX are special. We're only allowed to read from them
68+
// using a special instruction, and if we ever need to write to them or take an
69+
// address, we must make a local copy and use it, instead.
70+
//
71+
// The problem is that local copies are very expensive, and we create them very
72+
// late in the compilation pipeline, so LLVM does not have much of a chance to
73+
// eliminate them, if they turn out to be unnecessary.
74+
//
75+
// One way around that is to create such copies early on, and let them percolate
76+
// through the optimizations. The copying itself will never trigger creation of
77+
// another copy later on, as the reads are allowed. If LLVM can eliminate it,
78+
// it's a win. If the full optimization pipeline can't remove the copy, that's
79+
// as good as it gets in terms of the effort we could've done, and it's
80+
// certainly a much better effort than what we do now.
81+
//
82+
// This early injection of the copies has the potential to create undesirable
83+
// side-effects, so it's disabled by default, for now, until it sees more
84+
// testing.
85+
// Hidden command-line switch, off by default, that requests early creation of
// local copies of byval function arguments.
static cl::opt<bool> EarlyByValArgsCopy(
    "nvptx-early-byval-copy", cl::Hidden, cl::init(false),
    cl::desc("Create a copy of byval function arguments early."));
89+
6790
namespace llvm {
6891

6992
void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
@@ -236,6 +259,8 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
236259
// Note: NVVMIntrRangePass was causing numerical discrepancies at one
237260
// point, if issues crop up, consider disabling.
238261
FPM.addPass(NVVMIntrRangePass());
262+
if (EarlyByValArgsCopy)
263+
FPM.addPass(NVPTXCopyByValArgsPass());
239264
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
240265
});
241266
}

0 commit comments

Comments
 (0)