diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index e63c7a61c6f26..d5dffb8998a04 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -12,8 +12,7 @@
 // http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
 //
 // Kernel parameters are read-only and accessible only via ld.param
-// instruction, directly or via a pointer. Pointers to kernel
-// arguments can't be converted to generic address space.
+// instruction, directly or via a pointer.
 //
 // Device function parameters are directly accessible via
 // ld.param/st.param, but taking the address of one returns a pointer
@@ -54,8 +53,10 @@
 // ...
 // }
 //
-// 2. Convert pointers in a byval kernel parameter to pointers in the global
-// address space. As #2, it allows NVPTX to emit more ld/st.global. E.g.,
+// 2. Convert byval kernel parameters to pointers in the param address space
+// (so that NVPTX emits ld.param). Convert pointers *within* a byval kernel
+// parameter to pointers in the global address space. This allows NVPTX to
+// emit ld/st.global.
 //
 // struct S {
 //   int *x;
@@ -68,22 +69,68 @@
 //
 // "b" points to the global address space. At the IR level,
 //
-// define void @foo({i32*, i32*}* byval %input) {
-//   %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
-//   %b = load i32*, i32** %b_ptr
+// define void @foo(ptr byval %input) {
+//   %b_ptr = getelementptr {ptr, ptr}, ptr %input, i64 0, i32 1
+//   %b = load ptr, ptr %b_ptr
 //   ; use %b
 // }
 //
 // becomes
 //
 // define void @foo(ptr byval %input) {
-//   %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
-//   %b = load i32*, i32** %b_ptr
-//   %b_global = addrspacecast i32* %b to i32 addrspace(1)*
-//   %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32*
+//   %b_param = addrspacecast ptr %input to ptr addrspace(101)
+//   %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
+//   %b = load ptr, ptr addrspace(101) %b_ptr
+//   %b_global = addrspacecast ptr %b to ptr addrspace(1)
+//   %b_generic = addrspacecast ptr addrspace(1) %b_global to ptr
+//   ; use %b_generic
 // }
 //
+// 3. Create a local copy of any kernel byval parameter that is used in a way
+// that *might* mutate it, by materializing the parameter in an alloca.
+// Mutations to "grid_constant" parameters are undefined behaviour, so those
+// parameters never need a local copy.
+//
+// define void @foo(ptr byval(%struct.s) align 4 %input) {
+//   store i32 42, ptr %input
+//   ret void
+// }
+//
+// becomes
+//
+// define void @foo(ptr byval(%struct.s) align 4 %input) #1 {
+//   %input1 = alloca %struct.s, align 4
+//   %input2 = addrspacecast ptr %input to ptr addrspace(101)
+//   %input3 = load %struct.s, ptr addrspace(101) %input2, align 4
+//   store %struct.s %input3, ptr %input1, align 4
+//   store i32 42, ptr %input1, align 4
+//   ret void
+// }
+//
+// If %input is passed to a device function or written to memory,
+// conservatively assume that it may be mutated, and create a local copy.
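The copy-in lowering sketched above boils down to a handful of IRBuilder calls: make an alloca, point all existing users at it, then initialize it with a load through the param address space. The standalone C++ sketch below illustrates that shape under stated assumptions: the helper name emitByValParamCopy is hypothetical, the literal address space 101 stands in for ADDRESS_SPACE_PARAM, and the real handleByValParam additionally propagates alignment metadata and the grid_constant handling described next, so this is an illustration rather than the pass's actual code.

    // Illustrative sketch only: copy a byval kernel argument into a local
    // alloca so that later writes through the pointer are well-defined.
    #include "llvm/IR/Argument.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static void emitByValParamCopy(Argument *Arg) { // hypothetical helper
      Function *F = Arg->getParent();
      Type *ByValTy = Arg->getParamByValType();
      IRBuilder<> IRB(&F->getEntryBlock(), F->getEntryBlock().begin());

      // The local copy that every existing user will see instead of the raw
      // parameter pointer.
      AllocaInst *Copy =
          IRB.CreateAlloca(ByValTy, nullptr, Arg->getName() + ".copy");
      Arg->replaceAllUsesWith(Copy);

      // Read the parameter bytes through the param address space (101) and
      // initialize the copy. The cast is created after the RAUW above so it
      // still refers to Arg itself rather than to the alloca.
      Value *ParamPtr = IRB.CreateAddrSpaceCast(
          Arg, IRB.getPtrTy(/*ADDRESS_SPACE_PARAM=*/101),
          Arg->getName() + ".param");
      Value *Init = IRB.CreateLoad(ByValTy, ParamPtr, Arg->getName() + ".val");
      IRB.CreateStore(Init, Copy);
    }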
+//
+// 4. Convert param-space pointers to grid_constant byval kernel parameters
+// into generic-address-space pointers when they are passed to calls (device
+// functions, intrinsics, inline asm) or otherwise "escape" (via stores or
+// ptrtoint), using the `nvvm.ptr.param.to.gen` intrinsic, so that NVPTX
+// emits cvta.param (available on sm_70+).
+//
+// define void @foo(ptr byval(%struct.s) %input) {
+//   ; %input is a grid_constant
+//   %call = call i32 @escape(ptr %input)
+//   ret void
+// }
+//
+// becomes
+//
+// define void @foo(ptr byval(%struct.s) %input) {
+//   %input1 = addrspacecast ptr %input to ptr addrspace(101)
+//   ; the following intrinsic converts the param pointer to generic. We do
+//   ; not use an addrspacecast, so that a generic -> param -> generic round
+//   ; trip cannot be folded away.
+//   %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
+//   %call = call i32 @escape(ptr %input1.gen)
+//   ret void
+// }
+//
 // TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
 // cancel the addrspacecast pair this pass emits.
 //===----------------------------------------------------------------------===//
@@ -166,19 +213,22 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
 // ones in parameter AS, so we can access them using ld.param.
 // =============================================================================

-// Replaces the \p OldUser instruction with the same in parameter AS.
-// Only Load and GEP are supported.
-static void convertToParamAS(Value *OldUser, Value *Param) {
-  Instruction *I = dyn_cast<Instruction>(OldUser);
-  assert(I && "OldUser must be an instruction");
+// For Loads, replaces the \p OldUse of the pointer with a Use of the same
+// pointer in parameter AS.
+// For "escapes" (to memory, a function call, or a ptrtoint), casts the OldUse
+// to generic using cvta.param.
+static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+  Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
+  assert(I && "OldUse must be in an instruction");
   struct IP {
+    Use *OldUse;
     Instruction *OldInstruction;
     Value *NewParam;
   };
-  SmallVector<IP> ItemsToConvert = {{I, Param}};
+  SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
   SmallVector<Instruction *> InstructionsToDelete;

-  auto CloneInstInParamAS = [](const IP &I) -> Value * {
+  auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
     if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
       LI->setOperand(0, I.NewParam);
       return LI;
@@ -202,6 +252,43 @@ static void convertToParamAS(Value *OldUser, Value *Param) {
       // Just pass through the argument, the old ASC is no longer needed.
       return I.NewParam;
     }
+
+    if (GridConstant) {
+      auto GetParamAddrCastToGeneric =
+          [](Value *Addr, Instruction *OriginalUser) -> Value * {
+        PointerType *ReturnTy =
+            PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_GENERIC);
+        Function *CvtToGen = Intrinsic::getDeclaration(
+            OriginalUser->getModule(), Intrinsic::nvvm_ptr_param_to_gen,
+            {ReturnTy, PointerType::get(OriginalUser->getContext(),
+                                        ADDRESS_SPACE_PARAM)});
+
+        // Cast the param address to the generic address space.
+        Value *CvtToGenCall =
+            CallInst::Create(CvtToGen, Addr, Addr->getName() + ".gen",
+                             OriginalUser->getIterator());
+        return CvtToGenCall;
+      };
+
+      if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+        I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
+        return CI;
+      }
+      if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+        // The byval address is being stored; cast it to generic.
+        if (SI->getValueOperand() == I.OldUse->get())
+          SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
+        return SI;
+      }
+      if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+        if (PI->getPointerOperand() == I.OldUse->get())
+          PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
+        return PI;
+      }
+      llvm_unreachable(
+          "Instruction unsupported even for grid_constant argument");
+    }
+
     llvm_unreachable("Unsupported instruction");
   };

@@ -213,8 +300,8 @@ static void convertToParamAS(Value *OldUser, Value *Param) {
     // We've created a new instruction. Queue users of the old instruction to
     // be converted and the instruction itself to be deleted. We can't delete
     // the old instruction yet, because it's still in use by a load somewhere.
-    for (Value *V : I.OldInstruction->users())
-      ItemsToConvert.push_back({cast<Instruction>(V), NewInst});
+    for (Use &U : I.OldInstruction->uses())
+      ItemsToConvert.push_back({&U, cast<Instruction>(U.getUser()), NewInst});
     InstructionsToDelete.push_back(I.OldInstruction);
   }

@@ -272,6 +359,7 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
   SmallVector<Load> Loads;
   std::queue<LoadContext> Worklist;
   Worklist.push({ArgInParamAS, 0});
+  bool IsGridConstant = isParamGridConstant(*Arg);

   while (!Worklist.empty()) {
     LoadContext Ctx = Worklist.front();
@@ -303,8 +391,14 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
       continue;
     }

+    // Calls, stores, and ptrtoints are supported for grid_constant arguments.
+    if (IsGridConstant &&
+        (isa<CallInst>(CurUser) || isa<StoreInst>(CurUser) ||
+         isa<PtrToIntInst>(CurUser)))
+      continue;
+
     llvm_unreachable("All users must be one of: load, "
-                     "bitcast, getelementptr.");
+                     "bitcast, getelementptr, call, store, ptrtoint");
   }
 }

@@ -317,14 +411,15 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
 void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
                                       Argument *Arg) {
+  bool IsGridConstant = isParamGridConstant(*Arg);
   Function *Func = Arg->getParent();
   BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
   Type *StructType = Arg->getParamByValType();
   assert(StructType && "Missing byval type");

-  auto IsALoadChain = [&](Value *Start) {
+  auto AreSupportedUsers = [&](Value *Start) {
     SmallVector<Value *, 16> ValuesToCheck = {Start};
-    auto IsALoadChainInstr = [](Value *V) -> bool {
+    auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
       if (isa<LoadInst>(V) || isa<GetElementPtrInst>(V) || isa<BitCastInst>(V))
         return true;
       // ASC to param space are OK, too -- we'll just strip them.
@@ -332,34 +427,43 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
       if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
         return true;
       }
+      // Calls, stores, and ptrtoints are supported for grid_constants;
+      // writes through these pointers are undefined behaviour.
+      if (IsGridConstant &&
+          (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
+        return true;
       return false;
     };

     while (!ValuesToCheck.empty()) {
       Value *V = ValuesToCheck.pop_back_val();
-      if (!IsALoadChainInstr(V)) {
+      if (!IsSupportedUse(V)) {
         LLVM_DEBUG(dbgs() << "Need a "
                           << (isParamGridConstant(*Arg) ? "cast " : "copy ")
                           << "of " << *Arg << " because of " << *V << "\n");
         (void)Arg;
         return false;
       }
-      if (!isa<LoadInst>(V))
+      if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
+          !isa<PtrToIntInst>(V))
         llvm::append_range(ValuesToCheck, V->users());
     }
     return true;
   };

-  if (llvm::all_of(Arg->users(), IsALoadChain)) {
+  if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
     // Convert all loads and intermediate operations to use parameter AS and
     // skip creation of a local copy of the argument.
-    SmallVector<Value *, 16> UsersToUpdate(Arg->users());
+    SmallVector<Use *, 16> UsesToUpdate;
+    for (Use &U : Arg->uses())
+      UsesToUpdate.push_back(&U);
+
     Value *ArgInParamAS = new AddrSpaceCastInst(
         Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
         FirstInst);
-    for (Value *V : UsersToUpdate)
-      convertToParamAS(V, ArgInParamAS);
-    LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n");
+    for (Use *U : UsesToUpdate)
+      convertToParamAS(U, ArgInParamAS, IsGridConstant);
+    LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");

     const auto *TLI =
         cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());
@@ -376,16 +480,11 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
   // temporary copy. When a pointer might have escaped, conservatively replace
   // all of its uses (which might include a device function call) with a cast
   // to the generic address space.
-  // TODO: only cast byval grid constant parameters at use points that need
-  // generic address (e.g., merging parameter pointers with other address
-  // space, or escaping to call-sites, inline-asm, memory), and use the
-  // parameter address space for normal loads.
   IRBuilder<> IRB(&Func->getEntryBlock().front());

   // Cast argument to param address space
-  auto *CastToParam =
-      cast<AddrSpaceCastInst>(IRB.CreateAddrSpaceCast(
-          Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param"));
+  auto *CastToParam = cast<AddrSpaceCastInst>(IRB.CreateAddrSpaceCast(
+      Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param"));

   // Cast param address to generic address space. We do not use an
   // addrspacecast to generic here, because LLVM considers `Arg` to be in the
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index e4b2ec868519c..80361744fd5b6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -210,7 +210,8 @@ bool isParamGridConstant(const Value &V) {
   if (const Argument *Arg = dyn_cast<Argument>(&V)) {
     // "grid_constant" counts argument indices starting from 1
     if (Arg->hasByValAttr() &&
-        argHasNVVMAnnotation(*Arg, "grid_constant", /*StartArgIndexAtOne*/true)) {
+        argHasNVVMAnnotation(*Arg, "grid_constant",
+                             /*StartArgIndexAtOne*/ true)) {
       assert(isKernelFunction(*Arg->getParent()) &&
              "only kernel arguments can be grid_constant");
       return true;
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 46f54e0e6f4d4..f6db9c429dba5 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT
 ; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX

@@ -67,22 +67,22 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 ; PTX: mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0;
 ; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2;
 ; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]];
-; PTX: cvta.param.{{.*}} [[RD4:%.*]], [[RD3]];
-; PTX: mov.u64 [[RD5:%.*]], [[RD1]];
-; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD5]];
+; PTX: mov.{{.*}} [[RD4:%.*]], [[RD1]];
+; PTX: cvta.param.{{.*}} [[RD5:%.*]], [[RD4]];
+; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD3]];
 ; PTX: {
-; PTX: st.param.b64 [param0+0], [[RD6]];
-; PTX: st.param.b64 [param2+0], [[RD4]];
+; PTX: st.param.b64 [param0+0], [[RD5]];
+; PTX: st.param.b64 [param2+0], [[RD6]];
 ;
 ; OPT-LABEL: define void @multiple_grid_const_escape(
 ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) {
-; OPT-NOT: alloca i32
 ; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
-; OPT-NOT: alloca [[STRUCT_S]]
 ; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NOT: alloca %struct.s
+; OPT: [[A_ADDR:%.*]] = alloca i32, align 4
 ; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, ptr [[B_PARAM_GEN]])
+; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
 ;
   %a.addr = alloca i32, align 4
   store i32 %a, ptr %a.addr, align 4
@@ -111,17 +111,19 @@ define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %

 define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
 ; PTX-LABEL: grid_const_inlineasm_escape(
 ; PTX-NOT: .local
-; PTX: cvta.param.u64 [[RD2:%.*]], {{.*}}
-; PTX:
add.{{.*}} [[RD3:%.*]], [[RD2]], 4; -; PTX: add.s64 [[RD1:%.*]], [[RD2]], [[RD3]]; +; PTX: add.{{.*}} [[RD2:%.*]], [[RD1:%.*]], 4; +; PTX: cvta.param.u64 [[RD4:%.*]], [[RD2]] +; PTX: cvta.param.u64 [[RD3:%.*]], [[RD1]] +; PTX: add.s64 [[RD5:%.*]], [[RD3]], [[RD4]]; ; ; OPT-LABEL: define void @grid_const_inlineasm_escape( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) { ; OPT-NOT: alloca [[STRUCT_S]] ; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT_PARAM_GEN]], i32 0, i32 0 -; OPT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT_PARAM_GEN]], i32 0, i32 1 +; OPT: [[TMPPTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 0 +; OPT: [[TMPPTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 1 +; OPT: [[TMPPTR22_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR22]]) +; OPT: [[TMPPTR13_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR13]]) ; OPT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 ; %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 @@ -131,10 +133,200 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt ret void } +define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) { +; PTX-LABEL: grid_const_partial_escape( +; PTX-NOT: .local +; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escape_param_0]; +; PTX: add.{{.*}} +; PTX: cvta.param.u64 [[RD3:%.*]], {{%.*}} +; PTX: st.param.{{.*}} [param0+0], [[RD3]] +; PTX: call +; +; OPT-LABEL: define void @grid_const_partial_escape( +; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]], ptr {{%.*}}) { +; OPT-NOT: alloca +; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT1]], align 4 +; OPT: [[TWICE:%.*]] = add i32 [[VAL]], [[VAL]] +; OPT: store i32 [[TWICE]] +; OPT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]]) +; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]]) +; OPT: ret void +; + %val = load i32, ptr %input + %twice = add i32 %val, %val + store i32 %twice, ptr %output + %call = call i32 @escape(ptr %input) + ret void +} + +define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) { +; PTX-LABEL: grid_const_partial_escapemem( +; PTX: { +; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escapemem_param_0]; +; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_partial_escapemem_param_0+4]; +; PTX: cvta.param.{{.*}} [[RD5:%.*]], {{%.*}}; +; PTX: st.global.{{.*}} [{{.*}}], [[RD5]]; +; PTX: add.s32 [[R3:%.*]], [[R1]], [[R2]] +; PTX: st.param.{{.*}} [param0+0], [[RD5]] +; PTX: escape +; OPT-LABEL: define i32 @grid_const_partial_escapemem( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr {{%.*}}) { +; OPT-NOT: alloca +; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT: [[PTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 0 +; OPT: [[VAL1:%.*]] = load i32, ptr addrspace(101) [[PTR13]], align 4 +; OPT: [[PTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 1 +; OPT: 
[[VAL2:%.*]] = load i32, ptr addrspace(101) [[PTR22]], align 4 +; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) +; OPT: store ptr [[INPUT1]] +; OPT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]] +; OPT: [[PTR1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[PTR13]]) +; OPT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]]) +; + %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 + %val1 = load i32, ptr %ptr1 + %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 + %val2 = load i32, ptr %ptr2 + store ptr %input, ptr %output + %add = add i32 %val1, %val2 + %call2 = call i32 @escape(ptr %ptr1) + ret i32 %add +} + +define void @grid_const_phi_escape(ptr byval(%struct.s) align 4 %input1, ptr %inout) { +; PTX-LABEL: grid_const_phi_escape( +; PTX: cvta.param.{{.*}} [[RD1:%.*]], {{.*}} +; PTX: @[[P1:%.*]] bra $L__BB[[TARGET_LABEL:[_0-9]+]]; +; PTX: $L__BB[[TARGET_LABEL]]: +; PTX: ld.{{.*}} [[R1:%.*]], [[[RD1]]]; +; +; OPT-LABEL: define void @grid_const_phi_escape( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr {{%.*}}) { +; OPT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT: br i1 {{.*}}, label %[[FIRST:.*]], label %[[SECOND:.*]] +; OPT: [[FIRST]]: +; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 +; OPT: br label %[[MERGE:.*]] +; OPT: [[SECOND]]: +; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1 +; OPT: br label %[[MERGE]] +; OPT: [[MERGE]]: +; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; OPT-NOT: load i32, ptr addrspace(101) +; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; + + %val = load i32, ptr %inout + %less = icmp slt i32 %val, 0 + br i1 %less, label %first, label %second +first: + %ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0 + br label %merge +second: + %ptr2 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 1 + br label %merge +merge: + %ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second] + %valloaded = load i32, ptr %ptrnew + store i32 %valloaded, ptr %inout + ret void +} + +; NOTE: %input2 is *not* grid_constant +define void @grid_const_phi_escape2(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) { +; PTX-LABEL: grid_const_phi_escape2( +; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_phi_escape2_param_1+4]; +; PTX: @[[P1:%.*]] bra $L__BB[[LABEL:[_0-9]+]]; +; PTX: cvta.param.u64 [[RD1:%.*]], [[RD2:%.*]]; +; PTX: ld.u32 [[R1]], [[[RD1]]]; +; PTX: $L__BB[[LABEL]]: +; PTX: st.global.u32 [[[RD3:%.*]]], [[R1]] +; OPT-LABEL: define void @grid_const_phi_escape2( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr {{%.*}}) { +; OPT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 +; OPT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; OPT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 +; OPT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 +; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]]) +; OPT: br i1 [[LESS:%.*]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; OPT: 
[[FIRST]]: +; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 +; OPT: br label %[[MERGE:.*]] +; OPT: [[SECOND]]: +; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 +; OPT: br label %[[MERGE]] +; OPT: [[MERGE]]: +; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; + %val = load i32, ptr %inout + %less = icmp slt i32 %val, 0 + br i1 %less, label %first, label %second +first: + %ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0 + br label %merge +second: + %ptr2 = getelementptr inbounds %struct.s, ptr %input2, i32 0, i32 1 + br label %merge +merge: + %ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second] + %valloaded = load i32, ptr %ptrnew + store i32 %valloaded, ptr %inout + ret void +} + +; NOTE: %input2 is *not* grid_constant +define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) { +; PTX-LABEL: grid_const_select_escape( +; PTX: cvta.param.{{.*}} [[RD2:%.*]], [[RD1:%.*]] +; PTX: setp.lt.{{.*}} [[P1:%.*]], {{%.*}}, 0 +; PTX: add.{{.*}} [[RD3:%.*]], %SP, 0; +; PTX: selp.{{.*}} [[RD4:%.*]], [[RD2]], [[RD3]], [[P1]]; +; PTX: ld.u32 {{%.*}}, [[[RD4]]]; +; OPT-LABEL: define void @grid_const_select_escape( +; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) { +; OPT: [[INPUT24:%.*]] = alloca i32, align 4 +; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]]) +; OPT: load i32, ptr [[INOUT]] +; OPT: [[PTRNEW:%.*]] = select i1 [[LESS:%.*]], ptr [[INPUT11]], ptr [[INPUT24]] +; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; + %val = load i32, ptr %inout + %less = icmp slt i32 %val, 0 + %ptrnew = select i1 %less, ptr %input1, ptr %input2 + %valloaded = load i32, ptr %ptrnew + store i32 %valloaded, ptr %inout + ret void +} + +define i32 @grid_const_ptrtoint(ptr byval(i32) %input) { +; PTX-LABEL: grid_const_ptrtoint( +; PTX-NOT: .local +; PTX: ld.param.{{.*}} {{%.*}}, [grid_const_ptrtoint_param_0]; +; PTX: cvta.param.u64 [[RD1:%.*]], {{%.*}} +; PTX: cvt.u32.u64 {{%.*}}, [[RD1]] +; OPT-LABEL: define i32 @grid_const_ptrtoint( +; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) { +; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT2]] +; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) +; OPT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32 + %val = load i32, ptr %input + %ptrval = ptrtoint ptr %input to i32 + %keepalive = add i32 %val, %ptrval + ret i32 %keepalive +} + + + +declare dso_local void @dummy() local_unnamed_addr declare dso_local ptr @escape(ptr) local_unnamed_addr declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr -!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11} +!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23} !0 = !{ptr @grid_const_int, !"kernel", i32 1, !"grid_constant", !1} !1 = !{i32 1} @@ -153,3 +345,21 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr !10 = !{ptr @grid_const_inlineasm_escape, !"kernel", i32 1, !"grid_constant", !11} !11 = !{i32 1} + +!12 = !{ptr @grid_const_partial_escape, !"kernel", i32 1, !"grid_constant", !13} +!13 = !{i32 1} + +!14 = !{ptr 
@grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15} +!15 = !{i32 1} + +!16 = !{ptr @grid_const_phi_escape, !"kernel", i32 1, !"grid_constant", !17} +!17 = !{i32 1} + +!18 = !{ptr @grid_const_phi_escape2, !"kernel", i32 1, !"grid_constant", !19} +!19 = !{i32 1} + +!20 = !{ptr @grid_const_select_escape, !"kernel", i32 1, !"grid_constant", !21} +!21 = !{i32 1} + +!22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23} +!23 = !{i32 1}
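Each !nvvm.annotations entry above pairs a kernel with (name, value) properties; the !"grid_constant" property's value is a metadata node listing the 1-based indices of the annotated byval arguments. The C++ sketch below shows one way such entries could be decoded with the LLVM metadata API. It is a simplified illustration under assumptions: the helper name collectGridConstantArgs is hypothetical, and this is not the argHasNVVMAnnotation/isParamGridConstant implementation that the pass actually uses.

    // Illustrative sketch: collect the 1-based "grid_constant" argument
    // indices recorded for function F in the !nvvm.annotations metadata.
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    static SmallVector<unsigned, 4> collectGridConstantArgs(const Module &M,
                                                            const Function &F) {
      SmallVector<unsigned, 4> Indices;
      const NamedMDNode *Annotations = M.getNamedMetadata("nvvm.annotations");
      if (!Annotations)
        return Indices;

      for (const MDNode *Entry : Annotations->operands()) {
        // Entry layout: !{ptr @kernel, !"kernel", i32 1, !"grid_constant", !N}
        if (Entry->getNumOperands() == 0 ||
            mdconst::dyn_extract_or_null<Function>(Entry->getOperand(0)) != &F)
          continue;
        // Properties come as (name, value) pairs after the function operand.
        for (unsigned I = 1; I + 1 < Entry->getNumOperands(); I += 2) {
          const auto *Name = dyn_cast<MDString>(Entry->getOperand(I));
          if (!Name || Name->getString() != "grid_constant")
            continue;
          if (const auto *List = dyn_cast<MDNode>(Entry->getOperand(I + 1)))
            for (const MDOperand &Op : List->operands())
              if (const auto *CI = mdconst::dyn_extract<ConstantInt>(Op))
                Indices.push_back(CI->getZExtValue());
        }
      }
      return Indices;
    }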