diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 93895048faaf3..d09e1da457249 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1909,6 +1909,22 @@ def int_nvvm_ptr_param_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable, IntrNoCallback], "llvm.nvvm.ptr.param.to.gen">; +// Represents an explicit hole in the LLVM IR type system. It may be inserted by +// the compiler in cases where a pointer is of the wrong type. In the backend +// this intrinsic will be folded away and not equate to any instruction. It +// should not be used by any frontend and should only be considered well defined +// when added in the following cases: +// +// - NVPTXLowerArgs: When wrapping a byval pointer argument to a kernel +// function to convert the address space from generic (0) to param (101). +// This accounts for the fact that the parameter symbols will occupy this +// space when lowered during ISel. +// +def int_nvvm_internal_addrspace_wrap : + DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty], + [IntrNoMem, IntrSpeculatable, NoUndef<ArgIndex<0>>, + NoUndef<RetIndex>]>; + // Move intrinsics, used in nvvm internally def int_nvvm_move_i16 : Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem], diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ec1f969494cd1..486c7c815435a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -985,6 +985,9 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { case ADDRESS_SPACE_LOCAL: Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local; break; + case ADDRESS_SPACE_PARAM: + Opc = TM.is64Bit() ? 
NVPTX::cvta_param_64 : NVPTX::cvta_param; + break; } ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src)); return; @@ -1008,7 +1011,7 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local; break; case ADDRESS_SPACE_PARAM: - Opc = TM.is64Bit() ? NVPTX::IMOV64r : NVPTX::IMOV32r; + Opc = TM.is64Bit() ? NVPTX::cvta_to_param_64 : NVPTX::cvta_to_param; break; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 277a34173e7b8..49f4f30096f00 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1014,6 +1014,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v32i32, MVT::v64i32, MVT::v128i32}, Custom); + + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -1426,6 +1428,17 @@ static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, return MachinePointerInfo(ADDRESS_SPACE_LOCAL); } + + // Peel off an addrspacecast to generic and load directly from the specific + // address space. + if (Ptr->getOpcode() == ISD::ADDRSPACECAST) { + const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr); + if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) { + Ptr = ASC->getOperand(0); + return MachinePointerInfo(ASC->getSrcAddressSpace()); + } + } + return MachinePointerInfo(); } @@ -2746,6 +2759,15 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { return Op; } +static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { + switch (Op->getConstantOperandVal(0)) { + default: + return Op; + case Intrinsic::nvvm_internal_addrspace_wrap: + return Op.getOperand(1); + } +} + // In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value. 
// Lower these into a node returning the correct type which is zero-extended back to the correct size. @@ -2889,6 +2911,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalAddress(Op, DAG); case ISD::INTRINSIC_W_CHAIN: return Op; + case ISD::INTRINSIC_WO_CHAIN: + return lowerIntrinsicWOChain(Op, DAG); case ISD::INTRINSIC_VOID: return LowerIntrinsicVoid(Op, DAG); case ISD::BUILD_VECTOR: diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 9634ad8eb8da2..4ba3e6f06bb5f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2395,18 +2395,10 @@ multiclass G_TO_NG<string Str> { "cvta.to." # Str # ".u64 \t$result, $src;", []>; } -defm cvta_local : NG_TO_G<"local">; -defm cvta_shared : NG_TO_G<"shared">; -defm cvta_global : NG_TO_G<"global">; -defm cvta_const : NG_TO_G<"const">; - -defm cvta_to_local : G_TO_NG<"local">; -defm cvta_to_shared : G_TO_NG<"shared">; -defm cvta_to_global : G_TO_NG<"global">; -defm cvta_to_const : G_TO_NG<"const">; - -// nvvm.ptr.param.to.gen -defm cvta_param : NG_TO_G<"param">; +foreach space = ["local", "shared", "global", "const", "param"] in { + defm cvta_#space : NG_TO_G<space>; + defm cvta_to_#space : G_TO_NG<space>; +} def : Pat<(int_nvvm_ptr_param_to_gen i32:$src), (cvta_param $src)>; diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 6452fa05947dd..befb18fad8910 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -265,18 +265,9 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam, if (HasCvtaParam) { auto GetParamAddrCastToGeneric = [](Value *Addr, Instruction *OriginalUser) -> Value * { - PointerType *ReturnTy = - PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_GENERIC); - Function *CvtToGen = Intrinsic::getOrInsertDeclaration( - OriginalUser->getModule(), 
Intrinsic::nvvm_ptr_param_to_gen, - {ReturnTy, PointerType::get(OriginalUser->getContext(), - ADDRESS_SPACE_PARAM)}); - - // Cast param address to generic address space - Value *CvtToGenCall = - CallInst::Create(CvtToGen, Addr, Addr->getName() + ".gen", - OriginalUser->getIterator()); - return CvtToGenCall; + IRBuilder<> IRB(OriginalUser); + Type *GenTy = IRB.getPtrTy(ADDRESS_SPACE_GENERIC); + return IRB.CreateAddrSpaceCast(Addr, GenTy, Addr->getName() + ".gen"); }; auto *ParamInGenericAS = GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction); @@ -515,23 +506,24 @@ void copyByValParam(Function &F, Argument &Arg) { BasicBlock::iterator FirstInst = F.getEntryBlock().begin(); Type *StructType = Arg.getParamByValType(); const DataLayout &DL = F.getDataLayout(); - AllocaInst *AllocA = new AllocaInst(StructType, DL.getAllocaAddrSpace(), - Arg.getName(), FirstInst); + IRBuilder<> IRB(&*FirstInst); + AllocaInst *AllocA = IRB.CreateAlloca(StructType, nullptr, Arg.getName()); // Set the alignment to alignment of the byval parameter. This is because, // later load/stores assume that alignment, and we are going to replace // the use of the byval parameter with this alloca instruction. - AllocA->setAlignment(F.getParamAlign(Arg.getArgNo()) - .value_or(DL.getPrefTypeAlign(StructType))); + AllocA->setAlignment( + Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType))); Arg.replaceAllUsesWith(AllocA); - Value *ArgInParam = new AddrSpaceCastInst( - &Arg, PointerType::get(Arg.getContext(), ADDRESS_SPACE_PARAM), - Arg.getName(), FirstInst); + Value *ArgInParam = + IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap, + {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()}, + &Arg, {}, Arg.getName()); + // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX // addrspacecast preserves alignment. Since params are constant, this load // is definitely not volatile. 
const auto ArgSize = *AllocA->getAllocationSize(DL); - IRBuilder<> IRB(&*FirstInst); IRB.CreateMemCpy(AllocA, AllocA->getAlign(), ArgInParam, AllocA->getAlign(), ArgSize); } @@ -539,9 +531,9 @@ void copyByValParam(Function &F, Argument &Arg) { static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) { Function *Func = Arg->getParent(); - bool HasCvtaParam = - TM.getSubtargetImpl(*Func)->hasCvtaParam() && isKernelFunction(*Func); - bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg); + assert(isKernelFunction(*Func)); + const bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam(); + const bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg); const DataLayout &DL = Func->getDataLayout(); BasicBlock::iterator FirstInst = Func->getEntryBlock().begin(); Type *StructType = Arg->getParamByValType(); @@ -558,9 +550,11 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) { for (Use &U : Arg->uses()) UsesToUpdate.push_back(&U); - Value *ArgInParamAS = new AddrSpaceCastInst( - Arg, PointerType::get(StructType->getContext(), ADDRESS_SPACE_PARAM), - Arg->getName(), FirstInst); + IRBuilder<> IRB(&*FirstInst); + Value *ArgInParamAS = IRB.CreateIntrinsic( + Intrinsic::nvvm_internal_addrspace_wrap, + {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, {Arg}); + for (Use *U : UsesToUpdate) convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant); LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n"); @@ -578,30 +572,31 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) { // However, we're still not allowed to write to it. If the user specified // `__grid_constant__` for the argument, we'll consider escaped pointer as // read-only. 
- if (HasCvtaParam && (ArgUseIsReadOnly || IsGridConstant)) { + if (IsGridConstant || (HasCvtaParam && ArgUseIsReadOnly)) { LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n"); // Replace all argument pointer uses (which might include a device function // call) with a cast to the generic address space using cvta.param // instruction, which avoids a local copy. IRBuilder<> IRB(&Func->getEntryBlock().front()); - // Cast argument to param address space - auto *CastToParam = cast(IRB.CreateAddrSpaceCast( - Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param")); + // Cast argument to param address space. Because the backend will emit the + // argument already in the param address space, we need to use the noop + // intrinsic, this had the added benefit of preventing other optimizations + // from folding away this pair of addrspacecasts. + auto *ParamSpaceArg = + IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap, + {IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, + Arg, {}, Arg->getName() + ".param"); - // Cast param address to generic address space. We do not use an - // addrspacecast to generic here, because, LLVM considers `Arg` to be in the - // generic address space, and a `generic -> param` cast followed by a `param - // -> generic` cast will be folded away. The `param -> generic` intrinsic - // will be correctly lowered to `cvta.param`. - Value *CvtToGenCall = IRB.CreateIntrinsic( - IRB.getPtrTy(ADDRESS_SPACE_GENERIC), Intrinsic::nvvm_ptr_param_to_gen, - CastToParam, nullptr, CastToParam->getName() + ".gen"); + // Cast param address to generic address space. 
+ Value *GenericArg = IRB.CreateAddrSpaceCast( + ParamSpaceArg, IRB.getPtrTy(ADDRESS_SPACE_GENERIC), + Arg->getName() + ".gen"); - Arg->replaceAllUsesWith(CvtToGenCall); + Arg->replaceAllUsesWith(GenericArg); // Do not replace Arg in the cast to param space - CastToParam->setOperand(0, Arg); + ParamSpaceArg->setOperand(0, Arg); } else copyByValParam(*Func, *Arg); } @@ -715,12 +710,14 @@ static bool copyFunctionByValArgs(Function &F) { LLVM_DEBUG(dbgs() << "Creating a copy of byval args of " << F.getName() << "\n"); bool Changed = false; - for (Argument &Arg : F.args()) - if (Arg.getType()->isPointerTy() && Arg.hasByValAttr() && - !(isParamGridConstant(Arg) && isKernelFunction(F))) { - copyByValParam(F, Arg); - Changed = true; - } + if (isKernelFunction(F)) { + for (Argument &Arg : F.args()) + if (Arg.getType()->isPointerTy() && Arg.hasByValAttr() && + !isParamGridConstant(Arg)) { + copyByValParam(F, Arg); + Changed = true; + } + } return Changed; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index b800445a3b19c..61b50b69b4e86 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -16,11 +16,13 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/IR/Argument.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/ModRef.h" #include "llvm/Support/Mutex.h" #include #include @@ -228,17 +230,30 @@ static std::optional getVectorProduct(ArrayRef V) { return std::accumulate(V.begin(), V.end(), 1, std::multiplies{}); } -bool isParamGridConstant(const Value &V) { - if (const Argument *Arg = dyn_cast(&V)) { - // "grid_constant" counts argument indices starting from 1 - if (Arg->hasByValAttr() && - argHasNVVMAnnotation(*Arg, "grid_constant", - /*StartArgIndexAtOne*/ true)) { - 
assert(isKernelFunction(*Arg->getParent()) && - "only kernel arguments can be grid_constant"); +bool isParamGridConstant(const Argument &Arg) { + assert(isKernelFunction(*Arg.getParent()) && + "only kernel arguments can be grid_constant"); + + if (!Arg.hasByValAttr()) + return false; + + // Lowering an argument as a grid_constant violates the byval semantics (and + // the C++ API) by reusing the same memory location for the argument across + // multiple threads. If an argument doesn't read memory and its address is not + // captured (its address is not compared with any value), then the tweak of + // the C++ API and byval semantics is unobservable by the program and we can + // lower the arg as a grid_constant. + if (Arg.onlyReadsMemory()) { + const auto CI = Arg.getAttributes().getCaptureInfo(); + if (!capturesAddress(CI) && !capturesFullProvenance(CI)) return true; - } } + + // "grid_constant" counts argument indices starting from 1 + if (argHasNVVMAnnotation(Arg, "grid_constant", + /*StartArgIndexAtOne*/ true)) + return true; + return false; } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 2288241ec0178..70bf02035fd48 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -63,7 +63,7 @@ inline bool isKernelFunction(const Function &F) { return F.getCallingConv() == CallingConv::PTX_Kernel; } -bool isParamGridConstant(const Value &); +bool isParamGridConstant(const Argument &); inline MaybeAlign getAlign(const Function &F, unsigned Index) { return F.getAttributes().getAttributes(Index).getStackAlignment(); diff --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll index 76300e3cfdc5b..33c6dbddd5297 100644 --- a/llvm/test/CodeGen/NVPTX/bug21465.ll +++ b/llvm/test/CodeGen/NVPTX/bug21465.ll @@ -12,7 +12,7 @@ define ptx_kernel void @_Z11TakesStruct1SPi(ptr byval(%struct.S) nocapture reado entry: ; CHECK-LABEL: @_Z11TakesStruct1SPi ; PTX-LABEL: 
.visible .entry _Z11TakesStruct1SPi( -; CHECK: addrspacecast ptr %input to ptr addrspace(101) +; CHECK: call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr %input) %b = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1 %0 = load i32, ptr %b, align 4 ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}] diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll index 6d9710e6d2272..80ae8aac39115 100644 --- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll +++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll @@ -65,7 +65,7 @@ define void @test_ld_param_byval(ptr byval(i32) %a) { ; CHECK-LABEL: test_ld_param_byval( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_ld_param_byval_param_0]; diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index 836a7d78a0cc5..dd172cf685380 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -72,7 +72,7 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_int( ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT11:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 ; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]] ; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4 @@ -101,7 +101,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void 
@grid_const_struct( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 ; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 ; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 @@ -122,16 +122,15 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { ; PTX-LABEL: grid_const_escape( ; PTX: { ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-NEXT: .reg .b64 %rd<4>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd2, grid_const_escape_param_0; -; PTX-NEXT: mov.b64 %rd3, %rd2; -; PTX-NEXT: cvta.param.u64 %rd4, %rd3; +; PTX-NEXT: cvta.param.u64 %rd3, %rd2; ; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: { // callseq 0, 0 ; PTX-NEXT: .param .b64 param0; -; PTX-NEXT: st.param.b64 [param0], %rd4; +; PTX-NEXT: st.param.b64 [param0], %rd3; ; PTX-NEXT: .param .b32 retval0; ; PTX-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _); ; PTX-NEXT: call (retval0), @@ -145,8 +144,8 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_escape( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) +; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to 
ptr ; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]]) ; OPT-NEXT: ret void %call = call i32 @escape(ptr %input) @@ -160,29 +159,27 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<4>; -; PTX-NEXT: .reg .b64 %rd<10>; +; PTX-NEXT: .reg .b64 %rd<8>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %SPL, __local_depot4; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_0; -; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2; -; PTX-NEXT: mov.b64 %rd4, %rd3; ; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1]; -; PTX-NEXT: cvta.param.u64 %rd5, %rd4; -; PTX-NEXT: mov.b64 %rd6, %rd2; -; PTX-NEXT: cvta.param.u64 %rd7, %rd6; -; PTX-NEXT: add.u64 %rd8, %SP, 0; -; PTX-NEXT: add.u64 %rd9, %SPL, 0; -; PTX-NEXT: st.local.u32 [%rd9], %r1; +; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2; +; PTX-NEXT: cvta.param.u64 %rd4, %rd3; +; PTX-NEXT: cvta.param.u64 %rd5, %rd2; +; PTX-NEXT: add.u64 %rd6, %SP, 0; +; PTX-NEXT: add.u64 %rd7, %SPL, 0; +; PTX-NEXT: st.local.u32 [%rd7], %r1; ; PTX-NEXT: mov.b64 %rd1, escape3; ; PTX-NEXT: { // callseq 1, 0 ; PTX-NEXT: .param .b64 param0; -; PTX-NEXT: st.param.b64 [param0], %rd7; +; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: .param .b64 param1; -; PTX-NEXT: st.param.b64 [param1], %rd8; +; PTX-NEXT: st.param.b64 [param1], %rd6; ; PTX-NEXT: .param .b64 param2; -; PTX-NEXT: st.param.b64 [param2], %rd5; +; PTX-NEXT: st.param.b64 [param2], %rd4; ; PTX-NEXT: .param .b32 retval0; ; PTX-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _); ; PTX-NEXT: call (retval0), @@ -198,10 +195,10 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape( ; OPT-SAME: ptr 
byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; OPT-NEXT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]]) -; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]]) +; OPT-NEXT: [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr +; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) +; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr ; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 ; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 ; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]]) @@ -215,20 +212,19 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) { ; PTX-LABEL: grid_const_memory_escape( ; PTX: { -; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0; ; PTX-NEXT: ld.param.u64 %rd2, [grid_const_memory_escape_param_1]; ; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; -; PTX-NEXT: mov.b64 %rd4, %rd1; -; PTX-NEXT: cvta.param.u64 %rd5, %rd4; -; PTX-NEXT: st.global.u64 [%rd3], %rd5; +; PTX-NEXT: cvta.param.u64 %rd4, %rd1; +; PTX-NEXT: st.global.u64 [%rd3], %rd4; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] { -; OPT-NEXT: 
[[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) +; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8 ; OPT-NEXT: ret void store ptr %input, ptr %addr, align 8 @@ -238,14 +234,13 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) { ; PTX-LABEL: grid_const_inlineasm_escape( ; PTX: { -; PTX-NEXT: .reg .b64 %rd<8>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0; ; PTX-NEXT: ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1]; ; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5; -; PTX-NEXT: mov.b64 %rd7, %rd4; -; PTX-NEXT: cvta.param.u64 %rd2, %rd7; +; PTX-NEXT: cvta.param.u64 %rd2, %rd4; ; PTX-NEXT: add.s64 %rd3, %rd2, 4; ; PTX-NEXT: // begin inline asm ; PTX-NEXT: add.s64 %rd1, %rd2, %rd3; @@ -255,8 +250,8 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 ; PTX-NOT .local ; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) +; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 ; 
OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 ; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 @@ -273,21 +268,20 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou ; PTX-LABEL: grid_const_partial_escape( ; PTX: { ; PTX-NEXT: .reg .b32 %r<5>; -; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-NEXT: .reg .b64 %rd<6>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd2, grid_const_partial_escape_param_0; ; PTX-NEXT: ld.param.u64 %rd3, [grid_const_partial_escape_param_1]; ; PTX-NEXT: cvta.to.global.u64 %rd4, %rd3; -; PTX-NEXT: mov.b64 %rd5, %rd2; -; PTX-NEXT: cvta.param.u64 %rd6, %rd5; -; PTX-NEXT: ld.u32 %r1, [%rd6]; +; PTX-NEXT: cvta.param.u64 %rd5, %rd2; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_partial_escape_param_0]; ; PTX-NEXT: add.s32 %r2, %r1, %r1; ; PTX-NEXT: st.global.u32 [%rd4], %r2; ; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: { // callseq 2, 0 ; PTX-NEXT: .param .b64 param0; -; PTX-NEXT: st.param.b64 [param0], %rd6; +; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: .param .b32 retval0; ; PTX-NEXT: prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _); ; PTX-NEXT: call (retval0), @@ -301,8 +295,8 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape( ; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]]) +; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) +; OPT-NEXT: [[INPUT1_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4 ; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]] ; OPT-NEXT: store i32 
[[TWICE]], ptr [[OUTPUT]], align 4 @@ -319,22 +313,21 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ; PTX-LABEL: grid_const_partial_escapemem( ; PTX: { ; PTX-NEXT: .reg .b32 %r<6>; -; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-NEXT: .reg .b64 %rd<6>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd2, grid_const_partial_escapemem_param_0; ; PTX-NEXT: ld.param.u64 %rd3, [grid_const_partial_escapemem_param_1]; ; PTX-NEXT: cvta.to.global.u64 %rd4, %rd3; -; PTX-NEXT: mov.b64 %rd5, %rd2; -; PTX-NEXT: cvta.param.u64 %rd6, %rd5; -; PTX-NEXT: ld.u32 %r1, [%rd6]; -; PTX-NEXT: ld.u32 %r2, [%rd6+4]; -; PTX-NEXT: st.global.u64 [%rd4], %rd6; +; PTX-NEXT: cvta.param.u64 %rd5, %rd2; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_partial_escapemem_param_0]; +; PTX-NEXT: ld.param.u32 %r2, [grid_const_partial_escapemem_param_0+4]; +; PTX-NEXT: st.global.u64 [%rd4], %rd5; ; PTX-NEXT: add.s32 %r3, %r1, %r2; ; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: { // callseq 3, 0 ; PTX-NEXT: .param .b64 param0; -; PTX-NEXT: st.param.b64 [param0], %rd6; +; PTX-NEXT: st.param.b64 [param0], %rd5; ; PTX-NEXT: .param .b32 retval0; ; PTX-NEXT: prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _); ; PTX-NEXT: call (retval0), @@ -349,8 +342,8 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) +; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) +; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 ; 
OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4 ; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 @@ -374,27 +367,25 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr ; PTX: { ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<9>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: mov.b64 %rd5, grid_const_phi_param_0; -; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_param_1]; -; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6; -; PTX-NEXT: mov.b64 %rd7, %rd5; -; PTX-NEXT: cvta.param.u64 %rd8, %rd7; +; PTX-NEXT: mov.b64 %rd6, grid_const_phi_param_0; +; PTX-NEXT: ld.param.u64 %rd5, [grid_const_phi_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd5; ; PTX-NEXT: ld.global.u32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; ; PTX-NEXT: @%p1 bra $L__BB9_2; ; PTX-NEXT: // %bb.1: // %second -; PTX-NEXT: add.s64 %rd8, %rd8, 4; +; PTX-NEXT: add.s64 %rd6, %rd6, 4; ; PTX-NEXT: $L__BB9_2: // %merge -; PTX-NEXT: ld.u32 %r2, [%rd8]; +; PTX-NEXT: ld.param.u32 %r2, [%rd6]; ; PTX-NEXT: st.global.u32 [%rd1], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 ; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] @@ -432,32 +423,28 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) 
align 4 %input1, ; PTX: { ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<12>; +; PTX-NEXT: .reg .b64 %rd<8>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: mov.b64 %rd6, grid_const_phi_ngc_param_0; -; PTX-NEXT: ld.param.u64 %rd7, [grid_const_phi_ngc_param_2]; -; PTX-NEXT: cvta.to.global.u64 %rd1, %rd7; -; PTX-NEXT: mov.b64 %rd10, %rd6; -; PTX-NEXT: cvta.param.u64 %rd11, %rd10; +; PTX-NEXT: mov.b64 %rd7, grid_const_phi_ngc_param_0; +; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_ngc_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6; ; PTX-NEXT: ld.global.u32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; ; PTX-NEXT: @%p1 bra $L__BB10_2; ; PTX-NEXT: // %bb.1: // %second -; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1; -; PTX-NEXT: mov.b64 %rd9, %rd8; -; PTX-NEXT: cvta.param.u64 %rd2, %rd9; -; PTX-NEXT: add.s64 %rd11, %rd2, 4; +; PTX-NEXT: mov.b64 %rd2, grid_const_phi_ngc_param_1; +; PTX-NEXT: add.s64 %rd7, %rd2, 4; ; PTX-NEXT: $L__BB10_2: // %merge -; PTX-NEXT: ld.u32 %r2, [%rd11]; +; PTX-NEXT: ld.param.u32 %r2, [%rd7]; ; PTX-NEXT: st.global.u32 [%rd1], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) -; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) +; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr +; OPT-NEXT: [[TMP2:%.*]] = call ptr 
addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr ; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 ; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] @@ -494,29 +481,25 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by ; PTX: { ; PTX-NEXT: .reg .pred %p<2>; ; PTX-NEXT: .reg .b32 %r<3>; -; PTX-NEXT: .reg .b64 %rd<10>; +; PTX-NEXT: .reg .b64 %rd<6>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0; ; PTX-NEXT: ld.param.u64 %rd2, [grid_const_select_param_2]; ; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; ; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1; -; PTX-NEXT: mov.b64 %rd5, %rd4; -; PTX-NEXT: cvta.param.u64 %rd6, %rd5; -; PTX-NEXT: mov.b64 %rd7, %rd1; -; PTX-NEXT: cvta.param.u64 %rd8, %rd7; ; PTX-NEXT: ld.global.u32 %r1, [%rd3]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; -; PTX-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1; -; PTX-NEXT: ld.u32 %r2, [%rd9]; +; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1; +; PTX-NEXT: ld.param.u32 %r2, [%rd5]; ; PTX-NEXT: st.global.u32 [%rd3], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_select( ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) -; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) +; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = 
addrspacecast ptr addrspace(101) [[TMP1]] to ptr +; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr ; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 ; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] @@ -535,22 +518,21 @@ define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) { ; PTX-LABEL: grid_const_ptrtoint( ; PTX: { ; PTX-NEXT: .reg .b32 %r<4>; -; PTX-NEXT: .reg .b64 %rd<4>; +; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0; -; PTX-NEXT: mov.b64 %rd2, %rd1; ; PTX-NEXT: ld.param.u32 %r1, [grid_const_ptrtoint_param_0]; -; PTX-NEXT: cvta.param.u64 %rd3, %rd2; -; PTX-NEXT: cvt.u32.u64 %r2, %rd3; +; PTX-NEXT: cvta.param.u64 %rd2, %rd1; +; PTX-NEXT: cvt.u32.u64 %r2, %rd2; ; PTX-NEXT: add.s32 %r3, %r1, %r2; ; PTX-NEXT: st.param.b32 [func_retval0], %r3; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint( ; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) ; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4 -; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) +; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[INPUT2]] to ptr ; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32 ; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]] ; OPT-NEXT: ret i32 [[KEEPALIVE]] @@ -560,13 +542,42 @@ define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) { ret i32 %keepalive } +declare void @device_func(ptr 
byval(i32) align 4) + +define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { +; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg( +; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INPUT_PARAM:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) +; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr +; OPT-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]]) +; OPT-NEXT: ret void +; +; PTX-LABEL: test_forward_byval_arg( +; PTX: { +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u32 %r1, [test_forward_byval_arg_param_0]; +; PTX-NEXT: { // callseq 4, 0 +; PTX-NEXT: .param .align 4 .b8 param0[4]; +; PTX-NEXT: st.param.b32 [param0], %r1; +; PTX-NEXT: call.uni +; PTX-NEXT: device_func, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: } // callseq 4 +; PTX-NEXT: ret; + call void @device_func(ptr byval(i32) align 4 %input) + ret void +} declare dso_local void @dummy() local_unnamed_addr declare dso_local ptr @escape(ptr) local_unnamed_addr declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr -!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23} +!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24} !0 = !{ptr @grid_const_int, !"grid_constant", !1} !1 = !{i32 1} @@ -604,4 +615,6 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr !22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23} !23 = !{i32 1} +!24 = !{ptr @test_forward_byval_arg, !"grid_constant", !25} +!25 = !{i32 1} diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index 8fa7d5c3e0cbc..8e879871e295b 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ 
b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -210,7 +210,7 @@ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) { define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) { ; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr( ; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) { -; IRC-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; IRC-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; IRC-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8 ; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr ; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1) @@ -220,7 +220,7 @@ define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%st ; ; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr( ; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) { -; IRO-NEXT: [[S1:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; IRO-NEXT: [[S1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; IRO-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8 ; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr ; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4 diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 2d8684c7cab48..1304ffe42c7b5 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -32,7 +32,7 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only( ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; 
LOWER-ARGS-NEXT: [[ENTRY:.*:]] -; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 ; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 ; LOWER-ARGS-NEXT: ret void @@ -40,10 +40,7 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out ; COPY-LABEL: define dso_local ptx_kernel void @read_only( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[I:%.*]] = load i32, ptr [[S1]], align 4 +; COPY-NEXT: [[I:%.*]] = load i32, ptr [[S]], align 4 ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void ; @@ -69,7 +66,7 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep( ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; LOWER-ARGS-NEXT: [[ENTRY:.*:]] -; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 ; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 ; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 @@ -78,10 +75,7 @@ 
define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly ; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S]], i64 4 ; COPY-NEXT: [[I:%.*]] = load i32, ptr [[B]], align 4 ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void @@ -104,122 +98,51 @@ entry: ret void } -; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( -; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; LOWER-ARGS-NEXT: [[ENTRY:.*:]] -; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 -; LOWER-ARGS-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( -; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 
captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 -; COPY-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; COPY-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[ASC]], align 4 -; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 -; COPY-NEXT: ret void -; -; PTX-LABEL: read_only_gep_asc( -; PTX: { -; PTX-NEXT: .reg .b32 %r<2>; -; PTX-NEXT: .reg .b64 %rd<3>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc_param_0]; -; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc_param_1+4]; -; PTX-NEXT: st.global.u32 [%rd2], %r1; -; PTX-NEXT: ret; -entry: - %b = getelementptr inbounds nuw i8, ptr %s, i64 4 - %asc = addrspacecast ptr %b to ptr addrspace(101) - %i = load i32, ptr addrspace(101) %asc, align 4 - store i32 %i, ptr %out, align 4 - ret void -} +;; TODO: This test has been disabled because the addrspacecast is not legal on +;; sm_60, and not well supported within nvptx-lower-args. We should determine +;; in what cases it is safe to make assumptions about the address of a byval +;; parameter and improve our handling of addrspacecast in nvptx-lower-args. 
+ +; define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; entry: +; %b = getelementptr inbounds nuw i8, ptr %s, i64 4 +; %asc = addrspacecast ptr %b to ptr addrspace(101) +; %i = load i32, ptr addrspace(101) %asc, align 4 +; store i32 %i, ptr %out, align 4 +; ret void +; } +; +; define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; entry: +; %b = getelementptr inbounds nuw i8, ptr %s, i64 4 +; %asc = addrspacecast ptr %b to ptr addrspace(101) +; %asc0 = addrspacecast ptr addrspace(101) %asc to ptr +; %i = load i32, ptr %asc0, align 4 +; store i32 %i, ptr %out, align 4 +; ret void +; } ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; COMMON-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0( -; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr( +; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) 
@llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 -; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr -; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 -; COMMON-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]] ; COMMON-NEXT: ret void ; -; PTX-LABEL: read_only_gep_asc0( -; PTX: { -; PTX-NEXT: .reg .b32 %r<2>; -; PTX-NEXT: .reg .b64 %rd<3>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc0_param_0]; -; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc0_param_1+4]; -; PTX-NEXT: st.global.u32 [%rd2], %r1; -; PTX-NEXT: ret; -entry: - %b = getelementptr inbounds nuw i8, ptr %s, i64 4 - %asc = addrspacecast ptr %b to ptr addrspace(101) - %asc0 = addrspacecast ptr addrspace(101) %asc to ptr - %i = load i32, ptr %asc0, align 4 - store i32 %i, ptr %out, align 4 - ret void -} - -; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr( -; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, 
i1 false) -; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]] -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr( -; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]] -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr( -; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]] -; COPY-NEXT: ret void -; ; PTX-LABEL: escape_ptr( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot4[8]; +; PTX-NEXT: .local .align 4 .b8 __local_depot2[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: mov.b64 %SPL, __local_depot4; +; PTX-NEXT: mov.b64 %SPL, __local_depot2; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: add.u64 %rd1, %SP, 0; ; PTX-NEXT: add.u64 %rd2, %SPL, 0; @@ -244,46 +167,26 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @escape_ptr_gep(ptr 
nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( -; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 -; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]] -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( -; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 -; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]] -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( -; COPY-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw 
i8, ptr [[S1]], i64 4 -; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]] -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( +; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]] +; COMMON-NEXT: ret void ; ; PTX-LABEL: escape_ptr_gep( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot5[8]; +; PTX-NEXT: .local .align 4 .b8 __local_depot3[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; ; PTX-NEXT: .reg .b64 %rd<4>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: mov.b64 %SPL, __local_depot5; +; PTX-NEXT: mov.b64 %SPL, __local_depot3; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: add.u64 %rd1, %SP, 0; ; PTX-NEXT: add.u64 %rd2, %SPL, 0; @@ -314,21 +217,21 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) ; COMMON-NEXT: store 
ptr [[S1]], ptr [[OUT]], align 8 ; COMMON-NEXT: ret void ; ; PTX-LABEL: escape_ptr_store( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot6[8]; +; PTX-NEXT: .local .align 4 .b8 __local_depot4[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; ; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: mov.b64 %SPL, __local_depot6; +; PTX-NEXT: mov.b64 %SPL, __local_depot4; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_store_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; @@ -351,7 +254,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) ; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 ; COMMON-NEXT: store ptr [[B]], ptr [[OUT]], align 8 @@ -359,14 +262,14 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri ; ; PTX-LABEL: escape_ptr_gep_store( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot7[8]; +; PTX-NEXT: .local .align 4 .b8 __local_depot5[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; ; PTX-NEXT: .reg .b64 %rd<6>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: mov.b64 %SPL, __local_depot7; +; PTX-NEXT: mov.b64 %SPL, __local_depot5; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_gep_store_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, 
%rd1; @@ -391,7 +294,7 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl ; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) ; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64 ; COMMON-NEXT: store i64 [[I]], ptr [[OUT]], align 8 @@ -399,14 +302,14 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl ; ; PTX-LABEL: escape_ptrtoint( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot8[8]; +; PTX-NEXT: .local .align 4 .b8 __local_depot6[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; ; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: mov.b64 %SPL, __local_depot8; +; PTX-NEXT: mov.b64 %SPL, __local_depot6; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.u64 %rd1, [escape_ptrtoint_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; @@ -429,17 +332,14 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param( ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; LOWER-ARGS-NEXT: [[ENTRY:.*:]] -; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; LOWER-ARGS-NEXT: call void 
@llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true) ; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true) +; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S]], i64 16, i1 true) ; COPY-NEXT: ret void ; ; PTX-LABEL: memcpy_from_param( @@ -492,17 +392,14 @@ define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture nound ; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( ; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; LOWER-ARGS-NEXT: [[ENTRY:.*:]] -; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true) ; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 8 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) 
-; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[S1]], ptr addrspace(101) align 8 [[S2]], i64 8, i1 false) -; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true) +; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S]], i64 16, i1 true) ; COPY-NEXT: ret void ; ; PTX-LABEL: memcpy_from_param_noalign( @@ -551,26 +448,26 @@ entry: } ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { ; COMMON-LABEL: define dso_local ptx_kernel void @memcpy_to_param( -; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; COMMON-NEXT: [[ENTRY:.*:]] ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) ; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true) ; COMMON-NEXT: ret void ; ; PTX-LABEL: memcpy_to_param( ; PTX: { -; PTX-NEXT: .local .align 8 .b8 __local_depot11[8]; +; PTX-NEXT: .local .align 8 .b8 __local_depot9[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<3>; ; PTX-NEXT: .reg 
.b64 %rd<48>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: mov.b64 %SPL, __local_depot11; +; PTX-NEXT: mov.b64 %SPL, __local_depot9; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.u64 %rd1, [memcpy_to_param_param_0]; ; PTX-NEXT: add.u64 %rd3, %SPL, 0; @@ -636,7 +533,7 @@ define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly % ; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) ; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4 ; COMMON-NEXT: store i32 [[I]], ptr [[S1]], align 4 @@ -659,10 +556,10 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { ; SM_60-NEXT: [[BB:.*:]] ; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 -; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false) ; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 -; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; SM_60-NEXT: call void 
@llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) ; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 @@ -672,10 +569,10 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; SM_70-LABEL: define ptx_kernel void @test_select( ; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { ; SM_70-NEXT: [[BB:.*:]] -; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) -; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) +; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr +; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) +; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] ; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 ; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 @@ -685,10 +582,10 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { ; COPY-NEXT: [[BB:.*:]] ; COPY-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 -; COPY-NEXT: [[INPUT24:%.*]] = 
addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) ; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 -; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) ; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] ; COPY-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 @@ -719,7 +616,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; PTX_70-NEXT: .reg .pred %p<2>; ; PTX_70-NEXT: .reg .b16 %rs<3>; ; PTX_70-NEXT: .reg .b32 %r<2>; -; PTX_70-NEXT: .reg .b64 %rd<10>; +; PTX_70-NEXT: .reg .b64 %rd<6>; ; PTX_70-EMPTY: ; PTX_70-NEXT: // %bb.0: // %bb ; PTX_70-NEXT: ld.param.u8 %rs1, [test_select_param_3]; @@ -729,12 +626,8 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; PTX_70-NEXT: ld.param.u64 %rd2, [test_select_param_2]; ; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2; ; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1; -; PTX_70-NEXT: mov.b64 %rd5, %rd4; -; PTX_70-NEXT: cvta.param.u64 %rd6, %rd5; -; PTX_70-NEXT: mov.b64 %rd7, %rd1; -; PTX_70-NEXT: cvta.param.u64 %rd8, %rd7; -; PTX_70-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1; -; PTX_70-NEXT: ld.u32 %r1, [%rd9]; +; PTX_70-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1; +; PTX_70-NEXT: ld.param.u32 %r1, [%rd5]; ; PTX_70-NEXT: st.global.u32 [%rd3], %r1; ; PTX_70-NEXT: ret; bb: @@ -749,10 +642,10 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by ; COMMON-SAME: ptr byval(i32) align 4 
[[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 -; COMMON-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) ; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 -; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) ; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] ; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 @@ -760,7 +653,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by ; ; PTX-LABEL: test_select_write( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot14[8]; +; PTX-NEXT: .local .align 4 .b8 __local_depot12[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .pred %p<2>; @@ -769,7 +662,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by ; PTX-NEXT: .reg .b64 %rd<6>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %bb -; PTX-NEXT: mov.b64 %SPL, __local_depot14; +; PTX-NEXT: mov.b64 %SPL, __local_depot12; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.u8 %rs1, [test_select_write_param_3]; ; PTX-NEXT: and.b16 %rs2, %rs1, 1; @@ -795,10 +688,10 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 
[[COND:%.*]]) #[[ATTR3]] { ; SM_60-NEXT: [[BB:.*:]] ; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 -; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false) ; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false) ; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; SM_60: [[FIRST]]: @@ -816,10 +709,10 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; SM_70-LABEL: define ptx_kernel void @test_phi( ; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { ; SM_70-NEXT: [[BB:.*:]] -; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) -; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) +; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr +; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr 
[[INPUT1]]) +; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr ; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; SM_70: [[FIRST]]: ; SM_70-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 @@ -837,10 +730,10 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; COPY-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { ; COPY-NEXT: [[BB:.*:]] ; COPY-NEXT: [[INPUT23:%.*]] = alloca [[STRUCT_S]], align 8 -; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT23]], ptr addrspace(101) align 8 [[INPUT24]], i64 8, i1 false) ; COPY-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false) ; COPY-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; COPY: [[FIRST]]: @@ -869,10 +762,10 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; PTX_60-NEXT: ld.param.u64 %rd2, [test_phi_param_2]; ; PTX_60-NEXT: cvta.to.global.u64 %rd1, %rd2; ; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_0]; -; PTX_60-NEXT: @%p1 bra $L__BB15_2; +; PTX_60-NEXT: @%p1 bra $L__BB13_2; ; PTX_60-NEXT: // %bb.1: // %second ; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_1+4]; -; PTX_60-NEXT: $L__BB15_2: // %merge +; PTX_60-NEXT: $L__BB13_2: // %merge ; PTX_60-NEXT: st.global.u32 [%rd1], %r4; ; 
PTX_60-NEXT: ret; ; @@ -881,25 +774,21 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; PTX_70-NEXT: .reg .pred %p<2>; ; PTX_70-NEXT: .reg .b16 %rs<3>; ; PTX_70-NEXT: .reg .b32 %r<2>; -; PTX_70-NEXT: .reg .b64 %rd<12>; +; PTX_70-NEXT: .reg .b64 %rd<8>; ; PTX_70-EMPTY: ; PTX_70-NEXT: // %bb.0: // %bb ; PTX_70-NEXT: ld.param.u8 %rs1, [test_phi_param_3]; ; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; ; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX_70-NEXT: mov.b64 %rd6, test_phi_param_0; -; PTX_70-NEXT: ld.param.u64 %rd7, [test_phi_param_2]; -; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd7; -; PTX_70-NEXT: mov.b64 %rd10, %rd6; -; PTX_70-NEXT: cvta.param.u64 %rd11, %rd10; -; PTX_70-NEXT: @%p1 bra $L__BB15_2; +; PTX_70-NEXT: mov.b64 %rd7, test_phi_param_0; +; PTX_70-NEXT: ld.param.u64 %rd6, [test_phi_param_2]; +; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd6; +; PTX_70-NEXT: @%p1 bra $L__BB13_2; ; PTX_70-NEXT: // %bb.1: // %second -; PTX_70-NEXT: mov.b64 %rd8, test_phi_param_1; -; PTX_70-NEXT: mov.b64 %rd9, %rd8; -; PTX_70-NEXT: cvta.param.u64 %rd2, %rd9; -; PTX_70-NEXT: add.s64 %rd11, %rd2, 4; -; PTX_70-NEXT: $L__BB15_2: // %merge -; PTX_70-NEXT: ld.u32 %r1, [%rd11]; +; PTX_70-NEXT: mov.b64 %rd2, test_phi_param_1; +; PTX_70-NEXT: add.s64 %rd7, %rd2, 4; +; PTX_70-NEXT: $L__BB13_2: // %merge +; PTX_70-NEXT: ld.param.u32 %r1, [%rd7]; ; PTX_70-NEXT: st.global.u32 [%rd1], %r1; ; PTX_70-NEXT: ret; bb: @@ -925,10 +814,10 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 -; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]]) ; COMMON-NEXT: call void 
@llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false) ; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 -; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]]) ; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false) ; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; COMMON: [[FIRST]]: @@ -944,7 +833,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; ; PTX-LABEL: test_phi_write( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot16[8]; +; PTX-NEXT: .local .align 4 .b8 __local_depot14[8]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .pred %p<2>; @@ -953,7 +842,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %bb -; PTX-NEXT: mov.b64 %SPL, __local_depot16; +; PTX-NEXT: mov.b64 %SPL, __local_depot14; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: ld.param.u8 %rs1, [test_phi_write_param_2]; ; PTX-NEXT: and.b16 %rs2, %rs1, 1; @@ -964,10 +853,10 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; PTX-NEXT: add.u64 %rd6, %SPL, 4; ; PTX-NEXT: ld.param.u32 %r2, [test_phi_write_param_0]; ; PTX-NEXT: st.u32 [%SP+4], %r2; -; PTX-NEXT: @%p1 bra $L__BB16_2; +; PTX-NEXT: @%p1 bra $L__BB14_2; ; PTX-NEXT: // %bb.1: // %second ; PTX-NEXT: mov.b64 %rd6, %rd1; -; PTX-NEXT: $L__BB16_2: // %merge +; PTX-NEXT: $L__BB14_2: // %merge ; PTX-NEXT: mov.b32 %r3, 1; ; PTX-NEXT: st.local.u32 [%rd6], %r3; ; PTX-NEXT: ret; @@ -988,6 +877,69 @@ merge: ; preds = %second, %first ret void } +define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { +; COMMON-LABEL: 
define ptx_kernel void @test_forward_byval_arg( +; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] { +; COMMON-NEXT: [[INPUT1:%.*]] = alloca i32, align 4 +; COMMON-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]]) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false) +; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]]) +; COMMON-NEXT: ret void +; +; PTX-LABEL: test_forward_byval_arg( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot15[4]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %SPL, __local_depot15; +; PTX-NEXT: add.u64 %rd2, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [test_forward_byval_arg_param_0]; +; PTX-NEXT: st.local.u32 [%rd2], %r1; +; PTX-NEXT: { // callseq 2, 0 +; PTX-NEXT: .param .align 4 .b8 param0[4]; +; PTX-NEXT: st.param.b32 [param0], %r1; +; PTX-NEXT: call.uni +; PTX-NEXT: device_func, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: } // callseq 2 +; PTX-NEXT: ret; + call void @device_func(ptr byval(i32) align 4 %input) + ret void +} + +define void @device_func(ptr byval(i32) align 4 %input) { +; COMMON-LABEL: define void @device_func( +; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] { +; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT]]) +; COMMON-NEXT: ret void +; +; PTX-LABEL: device_func( +; PTX: { +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<2>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u32 %r1, [device_func_param_0]; +; PTX-NEXT: { // callseq 3, 0 +; PTX-NEXT: .param .align 4 .b8 param0[4]; +; PTX-NEXT: st.param.b32 [param0], %r1; +; PTX-NEXT: call.uni +; PTX-NEXT: device_func, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: } // callseq 3 +; 
PTX-NEXT: ret; + call void @device_func(ptr byval(i32) align 4 %input) + ret void +} + attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" } attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected index e470569bfae19..ad0b11ed6a806 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected @@ -6,7 +6,7 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in, ptr nocapture noundef writeonly %ret) { ; CHECK-LABEL: caller_St8x4( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-NEXT: .reg .b64 %rd<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: @@ -27,11 +27,11 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0]; ; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: ld.param.u32 %r3, [caller_St8x4_param_1]; -; CHECK-NEXT: st.u64 [%r3], %rd5; -; CHECK-NEXT: st.u64 [%r3+8], %rd6; -; CHECK-NEXT: st.u64 [%r3+16], %rd7; -; CHECK-NEXT: st.u64 [%r3+24], %rd8; +; CHECK-NEXT: ld.param.u32 %r2, [caller_St8x4_param_1]; +; CHECK-NEXT: st.u64 [%r2], %rd5; +; CHECK-NEXT: st.u64 [%r2+8], %rd6; +; CHECK-NEXT: st.u64 [%r2+16], %rd7; +; CHECK-NEXT: st.u64 [%r2+24], %rd8; ; CHECK-NEXT: ret; %call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2 
%.fca.0.extract = extractvalue [4 x i64] %call, 0