diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
index 197d01447b9a4..b3b41b0c48863 100644
--- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
@@ -375,20 +375,29 @@ using LocalsSet = SmallPtrSet;
 static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
                                         IRBuilder<> &Builder, bool Loc2Shadow) {
   Type *T = nullptr;
-  int LocAlignN = 0;
+  MaybeAlign LocAlign(0);
 
   if (const auto *AI = dyn_cast<AllocaInst>(L)) {
     T = AI->getAllocatedType();
-    LocAlignN = AI->getAlignment();
+    LocAlign = MaybeAlign(AI->getAlignment());
   } else {
-    T = cast<Argument>(L)->getParamByValType();
-    LocAlignN = cast<Argument>(L)->getParamAlignment();
+    if (cast<Argument>(L)->hasByValAttr()) {
+      T = cast<Argument>(L)->getParamByValType();
+      LocAlign = MaybeAlign(cast<Argument>(L)->getParamAlignment());
+    } else {
+      Type *Ty = cast<Argument>(L)->getType();
+      Module &M = *Shadow->getParent();
+      LocAlign = M.getDataLayout().getValueOrABITypeAlignment(
+          MaybeAlign(cast<Argument>(L)->getParamAlignment()), Ty);
+      auto PtrTy = dyn_cast<PointerType>(cast<Argument>(L)->getType());
+      assert(PtrTy && "Expected pointer type");
+      T = PtrTy->getElementType();
+    }
   }
 
   if (T->isAggregateType()) {
     // TODO: we should use methods which directly return MaybeAlign once such
     // are added to LLVM for AllocaInst and GlobalVariable
-    auto LocAlign = MaybeAlign(LocAlignN);
     auto ShdAlign = MaybeAlign(Shadow->getAlignment());
     Module &M = *Shadow->getParent();
     auto SizeVal = M.getDataLayout().getTypeStoreSize(T);
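Note on the new non-byval branch above: `getParamAlignment()` returns 0 when the argument carries no explicit `align` attribute, and `MaybeAlign(0)` encodes "unknown", so `getValueOrABITypeAlignment` falls back to the ABI alignment that the module's DataLayout records for the argument's type. A minimal standalone sketch of that fallback (the helper name is illustrative, not part of the patch):

    #include "llvm/IR/Argument.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Alignment used when copying through a pointer argument: the explicit
    // 'align N' attribute if present, otherwise the ABI alignment from the
    // module's DataLayout (MaybeAlign(0) means "no explicit alignment").
    static Align getArgCopyAlignment(const Argument &Arg, const Module &M) {
      return M.getDataLayout().getValueOrABITypeAlignment(
          MaybeAlign(Arg.getParamAlignment()), Arg.getType());
    }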
@@ -679,10 +688,25 @@ static void fixupPrivateMemoryPFWILambdaCaptures(CallInst *PFWICall) {
 // Go through "byval" parameters which are passed as AS(0) pointers
 // and: (1) create local shadows for them (2) and initialize them from the
 // leader's copy and (3) replace usages with pointer to the shadow
-static void shareByValParams(Function &F, const Triple &TT) {
-  // split
+//
+// Do the same for the 'this' pointer, which points to the PFWG lambda object
+// allocated in the caller. The caller is a kernel function generated by the
+// SYCL frontend: it allocates the PFWG lambda object and initializes the
+// captured objects (like accessors) from the kernel arguments. After
+// initialization the kernel calls the PFWG function (the operator() of the
+// PFWG object). The PFWG object captures everything by value, and all uses of
+// these values (except the initialization from kernel arguments) are confined
+// to the PFWG function, so no copy-back of the PFWG object is needed.
+static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
+  // Skip the alloca instructions and split after them. Allocas must stay at
+  // the beginning of the function; otherwise they are treated as dynamic
+  // allocas, which can cause problems with inlining.
   BasicBlock *EntryBB = &F.getEntryBlock();
-  BasicBlock *LeaderBB = EntryBB->splitBasicBlock(&EntryBB->front(), "leader");
+  Instruction *SplitPoint = &*EntryBB->begin();
+  for (; SplitPoint->getOpcode() == Instruction::Alloca;
+       SplitPoint = SplitPoint->getNextNode())
+    ;
+  BasicBlock *LeaderBB = EntryBB->splitBasicBlock(SplitPoint, "leader");
   BasicBlock *MergeBB = LeaderBB->splitBasicBlock(&LeaderBB->front(), "merge");
 
   // 1) rewire the above basic blocks so that LeaderBB is executed only for the
@@ -692,38 +716,48 @@ static void shareByValParams(Function &F, const Triple &TT) {
   Instruction &At = LeaderBB->back();
 
   for (auto &Arg : F.args()) {
-    if (!Arg.hasByValAttr())
-      continue;
-    assert(Arg.getType()->getPointerAddressSpace() ==
-           asUInt(spirv::AddrSpace::Private));
-    Type *T = Arg.getParamByValType();
-
-    // 2) create the shared copy - "shadow" - for current byval arg
-    GlobalVariable *Shadow =
-        spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
+    Type *T;
+    LLVMContext &Ctx = At.getContext();
+    IRBuilder<> Builder(Ctx);
+    Builder.SetInsertPoint(&LeaderBB->front());
 
-    // 3) replace argument with shadow in all uses
-    Value *RepVal = Shadow;
-    if (TT.isNVPTX()) {
-      // For NVPTX target address space inference for kernel arguments and
-      // allocas is happening in the backend (NVPTXLowerArgs and
-      // NVPTXLowerAlloca passes). After the frontend these pointers are in LLVM
-      // default address space 0 which is the generic address space for NVPTX
-      // target.
-      assert(Arg.getType()->getPointerAddressSpace() == 0);
-
-      // Cast a pointer in the shared address space to the generic address
-      // space.
+    // 2) create the shared copy - "shadow" - for current arg
+    GlobalVariable *Shadow;
+    Value *RepVal;
+    if (Arg.hasByValAttr()) {
+      assert(Arg.getType()->getPointerAddressSpace() ==
+             asUInt(spirv::AddrSpace::Private));
+      T = Arg.getParamByValType();
+      Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
+      RepVal = Shadow;
+      if (TT.isNVPTX()) {
+        // For the NVPTX target, address space inference for kernel arguments
+        // and allocas happens in the backend (the NVPTXLowerArgs and
+        // NVPTXLowerAlloca passes). After the frontend these pointers are in
+        // the LLVM default address space 0, which is the generic address
+        // space for NVPTX.
+        assert(Arg.getType()->getPointerAddressSpace() == 0);
+
+        // Cast a pointer in the shared address space to the generic address
+        // space.
+        RepVal = ConstantExpr::getPointerBitCastOrAddrSpaceCast(Shadow,
+                                                                Arg.getType());
+      }
+    }
+    // Process the 'this' pointer, which points to the PFWG lambda object
+    else if (Arg.getArgNo() == 0) {
+      PointerType *PtrT = dyn_cast<PointerType>(Arg.getType());
+      assert(PtrT && "Expected this pointer as the first argument");
+      T = PtrT->getElementType();
+      Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
       RepVal =
-          ConstantExpr::getPointerBitCastOrAddrSpaceCast(Shadow, Arg.getType());
+          Builder.CreatePointerBitCastOrAddrSpaceCast(Shadow, Arg.getType());
     }
+
+    // 3) replace argument with shadow in all uses
     for (auto *U : Arg.users())
       U->replaceUsesOfWith(&Arg, RepVal);
 
-    // 4) fill the shadow from the argument for the leader WI only
-    LLVMContext &Ctx = At.getContext();
-    IRBuilder<> Builder(Ctx);
-    Builder.SetInsertPoint(&LeaderBB->front());
     copyBetweenPrivateAndShadow(&Arg, Shadow, Builder,
                                 true /*private->shadow*/);
   }
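For orientation, the step (1) rewiring referred to above guards LeaderBB with a test of the local invocation index, producing the load/icmp/br sequence visible in the test's CHECK lines below. A sketch of such a guard via IRBuilder (a simplified illustration under assumed names, not code from this patch):

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Rewire EntryBB so LeaderBB runs only for the leader work-item: replace
    // the fall-through branch created by splitBasicBlock() with
    // "br (local_index == 0), LeaderBB, MergeBB".
    static void guardLeaderBB(BasicBlock *EntryBB, BasicBlock *LeaderBB,
                              BasicBlock *MergeBB, GlobalVariable *LocalIdx) {
      Instruction *OldBr = EntryBB->getTerminator();
      IRBuilder<> B(OldBr);
      Value *Lid = B.CreateLoad(B.getInt64Ty(), LocalIdx, "lid");
      Value *IsLeader = B.CreateICmpEQ(Lid, B.getInt64(0), "cmpz");
      B.CreateCondBr(IsLeader, LeaderBB, MergeBB);
      OldBr->eraseFromParent(); // drop the old unconditional branch
    }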
@@ -832,8 +866,9 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F, const llvm::Triple &TT,
   for (auto *PFWICall : PFWICalls)
     fixupPrivateMemoryPFWILambdaCaptures(PFWICall);
 
-  // Finally, create shadows for and replace usages of byval pointer params
-  shareByValParams(F, TT);
+  // Finally, create shadows for and replace usages of byval pointer params and
+  // PFWG lambda object ('this' pointer).
+  sharePFWGPrivateObjects(F, TT);
 
 #ifndef NDEBUG
   if (HaveChanges && Debug > 0)
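For context on what the PFWG ('parallel_for_work_group') object is, the SYCL source shape that exercises this lowering looks roughly like the following (an illustrative sketch; `Cgh`, `Acc`, and the kernel name are assumptions, not taken from the patch). The outer lambda is the PFWG object: it captures `Acc` by value, the frontend-generated kernel materializes and initializes it, and its operator() receives the 'this' pointer that the pass now shares through a work-group-local shadow:

    #include <CL/sycl.hpp>
    using namespace cl::sycl;

    // Hypothetical PFWG usage: the [=] lambda below is the PFWG object.
    void submitPFWG(handler &Cgh, accessor<int, 1, access::mode::write,
                                           access::target::global_buffer> Acc) {
      Cgh.parallel_for_work_group<class PFWGExample>(
          range<1>{8}, [=](group<1> G) {
            int WGVal = 42; // work-group scope: lowered into local memory
            G.parallel_for_work_item([&](h_item<1> It) {
              // work-item scope: reads the by-value capture via 'this'
              Acc[It.get_global().get_id(0)] = WGVal;
            });
          });
    }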
diff --git a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
index b73087847dcb4..09ba788316dee 100644
--- a/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
+++ b/llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
@@ -13,51 +13,55 @@
 %struct.foo = type { %struct.barney }
 %struct.foo.0 = type { i8 }
 
-; CHECK: @[[PFWG_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.bar addrspace(4)*
+; CHECK: @[[GROUP_SHADOW_PTR:.*]] = internal unnamed_addr addrspace(3) global %struct.zot addrspace(4)*
+; CHECK: @[[PFWG_SHADOW_PTR:.*]] = internal unnamed_addr addrspace(3) global %struct.bar addrspace(4)*
 ; CHECK: @[[PFWI_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.foo.0
+; CHECK: @[[PFWG_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.bar
 ; CHECK: @[[GROUP_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.zot
 
 define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.zot* byval(%struct.zot) align 8 %arg1) align 2 !work_group_scope !0 {
 ; CHECK-LABEL: @wibble(
 ; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_BAR:%.*]] addrspace(4)*, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
 ; CHECK-NEXT:    [[CMPZ3:%.*]] = icmp eq i64 [[TMP0]], 0
 ; CHECK-NEXT:    br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]]
 ; CHECK:       leader:
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.zot* [[ARG1:%.*]] to i8*
 ; CHECK-NEXT:    call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.zot addrspace(3)* @[[GROUP_SHADOW]] to i8 addrspace(3)*), i8* align 8 [[TMP1]], i64 96, i1 false)
+; CHECK-NEXT:    [[ARG_CAST:%.*]] = bitcast [[STRUCT_BAR]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.bar, [[STRUCT_BAR]] addrspace(3)* @[[PFWG_SHADOW]], i32 0, i32 0), i8 addrspace(4)* align 8 [[ARG_CAST]], i64 1, i1 false)
 ; CHECK-NEXT:    br label [[MERGE]]
 ; CHECK:       merge:
-; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
-; CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_BAR:%.*]] addrspace(4)*, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
-; CHECK-NEXT:    [[ID:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
-; CHECK-NEXT:    [[CMPZ:%.*]] = icmp eq i64 [[ID]], 0
+; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
+; CHECK-NEXT:    [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0
 ; CHECK-NEXT:    br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
 ; CHECK:       wg_leader:
-; CHECK-NEXT:    store [[STRUCT_BAR]] addrspace(4)* [[ARG:%.*]], [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
+; CHECK-NEXT:    store [[STRUCT_BAR]] addrspace(4)* addrspacecast (%struct.bar addrspace(3)* @[[PFWG_SHADOW]] to [[STRUCT_BAR]] addrspace(4)*), [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = addrspacecast [[STRUCT_ZOT:%.*]] addrspace(3)* @[[GROUP_SHADOW]] to [[STRUCT_ZOT]] addrspace(4)*
-; CHECK-NEXT:    store [[STRUCT_ZOT]] addrspace(4)* [[TMP4]], [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @wibbleWG_tmp4
+; CHECK-NEXT:    store [[STRUCT_ZOT]] addrspace(4)* [[TMP4]], [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @[[GROUP_SHADOW_PTR]]
 ; CHECK-NEXT:    br label [[WG_CF]]
 ; CHECK:       wg_cf:
-; CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
-; CHECK-NEXT:    [[CMPZ2:%.*]] = icmp eq i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
+; CHECK-NEXT:    [[CMPZ2:%.*]] = icmp eq i64 [[TMP4]], 0
 ; CHECK-NEXT:    br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]]
 ; CHECK:       TestMat:
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
-; CHECK-NEXT:    call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i8* align 1 [[TMP4]], i64 1, i1 false)
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i8* align 1 [[TMP5]], i64 1, i1 false)
 ; CHECK-NEXT:    [[MAT_LD:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]]
-; CHECK-NEXT:    store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD]], [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW]]
+; CHECK-NEXT:    store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD]], [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW_PTR]]
 ; CHECK-NEXT:    br label [[LEADERMAT]]
 ; CHECK:       LeaderMat:
-; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
-; CHECK-NEXT:    [[MAT_LD1:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW]]
+; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
+; CHECK-NEXT:    [[MAT_LD1:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW_PTR]]
 ; CHECK-NEXT:    store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD1]], [[STRUCT_BAR]] addrspace(4)** [[TMP]]
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 1 [[TMP5]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i64 1, i1 false)
-; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
-; CHECK-NEXT:    [[WG_VAL_TMP4:%.*]] = load [[STRUCT_ZOT]] addrspace(4)*, [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @wibbleWG_tmp4
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 1 [[TMP6]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i64 1, i1 false)
+; CHECK-NEXT:    call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
+; CHECK-NEXT:    [[WG_VAL_TMP4:%.*]] = load [[STRUCT_ZOT]] addrspace(4)*, [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @[[GROUP_SHADOW_PTR]]
 ; CHECK-NEXT:    call spir_func void @bar(%struct.zot addrspace(4)* [[WG_VAL_TMP4]], %struct.foo.0* byval(%struct.foo.0) align 1 [[TMP2]])
 ; CHECK-NEXT:    ret void
 ;