25 changes: 24 additions & 1 deletion llvm/lib/SYCLLowerIR/LowerWGScope.cpp
@@ -214,6 +214,22 @@ static bool hasCallToAFuncWithWGMetadata(Function &F) {
return false;
}

// Recursively searches for a call to a function with parallel_for_work_item
// metadata inside F.
static bool hasCallToAFuncWithPFWIMetadata(Function &F) {
  for (auto &BB : F)
    for (auto &I : BB) {
      if (isCallToAFuncMarkedWithMD(&I, PFWI_MD))
        return true;
      const CallInst *Call = dyn_cast<CallInst>(&I);
      // getCalledFunction() already returns a Function*, so no extra cast is
      // needed; Callee avoids shadowing the parameter F.
      Function *Callee = Call ? Call->getCalledFunction() : nullptr;
      if (Callee && hasCallToAFuncWithPFWIMetadata(*Callee))
        return true;
    }
  return false;
}
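
A side note on the recursion: the search walks the static call graph with no visited set, so mutually recursive device functions would recurse without bound. A minimal cycle-safe sketch, reusing PFWI_MD and isCallToAFuncMarkedWithMD from this file and assuming llvm/ADT/SmallPtrSet.h is included; this wrapper is hypothetical and not part of the patch:

// Hypothetical cycle-safe variant: track functions already being examined so
// that mutually recursive call graphs cannot cause unbounded recursion.
static bool hasCallToAFuncWithPFWIMetadataSafe(
    Function &F, SmallPtrSetImpl<const Function *> &Visited) {
  if (!Visited.insert(&F).second)
    return false; // Already on the current search path; stop here.
  for (auto &BB : F)
    for (auto &I : BB) {
      if (isCallToAFuncMarkedWithMD(&I, PFWI_MD))
        return true;
      const auto *Call = dyn_cast<CallInst>(&I);
      Function *Callee = Call ? Call->getCalledFunction() : nullptr;
      if (Callee && hasCallToAFuncWithPFWIMetadataSafe(*Callee, Visited))
        return true;
    }
  return false;
}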

// Checks if this is a call to parallel_for_work_item.
static bool isPFWICall(const Instruction *I) {
return isCallToAFuncMarkedWithMD(I, PFWI_MD);
@@ -835,7 +851,14 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
}
continue;
}
    // In addition to an instruction not having side effects, we end the
    // range if the instruction is a call that contains, possibly several
    // layers down the call stack, a call to parallel_for_work_item. Such
    // calls should not be subject to the lowering, since they must be
    // executed by every work item.
    const CallInst *Call = dyn_cast<CallInst>(I);
    if (!mayHaveSideEffects(I) ||
        (Call && Call->getCalledFunction() &&
         hasCallToAFuncWithPFWIMetadata(*Call->getCalledFunction())))
      continue;
LLVM_DEBUG(llvm::dbgs() << "+++ Side effects: " << *I << "\n");
if (!First)
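For intuition on why such calls must stay outside leader-only ranges: parallel_for_work_item lowers to code containing work-group barriers (see the ControlBarrier calls in the test below), and a barrier reached by only the leader deadlocks the group. A hedged SYCL-source sketch, with a hypothetical helper name:

#include <sycl/sycl.hpp>

// Because parallel_for_work_item lowers to code containing work-group
// barriers, every work item must reach this call; wrapping it in a
// leader-only guard would hang the group at the first barrier.
void helper(sycl::group<1> g) {
  g.parallel_for_work_item([&](sycl::h_item<1>) { /* per-item work */ });
}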
54 changes: 54 additions & 0 deletions llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
@@ -5,6 +5,8 @@
; are properly handled by LowerWGScope pass. Check that WG-shared local "shadow" variables are created
; and before each PFWI invocation leader WI stores its private copy of the variable into the shadow,
; then all WIs load the shadow value into their private copies ("materialize" the private copy).
; Also check that an indirect call to a function marked with parallel_for_work_item
; metadata, i.e. a call that reaches the marked function through an intermediate
; function, is treated the same as a direct call.
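
The shadow/materialize scheme those CHECK lines verify, as a hedged pseudo-C++ sketch. All names are hypothetical, and the pass additionally emits a barrier before the leader branch, as the CHECK lines show:

// Leader publishes its private copy into a work-group-local "shadow", then
// every work item loads ("materializes") the shadow into its private copy.
template <typename T>
void materializeViaShadow(size_t LocalLinearId, T &Priv, T &Shadow,
                          void (*GroupBarrier)()) {
  if (LocalLinearId == 0) // leader work item only
    Shadow = Priv;        // store leader's private copy into the shadow
  GroupBarrier();         // make the store visible to the whole work group
  Priv = Shadow;          // all work items materialize their private copies
  GroupBarrier();         // keep the group in lockstep before proceeding
}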

%struct.bar = type { i8 }
%struct.zot = type { %struct.widget, %struct.widget, %struct.widget, %struct.foo }
@@ -54,6 +56,7 @@ define internal spir_func void @wibble(ptr addrspace(4) %arg, ptr byval(%struct.
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr [[ARG1]] to ptr addrspace(4)
; CHECK-NEXT: call spir_func void @bar(ptr addrspace(4) [[TMP9]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]])
; CHECK-NEXT: call spir_func void @foo(ptr addrspace(4) [[TMP9]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]])
; CHECK-NEXT: ret void
;
bb:
@@ -62,6 +65,57 @@ bb:
store ptr addrspace(4) %arg, ptr %0, align 8
%2 = addrspacecast ptr %arg1 to ptr addrspace(4)
call spir_func void @bar(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1)
call spir_func void @foo(ptr addrspace(4) %2, ptr byval(%struct.foo.0) align 1 %1)
ret void
}

define internal spir_func void @foo(ptr addrspace(4) %arg, ptr byval(%struct.foo.0) align 1 %arg1) align 2 !work_group_scope !0 {
; CHECK: bb:
; CHECK-NEXT: [[TMP0:%.*]] = alloca ptr addrspace(4), align 8
; CHECK-NEXT: [[TMP1:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP2]], 0
; CHECK-NEXT: br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]]
; CHECK: leader:
; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 8 @ArgShadow.4, ptr align 1 [[ARG1:%.*]], i64 1, i1 false)
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 1 [[ARG1]], ptr addrspace(3) align 8 @ArgShadow.4, i64 1, i1 false)
; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP3]], 0
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
; CHECK: wg_leader:
; CHECK-NEXT: store ptr addrspace(4) [[ARG:%.*]], ptr [[TMP0]], align 8
; CHECK-NEXT: br label [[WG_CF]]
; CHECK: wg_cf:
; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP4]], 0
; CHECK-NEXT: br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]]
; CHECK: TestMat:
; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 8 @WGCopy.3, ptr align 1 [[TMP1]], i64 1, i1 false)
; CHECK-NEXT: [[MAT_LD:%.*]] = load ptr addrspace(4), ptr [[TMP0]], align 8
; CHECK-NEXT: store ptr addrspace(4) [[MAT_LD]], ptr addrspace(3) @WGCopy.2, align 8
; CHECK-NEXT: br label [[LEADERMAT]]
; CHECK: LeaderMat:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[MAT_LD1:%.*]] = load ptr addrspace(4), ptr addrspace(3) @WGCopy.2, align 8
; CHECK-NEXT: store ptr addrspace(4) [[MAT_LD1]], ptr [[TMP0]], align 8
; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 1 [[TMP1]], ptr addrspace(3) align 8 @WGCopy.3, i64 1, i1 false)
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[ARG1]] to ptr addrspace(4)
; CHECK-NEXT: call spir_func void @bar(ptr addrspace(4) [[TMP5]], ptr byval([[STRUCT_FOO_0]]) align 1 [[TMP1]])
; CHECK-NEXT: ret void
;
bb:
%1 = alloca ptr addrspace(4), align 8
%2 = alloca %struct.foo.0, align 1
store ptr addrspace(4) %arg, ptr %1, align 8
%3 = addrspacecast ptr %arg1 to ptr addrspace(4)
call spir_func void @bar(ptr addrspace(4) %3, ptr byval(%struct.foo.0) align 1 %2)
ret void
}

24 changes: 24 additions & 0 deletions sycl/test-e2e/HierPar/hier_par_indirect.cpp
@@ -19,12 +19,36 @@ void __attribute__((noinline)) foo(sycl::group<1> work_group) {
work_group.parallel_for_work_item([&](sycl::h_item<1> index) {});
}

void __attribute__((noinline)) bar(sycl::group<1> work_group) {
foo(work_group);
}

int main(int argc, char **argv) {
sycl::queue q;

// Try a single indirect call, two indirect calls of different depths, and an
// indirect call accompanied by direct parallel_for_work_item calls, all in
// the same work_group scope.
q.submit([&](sycl::handler &cgh) {
cgh.parallel_for_work_group(sycl::range<1>{1}, sycl::range<1>{128},
([=](sycl::group<1> wGroup) { foo(wGroup); }));
}).wait();
q.submit([&](sycl::handler &cgh) {
cgh.parallel_for_work_group(
sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) {
foo(wGroup); // 1-layer indirect call
bar(wGroup); // 2-layer indirect call since bar calls foo
}));
}).wait();
q.submit([&](sycl::handler &cgh) {
cgh.parallel_for_work_group(
sycl::range<1>{1}, sycl::range<1>{128}, ([=](sycl::group<1> wGroup) {
wGroup.parallel_for_work_item([&](sycl::h_item<1> index) {});
foo(wGroup);
wGroup.parallel_for_work_item([&](sycl::h_item<1> index) {});
}));
}).wait();

std::cout << "test passed" << std::endl;
return 0;
}