Skip to content

[SYCL] Handle address space casts in the LowerWGScope #3120

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Feb 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions llvm/lib/SYCLLowerIR/LowerWGScope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,8 @@ static bool mayHaveSideEffects(const Instruction *I) {
case Instruction::Call:
assert(!isPFWICall(I) && "pfwi must have been handled separately");
return true;
case Instruction::AddrSpaceCast:
return false;
default:
return true;
}
Expand Down Expand Up @@ -630,6 +632,10 @@ static void fixupPrivateMemoryPFWILambdaCaptures(CallInst *PFWICall) {
// whether it is an alloca with "work_item_scope"
SmallVector<CaptureDesc, 4> PrivMemCaptures;

// Look through cast
if (auto *Cast = dyn_cast<AddrSpaceCastInst>(LambdaObj))
LambdaObj = Cast->getOperand(0);

for (auto *U : LambdaObj->users()) {
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U);

Expand Down Expand Up @@ -773,19 +779,21 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F, const llvm::Triple &TT,
Instruction *First = nullptr;
Instruction *Last = nullptr;

// Skip PHIs and allocas, as they don't have side effects and must never be
// guarded with the WG leader test. Note that there should be no allocas in
// local address space at this point - they must have been converted to
// globals.
// Skip PHIs, allocas and addrspacecasts associated with allocas, as they
// don't have side effects and must never be guarded with the WG leader
// test. Note that there should be no allocas in local address space at this
// point - they must have been converted to globals.
Instruction *I = BB.getFirstNonPHI();

for (; I->getOpcode() == Instruction::Alloca; I = I->getNextNode()) {
for (; I->getOpcode() == Instruction::Alloca ||
I->getOpcode() == Instruction::AddrSpaceCast;
I = I->getNextNode()) {
auto *AllocaI = dyn_cast<AllocaInst>(I);
// Allocas marked with "work_item_scope" are those originating from
// cl::sycl::private_memory<T> variables, which must be in private memory.
// No shadows/materialization is needed for them because they can be
// updated only within PFWIs
if (!AllocaI->getMetadata(WI_SCOPE_MD))
if (AllocaI && !AllocaI->getMetadata(WI_SCOPE_MD))
Allocas.insert(AllocaI);
}
for (; I && (I != BB.getTerminator()); I = I->getNextNode()) {
Expand Down
87 changes: 87 additions & 0 deletions llvm/test/SYCLLowerIR/addrspacecast_handling.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -LowerWGScope -S | FileCheck %s

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a comment what this test checks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added comment.

; Test checks that LowerWGScope pass can deal with addrpacecasts
; associated with allocas and look through addrspacecasts when handling
; private_memory<T> captures.

%struct.ham = type { i64, i64, i32, i32 }
%struct.bar = type { i64 }
%struct.spam = type { i64, i64, i64, i64, i32 }

; CHECK: @[[SHADOW4:.*]] = internal unnamed_addr addrspace(3) global %struct.ham addrspace(4)*
; CHECK: @[[SHADOW3:.*]] = internal unnamed_addr addrspace(3) global %struct.spam
; CHECK: @[[SHADOW2:.*]] = internal unnamed_addr addrspace(3) global %struct.ham
; CHECK: @[[SHADOW1:.*]] = internal unnamed_addr addrspace(3) global %struct.bar

define linkonce_odr dso_local spir_func void @foo(%struct.ham addrspace(4)* dereferenceable_or_null(56) %arg, %struct.bar* byval(%struct.bar) align 8 %arg1) !work_group_scope !0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_HAM:%.*]] addrspace(4)*, align 8
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0:#.*]]
; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP0]], 0
; CHECK-NEXT: br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]]
; CHECK: leader:
; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.bar* [[ARG1:%.*]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 bitcast (%struct.bar addrspace(3)* @[[SHADOW1]] to i8 addrspace(3)*), i8* align 8 [[TMP1]], i64 8, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[STRUCT_HAM]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.ham addrspace(3)* @[[SHADOW2]] to i8 addrspace(3)*), i8 addrspace(4)* align 8 [[TMP2]], i64 24, i1 false)
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.bar* [[ARG1]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP3]], i8 addrspace(3)* align 8 bitcast (%struct.bar addrspace(3)* @[[SHADOW1]] to i8 addrspace(3)*), i64 8, i1 false)
; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[STRUCT_HAM]] addrspace(4)* [[ARG]] to i8 addrspace(4)*
; CHECK-NEXT: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* align 8 [[TMP4]], i8 addrspace(3)* align 16 bitcast (%struct.ham addrspace(3)* @[[SHADOW2]] to i8 addrspace(3)*), i64 24, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast [[STRUCT_HAM]] addrspace(4)** [[TMP]] to [[STRUCT_HAM]] addrspace(4)* addrspace(4)*
; CHECK-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_SPAM:%.*]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast %struct.spam* [[TMP3]] to [[STRUCT_SPAM]] addrspace(4)*
; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP5]], 0
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
; CHECK: wg_leader:
; CHECK-NEXT: store [[STRUCT_HAM]] addrspace(4)* [[ARG]], [[STRUCT_HAM]] addrspace(4)* addrspace(4)* [[TMP2]], align 8
; CHECK-NEXT: br label [[WG_CF]]
; CHECK: wg_cf:
; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP6]], 0
; CHECK-NEXT: br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]]
; CHECK: TestMat:
; CHECK-NEXT: [[TMP7:%.*]] = bitcast %struct.spam* [[TMP3]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.spam addrspace(3)* @[[SHADOW3]] to i8 addrspace(3)*), i8* align 8 [[TMP7]], i64 36, i1 false)
; CHECK-NEXT: [[MAT_LD:%.*]] = load [[STRUCT_HAM]] addrspace(4)*, [[STRUCT_HAM]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: store [[STRUCT_HAM]] addrspace(4)* [[MAT_LD]], [[STRUCT_HAM]] addrspace(4)* addrspace(3)* @[[SHADOW4]], align 8
; CHECK-NEXT: br label [[LEADERMAT]]
; CHECK: LeaderMat:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[MAT_LD1:%.*]] = load [[STRUCT_HAM]] addrspace(4)*, [[STRUCT_HAM]] addrspace(4)* addrspace(3)* @[[SHADOW4]], align 8
; CHECK-NEXT: store [[STRUCT_HAM]] addrspace(4)* [[MAT_LD1]], [[STRUCT_HAM]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = bitcast %struct.spam* [[TMP3]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP8]], i8 addrspace(3)* align 16 bitcast (%struct.spam addrspace(3)* @[[SHADOW3]] to i8 addrspace(3)*), i64 36, i1 false)
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast %struct.bar* [[ARG1]] to [[STRUCT_BAR:%.*]] addrspace(4)*
; CHECK-NEXT: [[TMP6:%.*]] = addrspacecast [[STRUCT_SPAM]] addrspace(4)* [[TMP4]] to %struct.spam*
; CHECK-NEXT: call spir_func void @widget(%struct.bar addrspace(4)* dereferenceable_or_null(32) [[TMP5]], %struct.spam* byval(%struct.spam) align 8 [[TMP6]])
; CHECK-NEXT: ret void
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit:
Can the number of checks be reduced to addrspace cast dependence chain only?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's important to validate all the code generated by the pass to ensure correctness.
These checks should be easy to maintain as they are auto-generated (see the note at line 1).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with Alexey here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's important to validate all the code generated by the pass to ensure correctness.

I think regression tests are written for particular fixes and should be focused on testing the fix, not entire pass/feature, as there should exist other tests. But anyway, that was a nit, so either way is fine with me.
The script for automated maintenance is a good argument too.

;
bb:
%tmp = alloca %struct.ham addrspace(4)*, align 8
%tmp2 = addrspacecast %struct.ham addrspace(4)** %tmp to %struct.ham addrspace(4)* addrspace(4)*
%tmp3 = alloca %struct.spam, align 8
%tmp4 = addrspacecast %struct.spam* %tmp3 to %struct.spam addrspace(4)*
store %struct.ham addrspace(4)* %arg, %struct.ham addrspace(4)* addrspace(4)* %tmp2, align 8
%tmp5 = addrspacecast %struct.bar* %arg1 to %struct.bar addrspace(4)*
%tmp6 = addrspacecast %struct.spam addrspace(4)* %tmp4 to %struct.spam*
call spir_func void @widget(%struct.bar addrspace(4)* dereferenceable_or_null(32) %tmp5, %struct.spam* byval(%struct.spam) align 8 %tmp6)
ret void
}

define linkonce_odr dso_local spir_func void @widget(%struct.bar addrspace(4)* dereferenceable_or_null(32) %arg, %struct.spam* byval(%struct.spam) align 8 %arg1) !work_item_scope !0 !parallel_for_work_item !0 {
bb:
ret void
}

!0 = !{}
49 changes: 23 additions & 26 deletions llvm/test/SYCLLowerIR/pfwg_and_pfwi.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
%struct.foo = type { %struct.barney }
%struct.foo.0 = type { i8 }

; CHECK: @[[GROUP_SHADOW_PTR:.*]] = internal unnamed_addr addrspace(3) global %struct.zot addrspace(4)*
; CHECK: @[[PFWG_SHADOW_PTR:.*]] = internal unnamed_addr addrspace(3) global %struct.bar addrspace(4)*
; CHECK: @[[PFWI_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.foo.0
; CHECK: @[[PFWG_SHADOW:.*]] = internal unnamed_addr addrspace(3) global %struct.bar
Expand All @@ -23,53 +22,51 @@ define internal spir_func void @wibble(%struct.bar addrspace(4)* %arg, %struct.z
; CHECK-LABEL: @wibble(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_BAR:%.*]] addrspace(4)*, align 8
; CHECK-NEXT: [[TMP2:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
; CHECK-NEXT: [[TMP_FOO:%.*]] = alloca [[STRUCT_FOO_0:%.*]], align 1
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0:#.*]]
; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP0]], 0
; CHECK-NEXT: br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]]
; CHECK: leader:
; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.zot* [[ARG1:%.*]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.zot addrspace(3)* @[[GROUP_SHADOW]] to i8 addrspace(3)*), i8* align 8 [[TMP1]], i64 96, i1 false)
; CHECK-NEXT: [[ARG_CAST:%.*]] = bitcast [[STRUCT_BAR]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.bar, [[STRUCT_BAR]] addrspace(3)* @[[PFWG_SHADOW]], i32 0, i32 0), i8 addrspace(4)* align 8 [[ARG_CAST]], i64 1, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[STRUCT_BAR]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.bar, [[STRUCT_BAR]] addrspace(3)* @[[PFWG_SHADOW]], i32 0, i32 0), i8 addrspace(4)* align 8 [[TMP2]], i64 1, i1 false)
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.zot* [[ARG1]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP3]], i8 addrspace(3)* align 16 bitcast (%struct.zot addrspace(3)* @[[GROUP_SHADOW]] to i8 addrspace(3)*), i64 96, i1 false)
; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[STRUCT_BAR]] addrspace(4)* [[ARG]] to i8 addrspace(4)*
; CHECK-NEXT: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* align 8 [[TMP4]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.bar, [[STRUCT_BAR]] addrspace(3)* @[[PFWG_SHADOW]], i32 0, i32 0), i64 1, i1 false)
; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP5]], 0
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
; CHECK: wg_leader:
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[ARG]], [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast %struct.zot* [[ARG1]] to [[STRUCT_ZOT:%.*]] addrspace(4)*
; CHECK-NEXT: store [[STRUCT_ZOT]] addrspace(4)* [[TMP4]], [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @[[GROUP_SHADOW_PTR]]
; CHECK-NEXT: br label [[WG_CF]]
; CHECK: wg_cf:
; CHECK-NEXT: [[TMP4:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272)
; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP4]], 0
; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP6]], 0
; CHECK-NEXT: br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]]
; CHECK: TestMat:
; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i8* align 1 [[TMP5]], i64 1, i1 false)
; CHECK-NEXT: [[MAT_LD:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]]
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD]], [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW_PTR]]
; CHECK-NEXT: [[TMP7:%.*]] = bitcast %struct.foo.0* [[TMP_FOO]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i8* align 1 [[TMP7]], i64 1, i1 false)
; CHECK-NEXT: [[MAT_LD:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD]], [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW_PTR]], align 8
; CHECK-NEXT: br label [[LEADERMAT]]
; CHECK: LeaderMat:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
; CHECK-NEXT: [[MAT_LD1:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW_PTR]]
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD1]], [[STRUCT_BAR]] addrspace(4)** [[TMP]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast %struct.foo.0* [[TMP2]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 1 [[TMP6]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i64 1, i1 false)
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #0
; CHECK-NEXT: [[WG_VAL_TMP4:%.*]] = load [[STRUCT_ZOT]] addrspace(4)*, [[STRUCT_ZOT]] addrspace(4)* addrspace(3)* @[[GROUP_SHADOW_PTR]]
; CHECK-NEXT: call spir_func void @bar(%struct.zot addrspace(4)* [[WG_VAL_TMP4]], %struct.foo.0* byval(%struct.foo.0) align 1 [[TMP2]])
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[MAT_LD1:%.*]] = load [[STRUCT_BAR]] addrspace(4)*, [[STRUCT_BAR]] addrspace(4)* addrspace(3)* @[[PFWG_SHADOW_PTR]], align 8
; CHECK-NEXT: store [[STRUCT_BAR]] addrspace(4)* [[MAT_LD1]], [[STRUCT_BAR]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = bitcast %struct.foo.0* [[TMP_FOO]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 1 [[TMP8]], i8 addrspace(3)* align 8 getelementptr inbounds (%struct.foo.0, [[STRUCT_FOO_0]] addrspace(3)* @[[PFWI_SHADOW]], i32 0, i32 0), i64 1, i1 false)
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast %struct.zot* [[ARG1]] to [[STRUCT_ZOT:%.*]] addrspace(4)*
; CHECK-NEXT: call spir_func void @bar(%struct.zot addrspace(4)* [[TMP4]], %struct.foo.0* byval(%struct.foo.0) align 1 [[TMP_FOO]])
; CHECK-NEXT: ret void
;
bb:
Expand Down