Skip to content

Commit 772173f

Browse files
authored
[Clang][AMDGPU] Remove special handling for COV4 libraries (#132870)
Summary: When we were first porting to COV5, this lead to some ABI issues due to a change in how we looked up the work group size. Bitcode libraries relied on the builtins to emit code, but this was changed between versions. This prevented the bitcode libraries, like OpenMP or libc, from being used for both COV4 and COV5. The solution was to have this 'none' functionality which effectively emitted code that branched off of a global to resolve to either version. This isn't a great solution because it forced every TU to have this variable in it. The patch in #131033 removed support for COV4 from OpenMP, which was the only consumer of this functionality. Other users like HIP and OpenCL did not use this because they linked the ROCm Device Library directly which has its own handling (The name was borrowed from it after all). So, now that we don't need to worry about backward compatibility with COV4, we can remove this special handling. Users can still emit COV4 code, this simply removes the special handling used to make the OpenMP device runtime bitcode version agnostic.
1 parent c4bc1b1 commit 772173f

File tree

17 files changed

+22
-299
lines changed

17 files changed

+22
-299
lines changed

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,6 @@ Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
5757
/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
5858
/// and use its value for COV_4 or COV_5+ approach. It is used for
5959
/// compiling device libraries in an ABI-agnostic way.
60-
///
61-
/// Note: "__oclc_ABI_version" is supposed to be emitted and intialized by
62-
/// clang during compilation of user code.
6360
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
6461
llvm::LoadInst *LD;
6562

clang/lib/CodeGen/Targets/AMDGPU.cpp

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -305,8 +305,6 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
305305
void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
306306
CodeGenModule &CGM) const;
307307

308-
void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
309-
310308
void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
311309
CodeGen::CodeGenModule &M) const override;
312310
unsigned getOpenCLKernelCallingConv() const override;
@@ -414,40 +412,6 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
414412
}
415413
}
416414

417-
/// Emits control constants used to change per-architecture behaviour in the
418-
/// AMDGPU ROCm device libraries.
419-
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
420-
CodeGen::CodeGenModule &CGM) const {
421-
StringRef Name = "__oclc_ABI_version";
422-
llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
423-
if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
424-
return;
425-
426-
if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
427-
llvm::CodeObjectVersionKind::COV_None)
428-
return;
429-
430-
auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
431-
llvm::Constant *COV = llvm::ConstantInt::get(
432-
Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);
433-
434-
// It needs to be constant weak_odr without externally_initialized so that
435-
// the load instuction can be eliminated by the IPSCCP.
436-
auto *GV = new llvm::GlobalVariable(
437-
CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
438-
nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
439-
CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
440-
GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
441-
GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
442-
443-
// Replace any external references to this variable with the new global.
444-
if (OriginalGV) {
445-
OriginalGV->replaceAllUsesWith(GV);
446-
GV->takeName(OriginalGV);
447-
OriginalGV->eraseFromParent();
448-
}
449-
}
450-
451415
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
452416
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
453417
if (requiresAMDGPUProtectedVisibility(D, GV)) {
Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 3
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
22
// RUN: %clang_cc1 -cc1 -triple amdgcn-amd-amdhsa -emit-llvm -mcode-object-version=none %s -o - | FileCheck %s
33

44
//.
55
// CHECK: @__oclc_ABI_version = external addrspace(4) global i32
66
//.
77
// CHECK-LABEL: define dso_local i32 @foo(
88
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
9-
// CHECK-NEXT: entry:
9+
// CHECK-NEXT: [[ENTRY:.*:]]
1010
// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
1111
// CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
1212
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4
@@ -16,8 +16,17 @@
1616
// CHECK-NEXT: [[TMP4:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
1717
// CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP4]], i32 4
1818
// CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP1]], ptr addrspace(4) [[TMP3]], ptr addrspace(4) [[TMP5]]
19-
// CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(4) [[TMP6]], align 2, !range [[RNG2:![0-9]+]], !invariant.load !3, !noundef !3
19+
// CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(4) [[TMP6]], align 2, !range [[RNG2:![0-9]+]], !invariant.load [[META3:![0-9]+]], !noundef [[META3]]
2020
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP7]] to i32
2121
// CHECK-NEXT: ret i32 [[CONV]]
2222
//
2323
int foo() { return __builtin_amdgcn_workgroup_size_x(); }
24+
//.
25+
// CHECK: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
26+
// CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
27+
//.
28+
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
29+
// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
30+
// CHECK: [[RNG2]] = !{i16 1, i16 1025}
31+
// CHECK: [[META3]] = !{}
32+
//.

clang/test/CodeGen/amdgpu-address-spaces.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ int [[clang::address_space(999)]] bbb = 1234;
2929
// CHECK: @u = addrspace(5) global i32 undef, align 4
3030
// CHECK: @aaa = addrspace(6) global i32 1000, align 4
3131
// CHECK: @bbb = addrspace(999) global i32 1234, align 4
32-
// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
3332
//.
3433
// CHECK-LABEL: define dso_local amdgpu_kernel void @foo(
3534
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
@@ -60,3 +59,10 @@ extern "C" [[clang::amdgpu_kernel]] void foo() {
6059
aaa = 0;
6160
bbb = 0;
6261
}
62+
//.
63+
// CHECK: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
64+
//.
65+
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600}
66+
// CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
67+
// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
68+
//.

clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu

Lines changed: 0 additions & 133 deletions
This file was deleted.

clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
// RUN: -fcuda-is-device -mcode-object-version=4 -emit-llvm -o - -x hip %s \
33
// RUN: | FileCheck -check-prefix=PRECOV5 %s
44

5-
65
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
76
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
87
// RUN: | FileCheck -check-prefix=COV5 %s
@@ -11,10 +10,6 @@
1110
// RUN: -fcuda-is-device -mcode-object-version=6 -emit-llvm -o - -x hip %s \
1211
// RUN: | FileCheck -check-prefix=COV5 %s
1312

14-
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
15-
// RUN: -fcuda-is-device -mcode-object-version=none -emit-llvm -o - -x hip %s \
16-
// RUN: | FileCheck -check-prefix=COVNONE %s
17-
1813
#include "Inputs/cuda.h"
1914

2015
// PRECOV5-LABEL: test_get_workgroup_size
@@ -35,35 +30,6 @@
3530
// COV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
3631
// COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
3732

38-
39-
// COVNONE-LABEL: test_get_workgroup_size
40-
// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version
41-
// COVNONE: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500
42-
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
43-
// COVNONE: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
44-
// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
45-
// COVNONE: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
46-
// COVNONE: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
47-
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
48-
49-
// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version
50-
// COVNONE: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500
51-
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
52-
// COVNONE: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
53-
// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
54-
// COVNONE: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
55-
// COVNONE: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
56-
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
57-
58-
// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version
59-
// COVNONE: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500
60-
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
61-
// COVNONE: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
62-
// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
63-
// COVNONE: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
64-
// COVNONE: select i1 [[ABI5_Z]], ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
65-
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
66-
6733
__device__ void test_get_workgroup_size(int d, int *out)
6834
{
6935
switch (d) {

clang/test/CodeGenCXX/dynamic-cast-address-space.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ B fail;
1313
// CHECK: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8
1414
// CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)]
1515
// CHECK: @_ZTS1B = linkonce_odr addrspace(1) constant [3 x i8] c"1B\00", comdat, align 1
16-
// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
1716
//.
1817
// WITH-NONZERO-DEFAULT-AS: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr addrspace(4) @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8
1918
// WITH-NONZERO-DEFAULT-AS: @fail = addrspace(1) global { ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2) }, align 8

clang/test/CodeGenHIP/default-attributes.hip

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
//.
99
// OPTNONE: @__hip_cuid_ = addrspace(1) global i8 0
1010
// OPTNONE: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata"
11-
// OPTNONE: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
1211
//.
1312
__device__ void extern_func();
1413

clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ kernel void test_target_features_kernel(global int *i) {
6868
// CHECK: @__block_literal_global = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_kernel_block_invoke }, align 8 #0
6969
// CHECK: @__test_target_features_kernel_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.3 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
7070
// CHECK: @llvm.used = appending addrspace(1) global [10 x ptr] [ptr @__test_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr @__test_block_invoke_2_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr @__test_block_invoke_3_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr @__test_block_invoke_4_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr)], section "llvm.metadata"
71-
// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 600
7271
//.
7372
// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
7473
// NOCPU-LABEL: define {{[^@]+}}@callee

0 commit comments

Comments
 (0)