Skip to content

Commit 11cc826

Browse files
authored
[Clang] Implement resource directory headers for common GPU intrinsics (#110179)
Summary: All GPU based languages provide some way to access things like the thread ID or other resources. However, this is spread between many different languages and it varies between targets. The goal here is to provide a resource directory header that just provides these in an easier to understand way, primarily so this can be used for C/C++ code. The interface aims to be common, to faciliate easier porting, but target specific stuff could be put in the individual headers.
1 parent 2778af9 commit 11cc826

File tree

6 files changed

+784
-0
lines changed

6 files changed

+784
-0
lines changed

clang/lib/Headers/CMakeLists.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,12 @@ set(x86_files
276276
cpuid.h
277277
)
278278

279+
set(gpu_files
280+
gpuintrin.h
281+
nvptxintrin.h
282+
amdgpuintrin.h
283+
)
284+
279285
set(windows_only_files
280286
intrin0.h
281287
intrin.h
@@ -304,6 +310,7 @@ set(files
304310
${systemz_files}
305311
${ve_files}
306312
${x86_files}
313+
${gpu_files}
307314
${webassembly_files}
308315
${windows_only_files}
309316
${utility_files}
@@ -526,6 +533,7 @@ add_header_target("systemz-resource-headers" "${systemz_files};${zos_wrapper_fil
526533
add_header_target("ve-resource-headers" "${ve_files}")
527534
add_header_target("webassembly-resource-headers" "${webassembly_files}")
528535
add_header_target("x86-resource-headers" "${x86_files}")
536+
add_header_target("gpu-resource-headers" "${gpu_files}")
529537

530538
# Other header groupings
531539
add_header_target("hlsl-resource-headers" ${hlsl_files})
@@ -712,6 +720,12 @@ install(
712720
EXCLUDE_FROM_ALL
713721
COMPONENT x86-resource-headers)
714722

723+
install(
724+
FILES ${gpu_files}
725+
DESTINATION ${header_install_dir}
726+
EXCLUDE_FROM_ALL
727+
COMPONENT gpu-resource-headers)
728+
715729
if(NOT CLANG_ENABLE_HLSL)
716730
set(EXCLUDE_HLSL EXCLUDE_FROM_ALL)
717731
endif()

clang/lib/Headers/amdgpuintrin.h

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
//===-- amdgpuintrin.h - AMDPGU intrinsic functions -----------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef __AMDGPUINTRIN_H
10+
#define __AMDGPUINTRIN_H
11+
12+
#ifndef __AMDGPU__
13+
#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
14+
#endif
15+
16+
#include <stdint.h>
17+
18+
#if !defined(__cplusplus)
19+
_Pragma("push_macro(\"bool\")");
20+
#define bool _Bool
21+
#endif
22+
23+
_Pragma("omp begin declare target device_type(nohost)");
24+
_Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
25+
26+
// Type aliases to the address spaces used by the AMDGPU backend.
27+
#define __gpu_private __attribute__((opencl_private))
28+
#define __gpu_constant __attribute__((opencl_constant))
29+
#define __gpu_local __attribute__((opencl_local))
30+
#define __gpu_global __attribute__((opencl_global))
31+
#define __gpu_generic __attribute__((opencl_generic))
32+
33+
// Attribute to declare a function as a kernel.
34+
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
35+
36+
// Returns the number of workgroups in the 'x' dimension of the grid.
37+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
38+
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
39+
}
40+
41+
// Returns the number of workgroups in the 'y' dimension of the grid.
42+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
43+
return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
44+
}
45+
46+
// Returns the number of workgroups in the 'z' dimension of the grid.
47+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
48+
return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
49+
}
50+
51+
// Returns the 'x' dimension of the current AMD workgroup's id.
52+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_x(void) {
53+
return __builtin_amdgcn_workgroup_id_x();
54+
}
55+
56+
// Returns the 'y' dimension of the current AMD workgroup's id.
57+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_y(void) {
58+
return __builtin_amdgcn_workgroup_id_y();
59+
}
60+
61+
// Returns the 'z' dimension of the current AMD workgroup's id.
62+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
63+
return __builtin_amdgcn_workgroup_id_z();
64+
}
65+
66+
// Returns the number of workitems in the 'x' dimension.
67+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
68+
return __builtin_amdgcn_workgroup_size_x();
69+
}
70+
71+
// Returns the number of workitems in the 'y' dimension.
72+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
73+
return __builtin_amdgcn_workgroup_size_y();
74+
}
75+
76+
// Returns the number of workitems in the 'z' dimension.
77+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
78+
return __builtin_amdgcn_workgroup_size_z();
79+
}
80+
81+
// Returns the 'x' dimension id of the workitem in the current AMD workgroup.
82+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_x(void) {
83+
return __builtin_amdgcn_workitem_id_x();
84+
}
85+
86+
// Returns the 'y' dimension id of the workitem in the current AMD workgroup.
87+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_y(void) {
88+
return __builtin_amdgcn_workitem_id_y();
89+
}
90+
91+
// Returns the 'z' dimension id of the workitem in the current AMD workgroup.
92+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id_z(void) {
93+
return __builtin_amdgcn_workitem_id_z();
94+
}
95+
96+
// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
97+
// and compilation options.
98+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_lanes(void) {
99+
return __builtin_amdgcn_wavefrontsize();
100+
}
101+
102+
// Returns the id of the thread inside of an AMD wavefront executing together.
103+
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_id(void) {
104+
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
105+
}
106+
107+
// Returns the bit-mask of active threads in the current wavefront.
108+
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_lane_mask(void) {
109+
return __builtin_amdgcn_read_exec();
110+
}
111+
112+
// Copies the value from the first active thread in the wavefront to the rest.
113+
_DEFAULT_FN_ATTRS static __inline__ uint32_t
114+
__gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
115+
return __builtin_amdgcn_readfirstlane(__x);
116+
}
117+
118+
// Copies the value from the first active thread in the wavefront to the rest.
119+
_DEFAULT_FN_ATTRS __inline__ uint64_t
120+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
121+
uint32_t __hi = (uint32_t)(__x >> 32ull);
122+
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
123+
return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
124+
((uint64_t)__builtin_amdgcn_readfirstlane(__lo));
125+
}
126+
127+
// Returns a bitmask of threads in the current lane for which \p x is true.
128+
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
129+
bool __x) {
130+
// The lane_mask & gives the nvptx semantics when lane_mask is a subset of
131+
// the active threads
132+
return __lane_mask & __builtin_amdgcn_ballot_w64(__x);
133+
}
134+
135+
// Waits for all the threads in the block to converge and issues a fence.
136+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_threads(void) {
137+
__builtin_amdgcn_s_barrier();
138+
__builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
139+
}
140+
141+
// Wait for all threads in the wavefront to converge, this is a noop on AMDGPU.
142+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
143+
__builtin_amdgcn_wave_barrier();
144+
}
145+
146+
// Shuffles the the lanes inside the wavefront according to the given index.
147+
_DEFAULT_FN_ATTRS static __inline__ uint32_t
148+
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
149+
return __builtin_amdgcn_ds_bpermute(__idx << 2, __x);
150+
}
151+
152+
// Shuffles the the lanes inside the wavefront according to the given index.
153+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
154+
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x) {
155+
uint32_t __hi = (uint32_t)(__x >> 32ull);
156+
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
157+
return ((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __hi) << 32ull) |
158+
((uint64_t)__builtin_amdgcn_ds_bpermute(__idx << 2, __lo));
159+
}
160+
161+
// Returns true if the flat pointer points to CUDA 'shared' memory.
162+
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
163+
return __builtin_amdgcn_is_shared(
164+
(void __attribute__((address_space(0))) *)((void __gpu_generic *)ptr));
165+
}
166+
167+
// Returns true if the flat pointer points to CUDA 'local' memory.
168+
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_private(void *ptr) {
169+
return __builtin_amdgcn_is_private(
170+
(void __attribute__((address_space(0))) *)((void __gpu_generic *)ptr));
171+
}
172+
173+
// Terminates execution of the associated wavefront.
174+
_DEFAULT_FN_ATTRS [[noreturn]] static __inline__ void __gpu_exit(void) {
175+
__builtin_amdgcn_endpgm();
176+
}
177+
178+
// Suspend the thread briefly to assist the scheduler during busy loops.
179+
_DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
180+
__builtin_amdgcn_s_sleep(2);
181+
}
182+
183+
_Pragma("omp end declare variant");
184+
_Pragma("omp end declare target");
185+
186+
#if !defined(__cplusplus)
187+
_Pragma("pop_macro(\"bool\")");
188+
#endif
189+
190+
#endif // __AMDGPUINTRIN_H

0 commit comments

Comments
 (0)