
Commit 7104455

[Offload][CUDA] Add initial cuda_runtime.h overlay
This provides the header overlay for cuda_runtime.h, which is found before any CUDA installation (none is necessary). Some basic APIs are defined in terms of the omp_target_* ones, but with the API redesign the requirements of CUDA should be taken into account. Based on: #94549
1 parent d065850 · commit 7104455
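As a quick illustration of how the overlay is meant to be used (mirroring the tests added below; the file name, kernel, and sizes are illustrative, not part of the commit): a plain CUDA-style program compiles with -foffload-via-llvm, picks up this cuda_runtime.h without any CUDA installation, and its cudaMalloc/cudaMemcpy/cudaFree calls are served by the omp_target_* entry points.

// example.cu (illustrative); build with: clang++ -foffload-via-llvm --offload-arch=native example.cu
#include <cuda_runtime.h> // resolves to the overlay added by this commit
#include <stdio.h>

__global__ void fill(int *DevPtr, int N) {
  for (int i = 0; i < N; ++i)
    DevPtr[i] = i;
}

int main() {
  int *DevPtr;
  if (cudaMalloc(&DevPtr, 8 * sizeof(int)) != cudaSuccess) // omp_target_alloc underneath
    return 1;
  fill<<<1, 1>>>(DevPtr, 8);
  int Host[8];
  if (cudaMemcpy(Host, DevPtr, 8 * sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess)
    return 1;
  printf("Host[7] = %i\n", Host[7]); // expected: 7
  return cudaFree(DevPtr) == cudaSuccess ? 0 : 1;
}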

File tree: 5 files changed, +262 -0 lines changed

clang/lib/Headers/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -329,6 +329,7 @@ set(llvm_offload_wrapper_files
   llvm_offload_wrappers/__llvm_offload.h
   llvm_offload_wrappers/__llvm_offload_host.h
   llvm_offload_wrappers/__llvm_offload_device.h
+  llvm_offload_wrappers/cuda_runtime.h
   )

 set(llvm_libc_wrapper_files
clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
/*===- __cuda_runtime.h - LLVM/Offload wrappers for CUDA runtime API -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CUDA_RUNTIME_API__
#define __CUDA_RUNTIME_API__

#include <cstddef>
#include <optional>
#include <utility> // std::swap (used below; not guaranteed to be provided transitively)

extern "C" {
int omp_get_initial_device(void);
void omp_target_free(void *Ptr, int Device);
void *omp_target_alloc(size_t Size, int Device);
int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                      size_t DstOffset, size_t SrcOffset, int DstDevice,
                      int SrcDevice);
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
}

// TODO: There are many fields missing in this enumeration.
typedef enum cudaError {
  cudaSuccess = 0,
  cudaErrorInvalidValue = 1,
  cudaErrorMemoryAllocation = 2,
  cudaErrorNoDevice = 100,
  cudaErrorInvalidDevice = 101,
  cudaErrorOTHER = -1,
} cudaError_t;

enum cudaMemcpyKind {
  cudaMemcpyHostToHost = 0,
  cudaMemcpyHostToDevice = 1,
  cudaMemcpyDeviceToHost = 2,
  cudaMemcpyDeviceToDevice = 3,
  cudaMemcpyDefault = 4
};

typedef void *cudaStream_t;

static thread_local cudaError_t __cudaomp_last_error = cudaSuccess;

// Returns the last error that has been produced and resets it to cudaSuccess.
inline cudaError_t cudaGetLastError() {
  cudaError_t TempError = __cudaomp_last_error;
  __cudaomp_last_error = cudaSuccess;
  return TempError;
}

// Returns the last error that has been produced without resetting it.
inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; }

inline cudaError_t __cudaMalloc(void **devPtr, size_t size) {
  int DeviceNum = 0;
  *devPtr = omp_target_alloc(size, DeviceNum);
  if (*devPtr == NULL)
    return __cudaomp_last_error = cudaErrorMemoryAllocation;

  return __cudaomp_last_error = cudaSuccess;
}

template <class T> cudaError_t cudaMalloc(T **devPtr, size_t size) {
  return __cudaMalloc((void **)devPtr, size);
}

inline cudaError_t __cudaFree(void *devPtr) {
  int DeviceNum = 0;
  omp_target_free(devPtr, DeviceNum);
  return __cudaomp_last_error = cudaSuccess;
}

template <class T> inline cudaError_t cudaFree(T *ptr) {
  return __cudaFree((void *)ptr);
}

inline cudaError_t __cudaMemcpy(void *dst, const void *src, size_t count,
                                cudaMemcpyKind kind) {
  // Get the host device number (which is the initial device).
  int HostDeviceNum = omp_get_initial_device();

  // Use the default device for the GPU.
  int GPUDeviceNum = 0;

  // Default to copying from host to device.
  int DstDeviceNum = GPUDeviceNum;
  int SrcDeviceNum = HostDeviceNum;

  if (kind == cudaMemcpyDeviceToHost)
    std::swap(DstDeviceNum, SrcDeviceNum);

  // omp_target_memcpy returns 0 on success and non-zero on failure.
  if (omp_target_memcpy(dst, src, count, 0, 0, DstDeviceNum, SrcDeviceNum))
    return __cudaomp_last_error = cudaErrorInvalidValue;
  return __cudaomp_last_error = cudaSuccess;
}

template <class T>
inline cudaError_t cudaMemcpy(T *dst, const T *src, size_t count,
                              cudaMemcpyKind kind) {
  return __cudaMemcpy((void *)dst, (const void *)src, count, kind);
}

inline cudaError_t __cudaMemset(void *devPtr, int value, size_t count,
                                cudaStream_t stream = 0) {
  int DeviceNum = 0;
  if (!omp_target_memset(devPtr, value, count, DeviceNum))
    return __cudaomp_last_error = cudaErrorInvalidValue;
  return __cudaomp_last_error = cudaSuccess;
}

template <class T>
inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) {
  return __cudaMemset((void *)devPtr, value, count);
}

inline cudaError_t cudaDeviceSynchronize() {
  // TODO: not implemented, not async yet.
  return __cudaomp_last_error = cudaSuccess;
}

inline cudaError_t cudaDeviceReset(void) {
  // TODO: not implemented.
  return __cudaomp_last_error = cudaSuccess;
}

#endif
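A brief usage note on the error handling defined above (the snippet is illustrative only, not part of the commit): every wrapper stores its status in the thread-local __cudaomp_last_error, so cudaPeekAtLastError reports the last status without clearing it, while cudaGetLastError reports it and resets it to cudaSuccess.

// Illustrative sketch of the error-query semantics implemented above.
inline void reportIfFailed(cudaError_t Err) {
  if (Err != cudaSuccess) {
    cudaError_t Peeked = cudaPeekAtLastError(); // same value as Err, still latched
    cudaError_t Taken = cudaGetLastError();     // returns it and resets the latch to cudaSuccess
    (void)Peeked;
    (void)Taken;
  }
}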
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
// RUN: %t | %fcheck-generic

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO

#include <cuda_runtime.h>
#include <stdio.h>

extern "C" {
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
}

__global__ void kernel(int *A, int *DevPtr, int N) {
  for (int i = 0; i < N; ++i)
    DevPtr[i] = 1;
  for (int i = 0; i < N; ++i)
    *A += DevPtr[i];
}

int main(int argc, char **argv) {
  int DevNo = 0;
  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
  int *DevPtr;
  auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int));
  if (Err != cudaSuccess)
    return -1;
  *Ptr = 0;
  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
  kernel<<<1, 1>>>(Ptr, DevPtr, 42);
  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
  // CHECK: Ptr [[Ptr]], *Ptr: 42
  Err = cudaFree(DevPtr);
  if (Err != cudaSuccess)
    return -1;
  llvm_omp_target_free_shared(Ptr, DevNo);
}
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
// RUN: %t | %fcheck-generic

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO

#include <cuda_runtime.h>
#include <stdio.h>

__global__ void kernel(int *DevPtr, int N) {
  for (int i = 0; i < N; ++i)
    DevPtr[i]--;
}

int main(int argc, char **argv) {
  int DevNo = 0;
  int Res = 0;
  int *DevPtr;
  auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int));
  if (Err != cudaSuccess)
    return -1;
  int HstPtr[42];
  for (int i = 0; i < 42; ++i) {
    HstPtr[i] = 2;
  }
  Err = cudaMemcpy(DevPtr, HstPtr, 42 * sizeof(int), cudaMemcpyHostToDevice);
  if (Err != cudaSuccess)
    return -1;
  printf("Res: %i\n", Res);
  // CHECK: Res: 0
  kernel<<<1, 1>>>(DevPtr, 42);
  Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost);
  if (Err != cudaSuccess)
    return -1;
  for (int i = 0; i < 42; ++i) {
    printf("%i : %i\n", i, HstPtr[i]);
    Res += HstPtr[i];
  }
  printf("Res: %i\n", Res);
  // CHECK: Res: 42
  Err = cudaFree(DevPtr);
  if (Err != cudaSuccess)
    return -1;
}
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
// RUN: %clang++ -foffload-via-llvm --offload-arch=native %s -o %t
// RUN: %t | %fcheck-generic

// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO

#include <cuda_runtime.h>
#include <stdio.h>

extern "C" {
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
}

__global__ void kernel(int *A, int *DevPtr, int N) {
  for (int i = 0; i < N; ++i)
    *A += DevPtr[i];
  *A *= -1;
}

int main(int argc, char **argv) {
  int DevNo = 0;
  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
  int *DevPtr;
  auto Err = cudaMalloc(&DevPtr, 42 * sizeof(int));
  if (Err != cudaSuccess)
    return -1;
  Err = cudaMemset(DevPtr, -1, 42 * sizeof(int));
  if (Err != cudaSuccess)
    return -1;
  *Ptr = 0;
  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
  kernel<<<1, 1>>>(Ptr, DevPtr, 42);
  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
  // CHECK: Ptr [[Ptr]], *Ptr: 42
  Err = cudaFree(DevPtr);
  if (Err != cudaSuccess)
    return -1;
  llvm_omp_target_free_shared(Ptr, DevNo);
}
