Skip to content

Commit 705de43

Browse files
committed
[Offload] Introduce the concept of "default streams"
The offload APIs, and the CUDA wrappers in clang, now support "default streams" per thread (and per device). It should be per context but we don't really expose that concept yet. The KernelArguments allow an LLVM/Offload user to provide a "AsyncInfoQueue", which is plugin dependent and can later also be created outside or queried from the runtime. User managed "queues" are kept persistent, thus not returned to the pool once synchronized. The CUDA tests will synchronize via `cudaDeviceSynchronize` before checking the results. Based on: #94821
1 parent 7104455 commit 705de43

File tree

19 files changed

+143
-28
lines changed

19 files changed

+143
-28
lines changed

clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#define __CUDA_RUNTIME_API__
1212

1313
#include <cstddef>
14+
#include <cstdint>
1415
#include <optional>
1516

1617
extern "C" {
@@ -21,6 +22,8 @@ int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
2122
size_t DstOffset, size_t SrcOffset, int DstDevice,
2223
int SrcDevice);
2324
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
25+
int __tgt_target_synchronize_async_info_queue(void *Loc, int64_t DeviceNum,
26+
void *AsyncInfoQueue);
2427
}
2528

2629
// TODO: There are many fields missing in this enumeration.
@@ -55,7 +58,15 @@ inline cudaError_t cudaGetLastError() {
5558
// Returns the last error that has been produced without reseting it.
5659
inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; }
5760

61+
inline cudaError_t cudaDeviceSynchronize() {
62+
int DeviceNum = 0;
63+
return __cudaomp_last_error =
64+
(cudaError_t)__tgt_target_synchronize_async_info_queue(
65+
/*Loc=*/nullptr, DeviceNum, /*AsyncInfoQueue=*/nullptr);
66+
}
67+
5868
inline cudaError_t __cudaMalloc(void **devPtr, size_t size) {
69+
cudaDeviceSynchronize();
5970
int DeviceNum = 0;
6071
*devPtr = omp_target_alloc(size, DeviceNum);
6172
if (*devPtr == NULL)
@@ -69,6 +80,7 @@ template <class T> cudaError_t cudaMalloc(T **devPtr, size_t size) {
6980
}
7081

7182
inline cudaError_t __cudaFree(void *devPtr) {
83+
cudaDeviceSynchronize();
7284
int DeviceNum = 0;
7385
omp_target_free(devPtr, DeviceNum);
7486
return __cudaomp_last_error = cudaSuccess;
@@ -118,12 +130,8 @@ inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) {
118130
return __cudaMemset((void *)devPtr, value, count);
119131
}
120132

121-
inline cudaError_t cudaDeviceSynchronize() {
122-
// TODO: not implemented, not async yet.
123-
return __cudaomp_last_error = cudaSuccess;
124-
}
125-
126133
inline cudaError_t cudaDeviceReset(void) {
134+
cudaDeviceSynchronize();
127135
// TODO: not implemented.
128136
return __cudaomp_last_error = cudaSuccess;
129137
}

llvm/include/llvm/Frontend/OpenMP/OMPConstants.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ enum class IdentFlag {
7272
#include "llvm/Frontend/OpenMP/OMPKinds.def"
7373

7474
// Version of the kernel argument format used by the omp runtime.
75-
#define OMP_KERNEL_ARG_VERSION 3
75+
#define OMP_KERNEL_ARG_VERSION 4
7676

7777
// Minimum version of the compiler that generates a kernel dynamic pointer.
7878
#define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3

llvm/include/llvm/Frontend/OpenMP/OMPKinds.def

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ __OMP_ARRAY_TYPE(Int32Arr3, Int32, 3)
9090
__OMP_STRUCT_TYPE(Ident, ident_t, false, Int32, Int32, Int32, Int32, Int8Ptr)
9191
__OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidPtrPtr,
9292
VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr,
93-
Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32)
93+
Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32, VoidPtr)
9494
__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
9595
__OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
9696
__OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr)

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
496496
auto Int32Ty = Type::getInt32Ty(Builder.getContext());
497497
Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
498498
Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
499+
Value *AsyncInfoQueue = Constant::getNullValue(Builder.getPtrTy());
499500

500501
Value *NumTeams3D =
501502
Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
@@ -514,7 +515,8 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
514515
Flags,
515516
NumTeams3D,
516517
NumThreads3D,
517-
KernelArgs.DynCGGroupMem};
518+
KernelArgs.DynCGGroupMem,
519+
AsyncInfoQueue};
518520
}
519521

520522
void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {

offload/include/Shared/APITypes.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ struct __tgt_async_info {
8585
/// ensure it is a valid location while the transfer to the device is
8686
/// happening.
8787
KernelLaunchEnvironmentTy KernelLaunchEnvironment;
88+
89+
/// Flag to indicate the Queue should be persistent.
90+
bool PersistentQueue = false;
8891
};
8992

9093
/// This struct contains all of the arguments to a target kernel region launch.
@@ -110,12 +113,16 @@ struct KernelArgsTy {
110113
// The number of threads (for x,y,z dimension).
111114
uint32_t ThreadLimit[3] = {0, 0, 0};
112115
uint32_t DynCGroupMem = 0; // Amount of dynamic cgroup memory requested.
116+
// A __tgt_async_info queue pointer to be used for the kernel and all
117+
// associated device interactions. The operations are implicitly made
118+
// non-blocking.
119+
void *AsyncInfoQueue = nullptr;
113120
};
114121
static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t),
115122
"Invalid struct size");
116123
static_assert(sizeof(KernelArgsTy) ==
117124
(8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
118-
4 * sizeof(void **) + 2 * sizeof(int64_t *)),
125+
5 * sizeof(void **) + 2 * sizeof(int64_t *)),
119126
"Invalid struct size");
120127

121128
/// Flat array of kernel launch parameters and their total size.

offload/include/omptarget.h

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,19 @@ class AsyncInfoTy {
136136
/// Synchronization method to be used.
137137
SyncTy SyncType;
138138

139-
AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
139+
AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
140140
: Device(Device), SyncType(SyncType) {}
141+
AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue)
142+
: Device(Device), SyncType(AsyncInfoQueue ? SyncTy::NON_BLOCKING : SyncTy::BLOCKING) {
143+
AsyncInfo.Queue = AsyncInfoQueue;
144+
AsyncInfo.PersistentQueue = !!AsyncInfoQueue;
145+
}
146+
AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue, SyncTy SyncType)
147+
: Device(Device), SyncType(SyncType) {
148+
AsyncInfo.Queue = AsyncInfoQueue;
149+
AsyncInfo.PersistentQueue = !!AsyncInfoQueue;
150+
}
151+
141152
~AsyncInfoTy() { synchronize(); }
142153

143154
/// Implicit conversion to the __tgt_async_info which is used in the
@@ -207,8 +218,9 @@ class TaskAsyncInfoWrapperTy {
207218
void **TaskAsyncInfoPtr = nullptr;
208219

209220
public:
210-
TaskAsyncInfoWrapperTy(DeviceTy &Device)
221+
TaskAsyncInfoWrapperTy(DeviceTy &Device, void *AsyncInfoQueue= nullptr)
211222
: ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) {
223+
assert(!AsyncInfoQueue && "Async tasks do not support predefined async queue pointers!");
212224
// If we failed to acquired the current global thread id, we cannot
213225
// re-enqueue the current task. Thus we should use the local blocking async
214226
// info.
@@ -425,6 +437,8 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
425437
void *VAddr, bool IsRecord, bool SaveOutput,
426438
uint64_t &ReqPtrArgOffset);
427439

440+
void *__tgt_target_get_default_queue(void *Loc, int64_t DeviceId);
441+
428442
#ifdef __cplusplus
429443
}
430444
#endif

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2208,8 +2208,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
22082208
return Err;
22092209

22102210
// Once the stream is synchronized, return it to stream pool and reset
2211-
// AsyncInfo. This is to make sure the synchronization only works for its
2212-
// own tasks.
2211+
// AsyncInfo if the queue is not persistent. This is to make sure the
2212+
// synchronization only works for its own tasks.
2213+
if (AsyncInfo.PersistentQueue)
2214+
return Plugin::success();
2215+
22132216
AsyncInfo.Queue = nullptr;
22142217
return AMDGPUStreamManager.returnResource(Stream);
22152218
}
@@ -2228,9 +2231,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
22282231
if (!(*CompletedOrErr))
22292232
return Plugin::success();
22302233

2231-
// Once the stream is completed, return it to stream pool and reset
2232-
// AsyncInfo. This is to make sure the synchronization only works for its
2233-
// own tasks.
2234+
// Once the stream is synchronized, return it to stream pool and reset
2235+
// AsyncInfo if the queue is not persistent. This is to make sure the
2236+
// synchronization only works for its own tasks.
2237+
if (AsyncInfo.PersistentQueue)
2238+
return Plugin::success();
2239+
22342240
AsyncInfo.Queue = nullptr;
22352241
return AMDGPUStreamManager.returnResource(Stream);
22362242
}
@@ -2443,7 +2449,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
24432449

24442450
/// Initialize the async info for interoperability purposes.
24452451
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
2446-
// TODO: Implement this function.
2452+
AMDGPUStreamTy *Stream;
2453+
if (auto Err = getStream(AsyncInfoWrapper, Stream))
2454+
return Err;
2455+
24472456
return Plugin::success();
24482457
}
24492458

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1435,8 +1435,10 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
14351435

14361436
Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
14371437
assert(AsyncInfoPtr && "Invalid async info");
1438+
assert(!(*AsyncInfoPtr) && "Already initialized async info");
14381439

14391440
*AsyncInfoPtr = new __tgt_async_info();
1441+
(*AsyncInfoPtr)->PersistentQueue = true;
14401442

14411443
AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);
14421444

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -643,8 +643,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
643643
}
644644

645645
// Once the stream is synchronized, return it to stream pool and reset
646-
// AsyncInfo. This is to make sure the synchronization only works for its
647-
// own tasks.
646+
// AsyncInfo if the queue is not persistent. This is to make sure the
647+
// synchronization only works for its own tasks.
648+
if (AsyncInfo.PersistentQueue)
649+
return Plugin::success();
650+
648651
AsyncInfo.Queue = nullptr;
649652
if (auto Err = CUDAStreamManager.returnResource(Stream))
650653
return Err;
@@ -777,9 +780,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
777780
if (Res == CUDA_ERROR_NOT_READY)
778781
return Plugin::success();
779782

780-
// Once the stream is synchronized and the operations completed (or an error
781-
// occurs), return it to stream pool and reset AsyncInfo. This is to make
782-
// sure the synchronization only works for its own tasks.
783+
// Once the stream is synchronized, return it to stream pool and reset
784+
// AsyncInfo if the queue is not persistent. This is to make sure the
785+
// synchronization only works for its own tasks.
786+
if (AsyncInfo.PersistentQueue)
787+
return Plugin::success();
788+
783789
AsyncInfo.Queue = nullptr;
784790
if (auto Err = CUDAStreamManager.returnResource(Stream))
785791
return Err;

offload/src/KernelLanguage/API.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
//
99
//===----------------------------------------------------------------------===//
1010

11+
#include "llvm/Frontend/OpenMP/OMPConstants.h"
1112

1213
#include "Shared/APITypes.h"
1314

15+
#include <cstdint>
1416
#include <cstdio>
1517

1618
struct dim3 {
@@ -56,10 +58,13 @@ unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size,
5658
int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
5759
int32_t ThreadLimit, const void *HostPtr,
5860
KernelArgsTy *Args);
61+
void *__tgt_target_get_default_async_info_queue(void *Loc, int64_t DeviceId);
5962

6063
unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
6164
void *args, size_t sharedMem, void *stream) {
65+
int64_t DeviceNo = 0;
6266
KernelArgsTy Args = {};
67+
Args.Version = OMP_KERNEL_ARG_VERSION;
6368
Args.DynCGroupMem = sharedMem;
6469
Args.NumTeams[0] = gridDim.x;
6570
Args.NumTeams[1] = gridDim.y;
@@ -69,8 +74,13 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
6974
Args.ThreadLimit[2] = blockDim.z;
7075
Args.ArgPtrs = reinterpret_cast<void **>(args);
7176
Args.Flags.IsCUDA = true;
72-
int rv = __tgt_target_kernel(nullptr, 0, gridDim.x,
73-
blockDim.x, func, &Args);
77+
if (stream)
78+
Args.AsyncInfoQueue = stream;
79+
else
80+
Args.AsyncInfoQueue =
81+
__tgt_target_get_default_async_info_queue(nullptr, DeviceNo);
82+
int rv = __tgt_target_kernel(nullptr, DeviceNo, gridDim.x, blockDim.x, func,
83+
&Args);
7484
return rv;
7585
}
7686
}

offload/src/exports

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ VERS1.0 {
2929
__tgt_target_kernel;
3030
__tgt_target_kernel_nowait;
3131
__tgt_target_nowait_query;
32+
__tgt_target_get_default_async_info_queue;
33+
__tgt_target_synchronize_async_info_queue;
3234
__tgt_target_kernel_replay;
3335
__tgt_activate_record_replay;
3436
__tgt_mapper_num_components;

offload/src/interface.cpp

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
#include "OpenMP/OMPT/Interface.h"
1515
#include "OpenMP/OMPT/Callback.h"
1616
#include "PluginManager.h"
17+
#include "Shared/APITypes.h"
18+
#include "omptarget.h"
1719
#include "private.h"
1820

1921
#include "Shared/EnvironmentVar.h"
@@ -312,7 +314,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
312314
if (!DeviceOrErr)
313315
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
314316

315-
TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
317+
TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr, KernelArgs->AsyncInfoQueue);
316318
AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
317319
/// RAII to establish tool anchors before and after target region
318320
OMPT_IF_BUILT(InterfaceRAII TargetRAII(
@@ -510,3 +512,48 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
510512
delete AsyncInfo;
511513
*AsyncHandle = nullptr;
512514
}
515+
516+
EXTERN void *__tgt_target_get_default_async_info_queue(void *Loc,
517+
int64_t DeviceId) {
518+
assert(PM && "Runtime not initialized");
519+
520+
static thread_local void **AsyncInfoQueue = nullptr;
521+
522+
if (!AsyncInfoQueue)
523+
AsyncInfoQueue = reinterpret_cast<void **>(
524+
calloc(PM->getNumDevices(), sizeof(AsyncInfoQueue[0])));
525+
526+
if (!AsyncInfoQueue[DeviceId]) {
527+
auto DeviceOrErr = PM->getDevice(DeviceId);
528+
if (!DeviceOrErr)
529+
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
530+
531+
__tgt_async_info *AsyncInfo = nullptr;
532+
DeviceOrErr->RTL->init_async_info(DeviceId, &AsyncInfo);
533+
AsyncInfoQueue[DeviceId] = AsyncInfo->Queue;
534+
}
535+
536+
return AsyncInfoQueue[DeviceId];
537+
}
538+
539+
EXTERN int __tgt_target_synchronize_async_info_queue(void *Loc,
540+
int64_t DeviceId,
541+
void *AsyncInfoQueue) {
542+
assert(PM && "Runtime not initialized");
543+
544+
auto DeviceOrErr = PM->getDevice(DeviceId);
545+
if (!DeviceOrErr)
546+
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
547+
if (!AsyncInfoQueue)
548+
AsyncInfoQueue = __tgt_target_get_default_async_info_queue(Loc, DeviceId);
549+
AsyncInfoTy AsyncInfo(*DeviceOrErr, AsyncInfoQueue,
550+
AsyncInfoTy::SyncTy::BLOCKING);
551+
552+
if (AsyncInfo.synchronize())
553+
FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
554+
[[maybe_unused]] __tgt_async_info *ASI = AsyncInfo;
555+
assert(ASI->Queue);
556+
assert(ASI->Queue && ASI->PersistentQueue);
557+
558+
return 0;
559+
}

offload/src/omptarget.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ int AsyncInfoTy::synchronize() {
4949
case SyncTy::BLOCKING:
5050
// If we have a queue we need to synchronize it now.
5151
Result = Device.synchronize(*this);
52-
assert(AsyncInfo.Queue == nullptr &&
52+
assert((AsyncInfo.PersistentQueue || !AsyncInfo.Queue) &&
5353
"The device plugin should have nulled the queue to indicate there "
5454
"are no outstanding actions!");
5555
break;

offload/test/offloading/CUDA/basic_api_malloc_free.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,10 @@ int main(int argc, char **argv) {
3232
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
3333
// CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
3434
kernel<<<1, 1>>>(Ptr, DevPtr, 42);
35-
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
3635
// CHECK: Ptr [[Ptr]], *Ptr: 42
36+
// Implicit sync via cudaFree.
3737
Err = cudaFree(DevPtr);
38+
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
3839
if (Err != cudaSuccess)
3940
return -1;
4041
llvm_omp_target_free_shared(Ptr, DevNo);

offload/test/offloading/CUDA/basic_api_memcpy.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ int main(int argc, char **argv) {
3131
printf("Res: %i\n", Res);
3232
// CHECK: Res: 0
3333
kernel<<<1, 1>>>(DevPtr, 42);
34+
cudaDeviceSynchronize();
3435
Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost);
3536
if (Err != cudaSuccess)
3637
return -1;

offload/test/offloading/CUDA/basic_api_memset.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ int main(int argc, char **argv) {
3434
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
3535
// CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
3636
kernel<<<1, 1>>>(Ptr, DevPtr, 42);
37+
cudaDeviceSynchronize();
3738
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
3839
// CHECK: Ptr [[Ptr]], *Ptr: 42
3940
Err = cudaFree(DevPtr);

0 commit comments

Comments
 (0)