[Offload] Introduce the concept of "default streams"

jdoerfert · jdoerfert · commit 705de43498ae · 2024-06-13T01:36:32.000-07:00
The offload APIs, and the CUDA wrappers in clang, now support "default streams" per thread (and per device). It should be per context but we don't really expose that concept yet. The KernelArguments allow an LLVM/Offload user to provide a "AsyncInfoQueue", which is plugin dependent and can later also be created outside or queried from the runtime. User managed "queues" are kept persistent, thus not returned to the pool once synchronized. The CUDA tests will synchronize via `cudaDeviceSynchronize` before checking the results. Based on: #94821
diff --git a/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h b/clang/lib/Headers/llvm_offload_wrappers/cuda_runtime.h
@@ -11,6 +11,7 @@
 #define __CUDA_RUNTIME_API__
 
 #include <cstddef>
+#include <cstdint>
 #include <optional>
 
 extern "C" {
@@ -21,6 +22,8 @@ int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                       size_t DstOffset, size_t SrcOffset, int DstDevice,
                       int SrcDevice);
 void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
+int __tgt_target_synchronize_async_info_queue(void *Loc, int64_t DeviceNum,
+                                              void *AsyncInfoQueue);
 }
 
 // TODO: There are many fields missing in this enumeration.
@@ -55,7 +58,15 @@ inline cudaError_t cudaGetLastError() {
 // Returns the last error that has been produced without reseting it.
 inline cudaError_t cudaPeekAtLastError() { return __cudaomp_last_error; }
 
+inline cudaError_t cudaDeviceSynchronize() {
+  int DeviceNum = 0;
+  return __cudaomp_last_error =
+             (cudaError_t)__tgt_target_synchronize_async_info_queue(
+                 /*Loc=*/nullptr, DeviceNum, /*AsyncInfoQueue=*/nullptr);
+}
+
 inline cudaError_t __cudaMalloc(void **devPtr, size_t size) {
+  cudaDeviceSynchronize();
   int DeviceNum = 0;
   *devPtr = omp_target_alloc(size, DeviceNum);
   if (*devPtr == NULL)
@@ -69,6 +80,7 @@ template <class T> cudaError_t cudaMalloc(T **devPtr, size_t size) {
 }
 
 inline cudaError_t __cudaFree(void *devPtr) {
+  cudaDeviceSynchronize();
   int DeviceNum = 0;
   omp_target_free(devPtr, DeviceNum);
   return __cudaomp_last_error = cudaSuccess;
@@ -118,12 +130,8 @@ inline cudaError_t cudaMemset(T *devPtr, int value, size_t count) {
   return __cudaMemset((void *)devPtr, value, count);
 }
 
-inline cudaError_t cudaDeviceSynchronize() {
-  // TODO: not implemented, not async yet.
-  return __cudaomp_last_error = cudaSuccess;
-}
-
 inline cudaError_t cudaDeviceReset(void) {
+  cudaDeviceSynchronize();
   // TODO: not implemented.
   return __cudaomp_last_error = cudaSuccess;
 }
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -72,7 +72,7 @@ enum class IdentFlag {
 #include "llvm/Frontend/OpenMP/OMPKinds.def"
 
 // Version of the kernel argument format used by the omp runtime.
-#define OMP_KERNEL_ARG_VERSION 3
+#define OMP_KERNEL_ARG_VERSION 4
 
 // Minimum version of the compiler that generates a kernel dynamic pointer.
 #define OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR 3
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -90,7 +90,7 @@ __OMP_ARRAY_TYPE(Int32Arr3, Int32, 3)
 __OMP_STRUCT_TYPE(Ident, ident_t, false, Int32, Int32, Int32, Int32, Int8Ptr)
 __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidPtrPtr,
 		  VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr, VoidPtrPtr,
-		  Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32)
+		  Int64, Int64, Int32Arr3Ty, Int32Arr3Ty, Int32, VoidPtr)
 __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
 __OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
 __OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -496,6 +496,7 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
   auto Int32Ty = Type::getInt32Ty(Builder.getContext());
   Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
   Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
+  Value *AsyncInfoQueue = Constant::getNullValue(Builder.getPtrTy());
 
   Value *NumTeams3D =
       Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
@@ -514,7 +515,8 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                 Flags,
                 NumTeams3D,
                 NumThreads3D,
-                KernelArgs.DynCGGroupMem};
+                KernelArgs.DynCGGroupMem,
+                AsyncInfoQueue};
 }
 
 void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
@@ -85,6 +85,9 @@ struct __tgt_async_info {
   /// ensure it is a valid location while the transfer to the device is
   /// happening.
   KernelLaunchEnvironmentTy KernelLaunchEnvironment;
+
+  /// Flag to indicate the Queue should be persistent.
+  bool PersistentQueue = false;
 };
 
 /// This struct contains all of the arguments to a target kernel region launch.
@@ -110,12 +113,16 @@ struct KernelArgsTy {
    // The number of threads (for x,y,z dimension).
   uint32_t ThreadLimit[3] = {0, 0, 0};
   uint32_t DynCGroupMem = 0;     // Amount of dynamic cgroup memory requested.
+  // A __tgt_async_info queue pointer to be used for the kernel and all
+  // associated device interactions. The operations are implicitly made
+  // non-blocking.
+  void *AsyncInfoQueue = nullptr;
 };
 static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t),
               "Invalid struct size");
 static_assert(sizeof(KernelArgsTy) ==
                   (8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
-                   4 * sizeof(void **) + 2 * sizeof(int64_t *)),
+                   5 * sizeof(void **) + 2 * sizeof(int64_t *)),
               "Invalid struct size");
 
 /// Flat array of kernel launch parameters and their total size.
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
@@ -136,8 +136,19 @@ class AsyncInfoTy {
   /// Synchronization method to be used.
   SyncTy SyncType;
 
-  AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
+  AsyncInfoTy(DeviceTy &Device,  SyncTy SyncType = SyncTy::BLOCKING) 
       : Device(Device), SyncType(SyncType) {}
+  AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue)
+      : Device(Device), SyncType(AsyncInfoQueue ? SyncTy::NON_BLOCKING : SyncTy::BLOCKING) {
+    AsyncInfo.Queue = AsyncInfoQueue;
+    AsyncInfo.PersistentQueue = !!AsyncInfoQueue;
+  }
+  AsyncInfoTy(DeviceTy &Device, void *AsyncInfoQueue, SyncTy SyncType)
+      : Device(Device), SyncType(SyncType) {
+    AsyncInfo.Queue = AsyncInfoQueue;
+    AsyncInfo.PersistentQueue = !!AsyncInfoQueue;
+  }
+
   ~AsyncInfoTy() { synchronize(); }
 
   /// Implicit conversion to the __tgt_async_info which is used in the
@@ -207,8 +218,9 @@ class TaskAsyncInfoWrapperTy {
   void **TaskAsyncInfoPtr = nullptr;
 
 public:
-  TaskAsyncInfoWrapperTy(DeviceTy &Device)
+  TaskAsyncInfoWrapperTy(DeviceTy &Device, void *AsyncInfoQueue=  nullptr) 
       : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) {
+    assert(!AsyncInfoQueue && "Async tasks do not support predefined async queue pointers!");
     // If we failed to acquired the current global thread id, we cannot
     // re-enqueue the current task. Thus we should use the local blocking async
     // info.
@@ -425,6 +437,8 @@ int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                  void *VAddr, bool IsRecord, bool SaveOutput,
                                  uint64_t &ReqPtrArgOffset);
 
+void *__tgt_target_get_default_queue(void *Loc, int64_t DeviceId);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2208,8 +2208,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       return Err;
 
     // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     return AMDGPUStreamManager.returnResource(Stream);
   }
@@ -2228,9 +2231,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (!(*CompletedOrErr))
       return Plugin::success();
 
-    // Once the stream is completed, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
+    // Once the stream is synchronized, return it to stream pool and reset
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     return AMDGPUStreamManager.returnResource(Stream);
   }
@@ -2443,7 +2449,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   /// Initialize the async info for interoperability purposes.
   Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
-    // TODO: Implement this function.
+    AMDGPUStreamTy *Stream;
+    if (auto Err = getStream(AsyncInfoWrapper, Stream))
+      return Err;
+
     return Plugin::success();
   }
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1435,8 +1435,10 @@ Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
 
 Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
   assert(AsyncInfoPtr && "Invalid async info");
+  assert(!(*AsyncInfoPtr) && "Already initialized async info");
 
   *AsyncInfoPtr = new __tgt_async_info();
+  (*AsyncInfoPtr)->PersistentQueue = true;
 
   AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);
 
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -643,8 +643,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
     }
 
     // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     if (auto Err = CUDAStreamManager.returnResource(Stream))
       return Err;
@@ -777,9 +780,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Res == CUDA_ERROR_NOT_READY)
       return Plugin::success();
 
-    // Once the stream is synchronized and the operations completed (or an error
-    // occurs), return it to stream pool and reset AsyncInfo. This is to make
-    // sure the synchronization only works for its own tasks.
+    // Once the stream is synchronized, return it to stream pool and reset
+    // AsyncInfo if the queue is not persistent. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (AsyncInfo.PersistentQueue)
+      return Plugin::success();
+
     AsyncInfo.Queue = nullptr;
     if (auto Err = CUDAStreamManager.returnResource(Stream))
       return Err;
diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp
@@ -8,9 +8,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
 
 #include "Shared/APITypes.h"
 
+#include <cstdint>
 #include <cstdio>
 
 struct dim3 {
@@ -56,10 +58,13 @@ unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size,
 int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
                         int32_t ThreadLimit, const void *HostPtr,
                         KernelArgsTy *Args);
+void *__tgt_target_get_default_async_info_queue(void *Loc, int64_t DeviceId);
 
 unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
                           void *args, size_t sharedMem, void *stream) {
+  int64_t DeviceNo = 0;
   KernelArgsTy Args = {};
+  Args.Version = OMP_KERNEL_ARG_VERSION;
   Args.DynCGroupMem = sharedMem;
   Args.NumTeams[0] = gridDim.x;
   Args.NumTeams[1] = gridDim.y;
@@ -69,8 +74,13 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
   Args.ThreadLimit[2] = blockDim.z;
   Args.ArgPtrs = reinterpret_cast<void **>(args);
   Args.Flags.IsCUDA = true;
-  int rv = __tgt_target_kernel(nullptr, 0, gridDim.x,
-                               blockDim.x, func, &Args);
+  if (stream)
+    Args.AsyncInfoQueue = stream;
+  else
+    Args.AsyncInfoQueue =
+        __tgt_target_get_default_async_info_queue(nullptr, DeviceNo);
+  int rv = __tgt_target_kernel(nullptr, DeviceNo, gridDim.x, blockDim.x, func,
+                               &Args);
   return rv;
 }
 }
diff --git a/offload/src/exports b/offload/src/exports
@@ -29,6 +29,8 @@ VERS1.0 {
     __tgt_target_kernel;
     __tgt_target_kernel_nowait;
     __tgt_target_nowait_query;
+    __tgt_target_get_default_async_info_queue;
+    __tgt_target_synchronize_async_info_queue;
     __tgt_target_kernel_replay;
     __tgt_activate_record_replay;
     __tgt_mapper_num_components;
diff --git a/offload/src/interface.cpp b/offload/src/interface.cpp
@@ -14,6 +14,8 @@
 #include "OpenMP/OMPT/Interface.h"
 #include "OpenMP/OMPT/Callback.h"
 #include "PluginManager.h"
+#include "Shared/APITypes.h"
+#include "omptarget.h"
 #include "private.h"
 
 #include "Shared/EnvironmentVar.h"
@@ -312,7 +314,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
   if (!DeviceOrErr)
     FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
 
-  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr);
+  TargetAsyncInfoTy TargetAsyncInfo(*DeviceOrErr, KernelArgs->AsyncInfoQueue);
   AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
   /// RAII to establish tool anchors before and after target region
   OMPT_IF_BUILT(InterfaceRAII TargetRAII(
@@ -510,3 +512,48 @@ EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
   delete AsyncInfo;
   *AsyncHandle = nullptr;
 }
+
+EXTERN void *__tgt_target_get_default_async_info_queue(void *Loc,
+                                                       int64_t DeviceId) {
+  assert(PM && "Runtime not initialized");
+
+  static thread_local void **AsyncInfoQueue = nullptr;
+
+  if (!AsyncInfoQueue)
+    AsyncInfoQueue = reinterpret_cast<void **>(
+        calloc(PM->getNumDevices(), sizeof(AsyncInfoQueue[0])));
+
+  if (!AsyncInfoQueue[DeviceId]) {
+    auto DeviceOrErr = PM->getDevice(DeviceId);
+    if (!DeviceOrErr)
+      FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+    __tgt_async_info *AsyncInfo = nullptr;
+    DeviceOrErr->RTL->init_async_info(DeviceId, &AsyncInfo);
+    AsyncInfoQueue[DeviceId] = AsyncInfo->Queue;
+  }
+
+  return AsyncInfoQueue[DeviceId];
+}
+
+EXTERN int __tgt_target_synchronize_async_info_queue(void *Loc,
+                                                     int64_t DeviceId,
+                                                     void *AsyncInfoQueue) {
+  assert(PM && "Runtime not initialized");
+
+  auto DeviceOrErr = PM->getDevice(DeviceId);
+  if (!DeviceOrErr)
+    FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
+  if (!AsyncInfoQueue)
+    AsyncInfoQueue = __tgt_target_get_default_async_info_queue(Loc, DeviceId);
+  AsyncInfoTy AsyncInfo(*DeviceOrErr, AsyncInfoQueue,
+                        AsyncInfoTy::SyncTy::BLOCKING);
+
+  if (AsyncInfo.synchronize())
+    FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
+  [[maybe_unused]] __tgt_async_info *ASI = AsyncInfo;
+  assert(ASI->Queue);
+  assert(ASI->Queue && ASI->PersistentQueue);
+
+  return 0;
+}
diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp
@@ -49,7 +49,7 @@ int AsyncInfoTy::synchronize() {
     case SyncTy::BLOCKING:
       // If we have a queue we need to synchronize it now.
       Result = Device.synchronize(*this);
-      assert(AsyncInfo.Queue == nullptr &&
+      assert((AsyncInfo.PersistentQueue || !AsyncInfo.Queue) &&
              "The device plugin should have nulled the queue to indicate there "
              "are no outstanding actions!");
       break;
diff --git a/offload/test/offloading/CUDA/basic_api_malloc_free.cu b/offload/test/offloading/CUDA/basic_api_malloc_free.cu
@@ -32,9 +32,10 @@ int main(int argc, char **argv) {
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
   kernel<<<1, 1>>>(Ptr, DevPtr, 42);
-  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
+  // Implicit sync via cudaFree.
   Err = cudaFree(DevPtr);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   if (Err != cudaSuccess)
     return -1;
   llvm_omp_target_free_shared(Ptr, DevNo);
diff --git a/offload/test/offloading/CUDA/basic_api_memcpy.cu b/offload/test/offloading/CUDA/basic_api_memcpy.cu
@@ -31,6 +31,7 @@ int main(int argc, char **argv) {
   printf("Res: %i\n", Res);
   // CHECK: Res: 0
   kernel<<<1, 1>>>(DevPtr, 42);
+  cudaDeviceSynchronize();
   Err = cudaMemcpy(HstPtr, DevPtr, 42 * sizeof(int), cudaMemcpyDeviceToHost);
   if (Err != cudaSuccess)
     return -1;
diff --git a/offload/test/offloading/CUDA/basic_api_memset.cu b/offload/test/offloading/CUDA/basic_api_memset.cu
@@ -34,6 +34,7 @@ int main(int argc, char **argv) {
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
   kernel<<<1, 1>>>(Ptr, DevPtr, 42);
+  cudaDeviceSynchronize();
   printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
   // CHECK: Ptr [[Ptr]], *Ptr: 42
   Err = cudaFree(DevPtr);
diff --git a/offload/test/offloading/CUDA/basic_launch.cu b/offload/test/offloading/CUDA/basic_launch.cu
diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu