[WIP][SYCL] Replaces some of the CL_* enums with PI_* enums. #1221

Status: Closed. Wants to merge 20 commits.

Changes from all commits (20 commits)
84ddd10  [SYCL] Replaces some of the CL_* enums with PI_* enums. (rbegam, Feb 12, 2020)
6193e29  [SYCL] Rearranges enum pi_device_info. (rbegam, Feb 29, 2020)
071df8d  [SYCL] removes unneccessasy comments. (rbegam, Feb 29, 2020)
b3a9426  [SYCL] Fix mismatch between sub_group headers (#1215) (Pennycook, Feb 29, 2020)
3da5473  [SYCL] Fix command cleanup invoked from multiple threads (#1214) (sergey-semenov, Mar 1, 2020)
4b5d25b  [SYCL][NFC] Add clang-format configuration file for SYCL LIT tests (#… (bader, Mar 1, 2020)
c220eb8  [SYCL] Make context constructors explicit to avoid unintended convers… (jbrodman, Mar 2, 2020)
3035170  [SYCL] Disable tests which take more than 5 minutes (#1220) (vladimirlaz, Mar 2, 2020)
aa0619c  [SYCL] Fix check-sycl-deploy target problems (#1165) (Fznamznon, Mar 2, 2020)
745e759  [SYCL][CUDA] Handle the case of not having any CUDA device (#1212) (fwyzard, Mar 2, 2020)
b1aa222  [CUDA][PI] clang-format pi.h (bjoernknafla, Feb 27, 2020)
5e7ea06  [SYCL][CUDA] Fix context creation property parsing (bjoernknafla, Feb 27, 2020)
d214718  Update sycl/include/CL/sycl/detail/pi.h (rbegam, Mar 2, 2020)
399acef  Update sycl/include/CL/sycl/detail/pi.h (rbegam, Mar 2, 2020)
ef68270  [SYCL][CUDA] Fixes context release and unnamed context scope (#1207) (steffenlarsen, Mar 2, 2020)
a2bf2f1  [SYCL] Replaces some of the CL_* enums with PI_* enums. (rbegam, Feb 12, 2020)
ec0b39d  [SYCL] Rearranges enum pi_device_info. (rbegam, Feb 29, 2020)
5bf8bc9  [SYCL] Update sycl/include/CL/sycl/detail/pi.h (rbegam, Mar 2, 2020)
b6ec999  [SYCL] Resolves conflicts for sycl/include/CL/sycl/detail/pi.h (rbegam, Mar 3, 2020)
15c44dd  Merge branch 'private/rbegam/sycl-rename' of https://github.com/rbega… (rbegam, Mar 3, 2020)
9 changes: 4 additions & 5 deletions buildbot/testlist.cfg

@@ -11,7 +11,7 @@
 :test_exception_handling
 :test_group
 :test_h_item
-:test_handler
+#:test_handler
 :test_header
 :test_hierarchical
 :test_id
@@ -20,7 +20,7 @@
 :test_item
 :test_kernel
 :test_kernel_args
-:test_math_builtin_api
+#:test_math_builtin_api
 :test_multi_ptr
 :test_nd_item
 :test_nd_range
@@ -38,8 +38,7 @@
 :test_vector_api
 :test_vector_constructors
 :test_vector_load_store
-# Disable test to speedup testing until JIT is optimized
 #:test_vector_operators
 :test_vector_swizzle_assignment
-:test_vector_swizzles
-:test_vector_swizzles_opencl
+#:test_vector_swizzles
+#:test_vector_swizzles_opencl
4 changes: 3 additions & 1 deletion sycl/include/CL/sycl/backend/cuda.hpp

@@ -18,7 +18,9 @@ namespace cuda {

 // Mem Object info: Retrieve the raw CUDA pointer from a cl_mem
 #define PI_CUDA_RAW_POINTER (0xFF01)
-// Context creation: Use the primary context instead of a custom one
+// Context creation: Use a primary CUDA context instead of a custom one by
+// providing a property value of PI_TRUE for the following
+// property ID.
 #define PI_CONTEXT_PROPERTIES_CUDA_PRIMARY (0xFF02)

 // PI Command Queue using Default stream
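
For illustration, a PI client would request the primary CUDA context by pairing this property ID with PI_TRUE in the zero-terminated property list passed to piContextCreate. A minimal sketch (assumed usage, not part of the diff; `device` is a placeholder for a valid pi_device):

    // Request the primary CUDA context for one device (sketch).
    pi_context_properties props[] = {PI_CONTEXT_PROPERTIES_CUDA_PRIMARY,
                                     PI_TRUE, 0};
    pi_context ctx = nullptr;
    pi_result res = cuda_piContextCreate(props, 1, &device,
                                         /*pfn_notify=*/nullptr,
                                         /*user_data=*/nullptr, &ctx);

This mirrors the property array built in sycl/source/detail/context_impl.cpp later in this diff.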
13 changes: 7 additions & 6 deletions sycl/include/CL/sycl/context.hpp

@@ -49,8 +49,8 @@ class context {
   /// @param AsyncHandler is an instance of async_handler.
   /// @param UseCUDAPrimaryContext is a bool determining whether to use the
   /// primary context in the CUDA backend.
-  context(const device &Device, async_handler AsyncHandler = {},
-          bool UseCUDAPrimaryContext = false);
+  explicit context(const device &Device, async_handler AsyncHandler = {},
+                   bool UseCUDAPrimaryContext = false);

   /// Constructs a SYCL context instance using the provided platform.
   ///
@@ -63,8 +63,8 @@
   /// @param AsyncHandler is an instance of async_handler.
   /// @param UseCUDAPrimaryContext is a bool determining whether to use the
   /// primary context in the CUDA backend.
-  context(const platform &Platform, async_handler AsyncHandler = {},
-          bool UseCUDAPrimaryContext = false);
+  explicit context(const platform &Platform, async_handler AsyncHandler = {},
+                   bool UseCUDAPrimaryContext = false);

   /// Constructs a SYCL context instance using list of devices.
   ///
@@ -78,8 +78,9 @@
   /// @param AsyncHandler is an instance of async_handler.
   /// @param UseCUDAPrimaryContext is a bool determining whether to use the
   /// primary context in the CUDA backend.
-  context(const vector_class<device> &DeviceList,
-          async_handler AsyncHandler = {}, bool UseCUDAPrimaryContext = false);
+  explicit context(const vector_class<device> &DeviceList,
+                   async_handler AsyncHandler = {},
+                   bool UseCUDAPrimaryContext = false);

   /// Constructs a SYCL context instance from OpenCL cl_context.
   ///
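
The practical effect of marking these constructors explicit: a device, platform, or device list no longer converts implicitly to a context. A sketch of the behavior change (assumed caller code, not from the PR):

    sycl::device Dev;
    sycl::context C1{Dev};     // OK: direct initialization still works
    // sycl::context C2 = Dev; // now ill-formed: copy-initialization would
                               // need the implicit conversion that
                               // `explicit` forbids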
1,232 changes: 567 additions & 665 deletions sycl/include/CL/sycl/detail/pi.h

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/intel/sub_group_host.hpp

@@ -136,7 +136,7 @@ struct sub_group {
   }

   template <typename T, access::address_space Space>
-  void store(multi_ptr<T, Space> dst, T &x) const {
+  void store(multi_ptr<T, Space> dst, const T &x) const {
    throw runtime_error("Subgroups are not supported on host device. ");
  }
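
Taking x by const reference appears to come from the header-mismatch fix (#1215) and also lets store bind temporaries. An assumed call site:

    // With `const T &x`, an rvalue argument now binds (sketch; `sg` and
    // `dst` are placeholders for a sub_group and a multi_ptr):
    sg.store(dst, 42);  // a literal could not bind to the old non-const `T &`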
2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/queue.hpp

@@ -239,7 +239,7 @@ class queue {
   /// @param Length is a number of bytes in the allocation.
   /// @param Advice is a device-defined advice for the specified allocation.
   /// @return an event representing advice operation.
-  event mem_advise(const void *Ptr, size_t Length, int Advice);
+  event mem_advise(const void *Ptr, size_t Length, pi_mem_advice Advice);

   /// Provides hints to the runtime library that data should be made available
   /// on a device earlier than Unified Shared Memory would normally require it
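
Typing the parameter as pi_mem_advice instead of a raw int gives call sites compile-time checking of the advice value. A hypothetical caller (names assumed for illustration):

    // Forwarding a typed advice value to the queue (sketch).
    sycl::event advise(sycl::queue &Q, const void *Ptr, size_t Bytes,
                       pi_mem_advice Advice) {
      return Q.mem_advise(Ptr, Bytes, Advice);  // a raw int no longer converts
    }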
166 changes: 111 additions & 55 deletions sycl/plugins/cuda/pi_cuda.cpp

@@ -528,43 +528,57 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
                               pi_uint32 *num_platforms) {

   try {
-    static constexpr pi_uint32 numPlatforms = 1;
+    static std::once_flag initFlag;
+    static pi_uint32 numPlatforms = 1;
+    static _pi_platform platformId;

-    if (num_platforms != nullptr) {
-      *num_platforms = numPlatforms;
+    if (num_entries == 0 and platforms != nullptr) {
+      return PI_INVALID_VALUE;
+    }
+    if (platforms == nullptr and num_platforms == nullptr) {
+      return PI_INVALID_VALUE;
     }

     pi_result err = PI_SUCCESS;

-    if (platforms != nullptr) {
-
-      assert(num_entries != 0);
-
-      static std::once_flag initFlag;
-      static _pi_platform platformId;
-      std::call_once(
-          initFlag,
-          [](pi_result &err) {
-            err = PI_CHECK_ERROR(cuInit(0));
-
-            int numDevices = 0;
-            err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices));
-            try {
-              platformId.devices_.reserve(numDevices);
-              for (int i = 0; i < numDevices; ++i) {
-                CUdevice device;
-                err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
-                platformId.devices_.emplace_back(
-                    new _pi_device{device, &platformId});
-              }
-            } catch (...) {
-              // Clear and rethrow to allow retry
-              platformId.devices_.clear();
-              throw;
-            }
-          },
-          err);
+    std::call_once(
+        initFlag,
+        [](pi_result &err) {
+          if (cuInit(0) != CUDA_SUCCESS) {
+            numPlatforms = 0;
+            return;
+          }
+          int numDevices = 0;
+          err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices));
+          if (numDevices == 0) {
+            numPlatforms = 0;
+            return;
+          }
+          try {
+            for (int i = 0; i < numDevices; ++i) {
+              CUdevice device;
+              err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
+              platformId.devices_.emplace_back(
+                  new _pi_device{device, &platformId});
+            }
+          } catch (const std::bad_alloc &) {
+            // Signal out-of-memory situation
+            platformId.devices_.clear();
+            err = PI_OUT_OF_HOST_MEMORY;
+          } catch (...) {
+            // Clear and rethrow to allow retry
+            platformId.devices_.clear();
+            throw;
+          }
+        },
+        err);

+    if (num_platforms != nullptr) {
+      *num_platforms = numPlatforms;
+    }
+
+    if (platforms != nullptr) {
       *platforms = &platformId;
     }
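
The added argument validation matches the usual two-call query idiom, which PI clients are assumed to follow (sketch, not from the diff):

    // First call: ask only for the platform count.
    pi_uint32 count = 0;
    cuda_piPlatformsGet(0, nullptr, &count);
    // Second call: retrieve the handles (count may be 0 with no CUDA device).
    std::vector<pi_platform> platforms(count);
    cuda_piPlatformsGet(count, platforms.data(), nullptr);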

@@ -1110,12 +1124,30 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
 }

 /* Context APIs */
-pi_result cuda_piContextCreate(const cl_context_properties *properties,
-                               pi_uint32 num_devices, const pi_device *devices,
-                               void (*pfn_notify)(const char *errinfo,
-                                                  const void *private_info,
-                                                  size_t cb, void *user_data),
-                               void *user_data, pi_context *retcontext) {
+
+/// Create a PI CUDA context.
+///
+/// By default creates a scoped context and keeps the last active CUDA context
+/// on top of the CUDA context stack.
+/// With the PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of PI_TRUE
+/// creates a primary CUDA context and activates it on the CUDA context stack.
+///
+/// @param[in] properties 0 terminated array of key/id-value combinations. Can
+/// be nullptr. Only accepts property key/id PI_CONTEXT_PROPERTIES_CUDA_PRIMARY
+/// with a pi_bool value.
+/// @param[in] num_devices Number of devices to create the context for.
+/// @param[in] devices Devices to create the context for.
+/// @param[in] pfn_notify Callback, currently unused.
+/// @param[in] user_data User data for callback.
+/// @param[out] retcontext Set to created context on success.
+///
+/// @return PI_SUCCESS on success, otherwise an error return code.
+pi_result cuda_piContextCreate(const pi_context_properties *properties,
+                               pi_uint32 num_devices, const pi_device *devices,
+                               void (*pfn_notify)(const char *errinfo,
+                                                  const void *private_info,
+                                                  size_t cb, void *user_data),
+                               void *user_data, pi_context *retcontext) {

   assert(devices != nullptr);
   // TODO: How to implement context callback?
@@ -1127,31 +1159,51 @@
   assert(retcontext != nullptr);
   pi_result errcode_ret = PI_SUCCESS;

+  // Parse properties.
+  bool property_cuda_primary = false;
+  while (properties && (0 != *properties)) {
+    // Consume property ID.
+    pi_context_properties id = *properties;
+    ++properties;
+    // Consume property value.
+    pi_context_properties value = *properties;
+    ++properties;
+    switch (id) {
+    case PI_CONTEXT_PROPERTIES_CUDA_PRIMARY:
+      assert(value == PI_FALSE || value == PI_TRUE);
+      property_cuda_primary = static_cast<bool>(value);
+      break;
+    default:
+      // Unknown property.
+      assert(!"Unknown piContextCreate property in property list");
+      return PI_INVALID_VALUE;
+    }
+  }
+
   std::unique_ptr<_pi_context> piContextPtr{nullptr};
   try {
-    if (properties && *properties != PI_CONTEXT_PROPERTIES_CUDA_PRIMARY) {
-      throw pi_result(CL_INVALID_VALUE);
-    } else if (!properties) {
+    if (property_cuda_primary) {
+      // Use the CUDA primary context and assume that we want to use it
+      // immediately as we want to forge context switches.
+      CUcontext Ctxt;
+      errcode_ret = PI_CHECK_ERROR(
+          cuDevicePrimaryCtxRetain(&Ctxt, devices[0]->cuDevice_));
+      piContextPtr = std::unique_ptr<_pi_context>(
+          new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
+      errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt));
+    } else {
+      // Create a scoped context.
       CUcontext newContext, current;
       PI_CHECK_ERROR(cuCtxGetCurrent(&current));
-      errcode_ret = PI_CHECK_ERROR(cuCtxCreate(&newContext, CU_CTX_MAP_HOST,
-                                               (*devices)->cuDevice_));
+      errcode_ret = PI_CHECK_ERROR(
+          cuCtxCreate(&newContext, CU_CTX_MAP_HOST, devices[0]->cuDevice_));
       piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{
           _pi_context::kind::user_defined, newContext, *devices});
+      // For scoped contexts keep the last active CUDA one on top of the stack
+      // as `cuCtxCreate` replaces it implicitly otherwise.
       if (current != nullptr) {
         // If there was an existing context on the thread we recover it
         PI_CHECK_ERROR(cuCtxSetCurrent(current));
       }
-    } else if (properties
-               && *properties == PI_CONTEXT_PROPERTIES_CUDA_PRIMARY) {
-      CUcontext Ctxt;
-      errcode_ret = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(
-          &Ctxt, (*devices)->cuDevice_));
-      piContextPtr = std::unique_ptr<_pi_context>(
-          new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
-      errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt));
-    } else {
-      throw pi_result(CL_INVALID_VALUE);
     }

     *retcontext = piContextPtr.release();
Expand All @@ -1178,11 +1230,14 @@ pi_result cuda_piContextRelease(pi_context ctxt) {
CUcontext cuCtxt = ctxt->get();
CUcontext current = nullptr;
cuCtxGetCurrent(&current);
if(cuCtxt != current)
{
PI_CHECK_ERROR(cuCtxSetCurrent(cuCtxt));
if (cuCtxt != current) {
PI_CHECK_ERROR(cuCtxPushCurrent(cuCtxt));
}
PI_CHECK_ERROR(cuCtxSynchronize());
cuCtxGetCurrent(&current);
if (cuCtxt == current) {
PI_CHECK_ERROR(cuCtxPopCurrent(&current));
}
return PI_CHECK_ERROR(cuCtxDestroy(cuCtxt));
} else {
// Primary context is not destroyed, but released
@@ -1253,6 +1308,7 @@ pi_result cuda_piMemRelease(pi_mem memObj) {
   pi_result ret = PI_SUCCESS;

   try {
+
     // Do nothing if there are other references
     if (memObj->decrement_reference_count() > 0) {
       return PI_SUCCESS;
@@ -1263,7 +1319,7 @@ pi_result cuda_piMemRelease(pi_mem memObj) {

     if (!memObj->is_sub_buffer()) {

-      ScopedContext(uniqueMemObj->get_context());
+      ScopedContext active(uniqueMemObj->get_context());

       switch (uniqueMemObj->allocMode_) {
       case _pi_mem::alloc_mode::classic:
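
The ScopedContext change here is a real bug fix worth calling out: the old statement constructed an unnamed temporary, so the RAII guard was destroyed at the end of that same statement instead of covering the switch below. Illustrative contrast (assumed semantics of the guard):

    ScopedContext(obj->get_context());        // temporary: scope ends immediately
    ScopedContext active(obj->get_context()); // named: active until end of block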
12 changes: 6 additions & 6 deletions sycl/plugins/opencl/pi_opencl.cpp

@@ -451,12 +451,12 @@ pi_result OCL(piextGetDeviceFunctionPointer)(pi_device device,
                                        function_pointer_ret));
 }

-pi_result OCL(piContextCreate)(
-    const cl_context_properties *properties, // TODO: untie from OpenCL
-    pi_uint32 num_devices, const pi_device *devices,
-    void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb,
-                       void *user_data1),
-    void *user_data, pi_context *retcontext) {
+pi_result OCL(piContextCreate)(const pi_context_properties *properties,
+                               pi_uint32 num_devices, const pi_device *devices,
+                               void (*pfn_notify)(const char *errinfo,
+                                                  const void *private_info,
+                                                  size_t cb, void *user_data1),
+                               void *user_data, pi_context *retcontext) {
   pi_result ret = PI_INVALID_OPERATION;
   *retcontext = cast<pi_context>(
       clCreateContext(properties, cast<cl_uint>(num_devices),
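
Note that the visible lines still forward properties straight to clCreateContext, so this signature change assumes pi_context_properties stays layout-compatible with cl_context_properties. A compile-time guard one could add (hypothetical, not in the PR):

    static_assert(sizeof(pi_context_properties) == sizeof(cl_context_properties),
                  "PI property lists are passed through to OpenCL unchanged");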
5 changes: 2 additions & 3 deletions sycl/source/detail/context_impl.cpp

@@ -44,9 +44,8 @@ context_impl::context_impl(const vector_class<cl::sycl::device> Devices,

   if (MPlatform->is_cuda()) {
 #if USE_PI_CUDA
-    const cl_context_properties props[] = {
-        PI_CONTEXT_PROPERTIES_CUDA_PRIMARY,
-        0};
+    const pi_context_properties props[] = {PI_CONTEXT_PROPERTIES_CUDA_PRIMARY,
+                                           UseCUDAPrimaryContext, 0};

     getPlugin().call<PiApiKind::piContextCreate>(props, DeviceIds.size(),
                                                  DeviceIds.data(), nullptr, nullptr, &MContext);
9 changes: 5 additions & 4 deletions sycl/source/detail/error_handling/enqueue_kernel.cpp

@@ -40,8 +40,8 @@ bool handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,

   size_t CompileWGSize[3] = {0};
   Plugin.call<PiApiKind::piKernelGetGroupInfo>(
-      Kernel, Device, PI_KERNEL_COMPILE_GROUP_INFO_SIZE, sizeof(size_t) * 3,
-      CompileWGSize, nullptr);
+      Kernel, Device, PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
+      sizeof(size_t) * 3, CompileWGSize, nullptr);

   if (CompileWGSize[0] != 0) {
     // OpenCL 1.x && 2.0:
@@ -90,10 +90,11 @@ bool handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
     // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the
     // total number of work-items in the work-group computed as
     // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater
-    // than the value specified by PI_KERNEL_GROUP_INFO_SIZE in table 5.21.
+    // than the value specified by PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE in
+    // table 5.21.
     size_t KernelWGSize = 0;
     Plugin.call<PiApiKind::piKernelGetGroupInfo>(
-        Kernel, Device, PI_KERNEL_GROUP_INFO_SIZE, sizeof(size_t),
+        Kernel, Device, PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE, sizeof(size_t),
         &KernelWGSize, nullptr);
     const size_t TotalNumberOfWIs =
         NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
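
For context, the check these renamed enums feed is simple: the product of the local sizes must not exceed the kernel's work-group limit. A sketch of the logic described in the comment above (simplified; the error type is assumed for illustration, and the real function also handles the compile-time-specified size):

    const size_t TotalNumberOfWIs =
        NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
    if (TotalNumberOfWIs > KernelWGSize)
      throw cl::sycl::nd_range_error(
          "Total number of work-items in a work-group cannot exceed "
          "the kernel work-group size",
          PI_INVALID_WORK_GROUP_SIZE);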