[WIP][SYCL] Replaces some of the CL_* enums with PI_* enums. #1221

Status: Closed. Wants to merge 20 commits.

Changes from all commits (20 commits)
84ddd10  [SYCL] Replaces some of the CL_* enums with PI_* enums. (rbegam, Feb 12, 2020)
6193e29  [SYCL] Rearranges enum pi_device_info. (rbegam, Feb 29, 2020)
071df8d  [SYCL] removes unneccessasy comments. (rbegam, Feb 29, 2020)
b3a9426  [SYCL] Fix mismatch between sub_group headers (#1215) (Pennycook, Feb 29, 2020)
3da5473  [SYCL] Fix command cleanup invoked from multiple threads (#1214) (sergey-semenov, Mar 1, 2020)
4b5d25b  [SYCL][NFC] Add clang-format configuration file for SYCL LIT tests (#… (bader, Mar 1, 2020)
c220eb8  [SYCL] Make context constructors explicit to avoid unintended convers… (jbrodman, Mar 2, 2020)
3035170  [SYCL] Disable tests which take more than 5 minutes (#1220) (vladimirlaz, Mar 2, 2020)
aa0619c  [SYCL] Fix check-sycl-deploy target problems (#1165) (Fznamznon, Mar 2, 2020)
745e759  [SYCL][CUDA] Handle the case of not having any CUDA device (#1212) (fwyzard, Mar 2, 2020)
b1aa222  [CUDA][PI] clang-format pi.h (bjoernknafla, Feb 27, 2020)
5e7ea06  [SYCL][CUDA] Fix context creation property parsing (bjoernknafla, Feb 27, 2020)
d214718  Update sycl/include/CL/sycl/detail/pi.h (rbegam, Mar 2, 2020)
399acef  Update sycl/include/CL/sycl/detail/pi.h (rbegam, Mar 2, 2020)
ef68270  [SYCL][CUDA] Fixes context release and unnamed context scope (#1207) (steffenlarsen, Mar 2, 2020)
a2bf2f1  [SYCL] Replaces some of the CL_* enums with PI_* enums. (rbegam, Feb 12, 2020)
ec0b39d  [SYCL] Rearranges enum pi_device_info. (rbegam, Feb 29, 2020)
5bf8bc9  [SYCL] Update sycl/include/CL/sycl/detail/pi.h (rbegam, Mar 2, 2020)
b6ec999  [SYCL] Resolves conflicts for sycl/include/CL/sycl/detail/pi.h (rbegam, Mar 3, 2020)
15c44dd  Merge branch 'private/rbegam/sycl-rename' of https://github.com/rbega… (rbegam, Mar 3, 2020)
9 changes: 4 additions & 5 deletions buildbot/testlist.cfg

@@ -11,7 +11,7 @@
 :test_exception_handling
 :test_group
 :test_h_item
-:test_handler
+#:test_handler
 :test_header
 :test_hierarchical
 :test_id
@@ -20,7 +20,7 @@
 :test_item
 :test_kernel
 :test_kernel_args
-:test_math_builtin_api
+#:test_math_builtin_api
 :test_multi_ptr
 :test_nd_item
 :test_nd_range
@@ -38,8 +38,7 @@
 :test_vector_api
 :test_vector_constructors
 :test_vector_load_store
-# Disable test to speedup testing until JIT is optimized
 #:test_vector_operators
 :test_vector_swizzle_assignment
-:test_vector_swizzles
-:test_vector_swizzles_opencl
+#:test_vector_swizzles
+#:test_vector_swizzles_opencl
4 changes: 3 additions & 1 deletion sycl/include/CL/sycl/backend/cuda.hpp

@@ -18,7 +18,9 @@ namespace cuda {

 // Mem Object info: Retrieve the raw CUDA pointer from a cl_mem
 #define PI_CUDA_RAW_POINTER (0xFF01)
-// Context creation: Use the primary context instead of a custom one
+// Context creation: Use a primary CUDA context instead of a custom one by
+// providing a property value of PI_TRUE for the following
+// property ID.
 #define PI_CONTEXT_PROPERTIES_CUDA_PRIMARY (0xFF02)

 // PI Command Queue using Default stream
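
For illustration, a PI client would request the primary CUDA context by pairing this property ID with PI_TRUE in the zero-terminated property list passed to piContextCreate. A minimal sketch (assumed usage, not part of the diff; `device` is a placeholder for a valid pi_device):

    // Request the primary CUDA context for one device (sketch).
    pi_context_properties props[] = {PI_CONTEXT_PROPERTIES_CUDA_PRIMARY,
                                     PI_TRUE, 0};
    pi_context ctx = nullptr;
    pi_result res = cuda_piContextCreate(props, 1, &device,
                                         /*pfn_notify=*/nullptr,
                                         /*user_data=*/nullptr, &ctx);

This mirrors the property array built in sycl/source/detail/context_impl.cpp later in this diff.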
13 changes: 7 additions & 6 deletions sycl/include/CL/sycl/context.hpp

@@ -49,8 +49,8 @@ class context {
   /// @param AsyncHandler is an instance of async_handler.
   /// @param UseCUDAPrimaryContext is a bool determining whether to use the
   /// primary context in the CUDA backend.
-  context(const device &Device, async_handler AsyncHandler = {},
-          bool UseCUDAPrimaryContext = false);
+  explicit context(const device &Device, async_handler AsyncHandler = {},
+                   bool UseCUDAPrimaryContext = false);

   /// Constructs a SYCL context instance using the provided platform.
   ///
@@ -63,8 +63,8 @@
   /// @param AsyncHandler is an instance of async_handler.
   /// @param UseCUDAPrimaryContext is a bool determining whether to use the
   /// primary context in the CUDA backend.
-  context(const platform &Platform, async_handler AsyncHandler = {},
-          bool UseCUDAPrimaryContext = false);
+  explicit context(const platform &Platform, async_handler AsyncHandler = {},
+                   bool UseCUDAPrimaryContext = false);

   /// Constructs a SYCL context instance using list of devices.
   ///
@@ -78,8 +78,9 @@
   /// @param AsyncHandler is an instance of async_handler.
   /// @param UseCUDAPrimaryContext is a bool determining whether to use the
   /// primary context in the CUDA backend.
-  context(const vector_class<device> &DeviceList,
-          async_handler AsyncHandler = {}, bool UseCUDAPrimaryContext = false);
+  explicit context(const vector_class<device> &DeviceList,
+                   async_handler AsyncHandler = {},
+                   bool UseCUDAPrimaryContext = false);

   /// Constructs a SYCL context instance from OpenCL cl_context.
   ///
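
The practical effect of marking these constructors explicit: a device, platform, or device list no longer converts implicitly to a context. A sketch of the behavior change (assumed caller code, not from the PR):

    sycl::device Dev;
    sycl::context C1{Dev};     // OK: direct initialization still works
    // sycl::context C2 = Dev; // now ill-formed: copy-initialization would
                               // need the implicit conversion that
                               // `explicit` forbids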
1,232 changes: 567 additions & 665 deletions sycl/include/CL/sycl/detail/pi.h

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/intel/sub_group_host.hpp

@@ -136,7 +136,7 @@ struct sub_group {
   }

   template <typename T, access::address_space Space>
-  void store(multi_ptr<T, Space> dst, T &x) const {
+  void store(multi_ptr<T, Space> dst, const T &x) const {
    throw runtime_error("Subgroups are not supported on host device. ");
  }
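
Taking x by const reference appears to come from the header-mismatch fix (#1215) and also lets store bind temporaries. An assumed call site:

    // With `const T &x`, an rvalue argument now binds (sketch; `sg` and
    // `dst` are placeholders for a sub_group and a multi_ptr):
    sg.store(dst, 42);  // a literal could not bind to the old non-const `T &`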
2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/queue.hpp

@@ -239,7 +239,7 @@ class queue {
   /// @param Length is a number of bytes in the allocation.
   /// @param Advice is a device-defined advice for the specified allocation.
   /// @return an event representing advice operation.
-  event mem_advise(const void *Ptr, size_t Length, int Advice);
+  event mem_advise(const void *Ptr, size_t Length, pi_mem_advice Advice);

   /// Provides hints to the runtime library that data should be made available
   /// on a device earlier than Unified Shared Memory would normally require it
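
Typing the parameter as pi_mem_advice instead of a raw int gives call sites compile-time checking of the advice value. A hypothetical caller (names assumed for illustration):

    // Forwarding a typed advice value to the queue (sketch).
    sycl::event advise(sycl::queue &Q, const void *Ptr, size_t Bytes,
                       pi_mem_advice Advice) {
      return Q.mem_advise(Ptr, Bytes, Advice);  // a raw int no longer converts
    }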
166 changes: 111 additions & 55 deletions sycl/plugins/cuda/pi_cuda.cpp

@@ -528,43 +528,57 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
                               pi_uint32 *num_platforms) {

   try {
-    static constexpr pi_uint32 numPlatforms = 1;
+    static std::once_flag initFlag;
+    static pi_uint32 numPlatforms = 1;
+    static _pi_platform platformId;

-    if (num_platforms != nullptr) {
-      *num_platforms = numPlatforms;
+    if (num_entries == 0 and platforms != nullptr) {
+      return PI_INVALID_VALUE;
+    }
+    if (platforms == nullptr and num_platforms == nullptr) {
+      return PI_INVALID_VALUE;
     }

     pi_result err = PI_SUCCESS;

-    if (platforms != nullptr) {
-
-      assert(num_entries != 0);
-
-      static std::once_flag initFlag;
-      static _pi_platform platformId;
-      std::call_once(
-          initFlag,
-          [](pi_result &err) {
-            err = PI_CHECK_ERROR(cuInit(0));
-
-            int numDevices = 0;
-            err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices));
-            try {
-              platformId.devices_.reserve(numDevices);
-              for (int i = 0; i < numDevices; ++i) {
-                CUdevice device;
-                err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
-                platformId.devices_.emplace_back(
-                    new _pi_device{device, &platformId});
-              }
-            } catch (...) {
-              // Clear and rethrow to allow retry
-              platformId.devices_.clear();
-              throw;
-            }
-          },
-          err);
+    std::call_once(
+        initFlag,
+        [](pi_result &err) {
+          if (cuInit(0) != CUDA_SUCCESS) {
+            numPlatforms = 0;
+            return;
+          }
+          int numDevices = 0;
+          err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices));
+          if (numDevices == 0) {
+            numPlatforms = 0;
+            return;
+          }
+          try {
+            for (int i = 0; i < numDevices; ++i) {
+              CUdevice device;
+              err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
+              platformId.devices_.emplace_back(
+                  new _pi_device{device, &platformId});
+            }
+          } catch (const std::bad_alloc &) {
+            // Signal out-of-memory situation
+            platformId.devices_.clear();
+            err = PI_OUT_OF_HOST_MEMORY;
+          } catch (...) {
+            // Clear and rethrow to allow retry
+            platformId.devices_.clear();
+            throw;
+          }
+        },
+        err);

+    if (num_platforms != nullptr) {
+      *num_platforms = numPlatforms;
+    }
+
+    if (platforms != nullptr) {
       *platforms = &platformId;
     }
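
The added argument validation matches the usual two-call query idiom, which PI clients are assumed to follow (sketch, not from the diff):

    // First call: ask only for the platform count.
    pi_uint32 count = 0;
    cuda_piPlatformsGet(0, nullptr, &count);
    // Second call: retrieve the handles (count may be 0 with no CUDA device).
    std::vector<pi_platform> platforms(count);
    cuda_piPlatformsGet(count, platforms.data(), nullptr);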

@@ -1110,12 +1124,30 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
 }

 /* Context APIs */
-pi_result cuda_piContextCreate(const cl_context_properties *properties,
-                               pi_uint32 num_devices, const pi_device *devices,
-                               void (*pfn_notify)(const char *errinfo,
-                                                  const void *private_info,
-                                                  size_t cb, void *user_data),
-                               void *user_data, pi_context *retcontext) {
+
+/// Create a PI CUDA context.
+///
+/// By default creates a scoped context and keeps the last active CUDA context
+/// on top of the CUDA context stack.
+/// With the PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of PI_TRUE
+/// creates a primary CUDA context and activates it on the CUDA context stack.
+///
+/// @param[in] properties 0 terminated array of key/id-value combinations. Can
+/// be nullptr. Only accepts property key/id PI_CONTEXT_PROPERTIES_CUDA_PRIMARY
+/// with a pi_bool value.
+/// @param[in] num_devices Number of devices to create the context for.
+/// @param[in] devices Devices to create the context for.
+/// @param[in] pfn_notify Callback, currently unused.
+/// @param[in] user_data User data for callback.
+/// @param[out] retcontext Set to created context on success.
+///
+/// @return PI_SUCCESS on success, otherwise an error return code.
+pi_result cuda_piContextCreate(const pi_context_properties *properties,
+                               pi_uint32 num_devices, const pi_device *devices,
+                               void (*pfn_notify)(const char *errinfo,
+                                                  const void *private_info,
+                                                  size_t cb, void *user_data),
+                               void *user_data, pi_context *retcontext) {

   assert(devices != nullptr);
   // TODO: How to implement context callback?
@@ -1127,31 +1159,51 @@
   assert(retcontext != nullptr);
   pi_result errcode_ret = PI_SUCCESS;

+  // Parse properties.
+  bool property_cuda_primary = false;
+  while (properties && (0 != *properties)) {
+    // Consume property ID.
+    pi_context_properties id = *properties;
+    ++properties;
+    // Consume property value.
+    pi_context_properties value = *properties;
+    ++properties;
+    switch (id) {
+    case PI_CONTEXT_PROPERTIES_CUDA_PRIMARY:
+      assert(value == PI_FALSE || value == PI_TRUE);
+      property_cuda_primary = static_cast<bool>(value);
+      break;
+    default:
+      // Unknown property.
+      assert(!"Unknown piContextCreate property in property list");
+      return PI_INVALID_VALUE;
+    }
+  }
+
   std::unique_ptr<_pi_context> piContextPtr{nullptr};
   try {
-    if (properties && *properties != PI_CONTEXT_PROPERTIES_CUDA_PRIMARY) {
-      throw pi_result(CL_INVALID_VALUE);
-    } else if (!properties) {
+    if (property_cuda_primary) {
+      // Use the CUDA primary context and assume that we want to use it
+      // immediately as we want to forge context switches.
+      CUcontext Ctxt;
+      errcode_ret = PI_CHECK_ERROR(
+          cuDevicePrimaryCtxRetain(&Ctxt, devices[0]->cuDevice_));
+      piContextPtr = std::unique_ptr<_pi_context>(
+          new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
+      errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt));
+    } else {
+      // Create a scoped context.
       CUcontext newContext, current;
       PI_CHECK_ERROR(cuCtxGetCurrent(&current));
-      errcode_ret = PI_CHECK_ERROR(cuCtxCreate(&newContext, CU_CTX_MAP_HOST,
-                                               (*devices)->cuDevice_));
+      errcode_ret = PI_CHECK_ERROR(
+          cuCtxCreate(&newContext, CU_CTX_MAP_HOST, devices[0]->cuDevice_));
       piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{
           _pi_context::kind::user_defined, newContext, *devices});
+      // For scoped contexts keep the last active CUDA one on top of the stack
+      // as `cuCtxCreate` replaces it implicitly otherwise.
       if (current != nullptr) {
         // If there was an existing context on the thread we recover it
         PI_CHECK_ERROR(cuCtxSetCurrent(current));
       }
-    } else if (properties
-               && *properties == PI_CONTEXT_PROPERTIES_CUDA_PRIMARY) {
-      CUcontext Ctxt;
-      errcode_ret = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(
-          &Ctxt, (*devices)->cuDevice_));
-      piContextPtr = std::unique_ptr<_pi_context>(
-          new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
-      errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt));
-    } else {
-      throw pi_result(CL_INVALID_VALUE);
     }

     *retcontext = piContextPtr.release();
Expand All @@ -1178,11 +1230,14 @@ pi_result cuda_piContextRelease(pi_context ctxt) {
CUcontext cuCtxt = ctxt->get();
CUcontext current = nullptr;
cuCtxGetCurrent(&current);
if(cuCtxt != current)
{
PI_CHECK_ERROR(cuCtxSetCurrent(cuCtxt));
if (cuCtxt != current) {
PI_CHECK_ERROR(cuCtxPushCurrent(cuCtxt));
}
PI_CHECK_ERROR(cuCtxSynchronize());
cuCtxGetCurrent(&current);
if (cuCtxt == current) {
PI_CHECK_ERROR(cuCtxPopCurrent(&current));
}
return PI_CHECK_ERROR(cuCtxDestroy(cuCtxt));
} else {
// Primary context is not destroyed, but released
@@ -1253,6 +1308,7 @@ pi_result cuda_piMemRelease(pi_mem memObj) {
   pi_result ret = PI_SUCCESS;

   try {
+
     // Do nothing if there are other references
     if (memObj->decrement_reference_count() > 0) {
       return PI_SUCCESS;
@@ -1263,7 +1319,7 @@ pi_result cuda_piMemRelease(pi_mem memObj) {

     if (!memObj->is_sub_buffer()) {

-      ScopedContext(uniqueMemObj->get_context());
+      ScopedContext active(uniqueMemObj->get_context());

       switch (uniqueMemObj->allocMode_) {
       case _pi_mem::alloc_mode::classic:
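
The ScopedContext change here is a real bug fix worth calling out: the old statement constructed an unnamed temporary, so the RAII guard was destroyed at the end of that same statement instead of covering the switch below. Illustrative contrast (assumed semantics of the guard):

    ScopedContext(obj->get_context());        // temporary: scope ends immediately
    ScopedContext active(obj->get_context()); // named: active until end of block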
12 changes: 6 additions & 6 deletions sycl/plugins/opencl/pi_opencl.cpp

@@ -451,12 +451,12 @@ pi_result OCL(piextGetDeviceFunctionPointer)(pi_device device,
                                        function_pointer_ret));
 }

-pi_result OCL(piContextCreate)(
-    const cl_context_properties *properties, // TODO: untie from OpenCL
-    pi_uint32 num_devices, const pi_device *devices,
-    void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb,
-                       void *user_data1),
-    void *user_data, pi_context *retcontext) {
+pi_result OCL(piContextCreate)(const pi_context_properties *properties,
+                               pi_uint32 num_devices, const pi_device *devices,
+                               void (*pfn_notify)(const char *errinfo,
+                                                  const void *private_info,
+                                                  size_t cb, void *user_data1),
+                               void *user_data, pi_context *retcontext) {
   pi_result ret = PI_INVALID_OPERATION;
   *retcontext = cast<pi_context>(
       clCreateContext(properties, cast<cl_uint>(num_devices),
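
Note that the visible lines still forward properties straight to clCreateContext, so this signature change assumes pi_context_properties stays layout-compatible with cl_context_properties. A compile-time guard one could add (hypothetical, not in the PR):

    static_assert(sizeof(pi_context_properties) == sizeof(cl_context_properties),
                  "PI property lists are passed through to OpenCL unchanged");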
5 changes: 2 additions & 3 deletions sycl/source/detail/context_impl.cpp

@@ -44,9 +44,8 @@ context_impl::context_impl(const vector_class<cl::sycl::device> Devices,

   if (MPlatform->is_cuda()) {
 #if USE_PI_CUDA
-    const cl_context_properties props[] = {
-        PI_CONTEXT_PROPERTIES_CUDA_PRIMARY,
-        0};
+    const pi_context_properties props[] = {PI_CONTEXT_PROPERTIES_CUDA_PRIMARY,
+                                           UseCUDAPrimaryContext, 0};

     getPlugin().call<PiApiKind::piContextCreate>(props, DeviceIds.size(),
                                                  DeviceIds.data(), nullptr, nullptr, &MContext);
9 changes: 5 additions & 4 deletions sycl/source/detail/error_handling/enqueue_kernel.cpp

@@ -40,8 +40,8 @@ bool handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,

   size_t CompileWGSize[3] = {0};
   Plugin.call<PiApiKind::piKernelGetGroupInfo>(
-      Kernel, Device, PI_KERNEL_COMPILE_GROUP_INFO_SIZE, sizeof(size_t) * 3,
-      CompileWGSize, nullptr);
+      Kernel, Device, PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
+      sizeof(size_t) * 3, CompileWGSize, nullptr);

   if (CompileWGSize[0] != 0) {
     // OpenCL 1.x && 2.0:
@@ -90,10 +90,11 @@ bool handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
     // PI_INVALID_WORK_GROUP_SIZE if local_work_size is specified and the
     // total number of work-items in the work-group computed as
     // local_work_size[0] * ... * local_work_size[work_dim – 1] is greater
-    // than the value specified by PI_KERNEL_GROUP_INFO_SIZE in table 5.21.
+    // than the value specified by PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE in
+    // table 5.21.
     size_t KernelWGSize = 0;
     Plugin.call<PiApiKind::piKernelGetGroupInfo>(
-        Kernel, Device, PI_KERNEL_GROUP_INFO_SIZE, sizeof(size_t),
+        Kernel, Device, PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE, sizeof(size_t),
         &KernelWGSize, nullptr);
     const size_t TotalNumberOfWIs =
         NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
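
For context, the check these renamed enums feed is simple: the product of the local sizes must not exceed the kernel's work-group limit. A sketch of the logic described in the comment above (simplified; the error type is assumed for illustration, and the real function also handles the compile-time-specified size):

    const size_t TotalNumberOfWIs =
        NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
    if (TotalNumberOfWIs > KernelWGSize)
      throw cl::sycl::nd_range_error(
          "Total number of work-items in a work-group cannot exceed "
          "the kernel work-group size",
          PI_INVALID_WORK_GROUP_SIZE);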