Skip to content

[SYCL]: basic support of contexts with multiple devices in Level-Zero #2440

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Sep 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 129 additions & 79 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
/// \ingroup sycl_pi_level_zero

#include "pi_level_zero.hpp"
#include <algorithm>
#include <cstdarg>
#include <cstdio>
#include <cstring>
Expand Down Expand Up @@ -219,9 +220,13 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool,
ZeEventPoolDesc.count = MaxNumEventsPerPool;
ZeEventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;

ze_device_handle_t ZeDevice = Device->ZeDevice;
if (ze_result_t ZeRes = zeEventPoolCreate(ZeContext, &ZeEventPoolDesc, 1,
&ZeDevice, &ZeEventPool))
std::vector<ze_device_handle_t> ZeDevices;
std::for_each(Devices.begin(), Devices.end(),
[&](pi_device &D) { ZeDevices.push_back(D->ZeDevice); });

if (ze_result_t ZeRes =
zeEventPoolCreate(ZeContext, &ZeEventPoolDesc, ZeDevices.size(),
&ZeDevices[0], &ZeEventPool))
return ZeRes;
NumEventsAvailableInEventPool[ZeEventPool] = MaxNumEventsPerPool - 1;
NumEventsLiveInEventPool[ZeEventPool] = MaxNumEventsPerPool;
Expand Down Expand Up @@ -408,9 +413,9 @@ _pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
ZE_CALL(zeFenceReset(this->ZeCommandListFenceMap[ZeCommandList]));
ZE_CALL(zeCommandListReset(ZeCommandList));
if (MakeAvailable) {
this->Context->Device->ZeCommandListCacheMutex.lock();
this->Context->Device->ZeCommandListCache.push_back(ZeCommandList);
this->Context->Device->ZeCommandListCacheMutex.unlock();
this->Device->ZeCommandListCacheMutex.lock();
this->Device->ZeCommandListCache.push_back(ZeCommandList);
this->Device->ZeCommandListCacheMutex.unlock();
}

return PI_SUCCESS;
Expand All @@ -433,7 +438,7 @@ _pi_device::getAvailableCommandList(pi_queue Queue,

// Initially, we need to check if a command list has already been created
// on this device that is available for use. If so, then reuse that
// L0 Command List and Fence for this PI call.
// Level-Zero Command List and Fence for this PI call.
if (Queue->Device->ZeCommandListCache.size() > 0) {
Queue->Device->ZeCommandListCacheMutex.lock();
*ZeCommandList = Queue->Device->ZeCommandListCache.front();
Expand Down Expand Up @@ -749,11 +754,25 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle,
assert(Platform);

// Create PI platform from the given Level Zero driver handle.
// TODO: get the platform from the platforms' cache.
auto ZeDriver = pi_cast<ze_driver_handle_t>(NativeHandle);
*Platform = new _pi_platform(ZeDriver);
return PI_SUCCESS;
}

// Look up the PI device held in this platform's device cache that wraps the
// given Level-Zero device handle.
// Yields nullptr when no cached PI device matches the handle.
pi_device _pi_platform::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) {
  // Keep the cache mutex held for the whole scan.
  std::lock_guard<std::mutex> CacheGuard(this->PiDevicesCacheMutex);

  // Walk the cache in order and hand back the first entry whose native
  // handle equals the requested one.
  for (pi_device &CachedDevice : PiDevicesCache) {
    if (CachedDevice->ZeDevice == ZeDevice)
      return CachedDevice;
  }

  // Nothing in the cache corresponds to this handle.
  return nullptr;
}

pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
pi_uint32 NumEntries, pi_device *Devices,
pi_uint32 *NumDevices) {
Expand Down Expand Up @@ -1391,6 +1410,7 @@ pi_result piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle,
assert(Platform);

// Create PI device from the given Level Zero device handle.
// TODO: get the device from the devices' cache.
auto ZeDevice = pi_cast<ze_device_handle_t>(NativeHandle);
*Device = new _pi_device(ZeDevice, Platform);
return (*Device)->initialize();
Expand All @@ -1402,15 +1422,14 @@ pi_result piContextCreate(const pi_context_properties *Properties,
const void *PrivateInfo, size_t CB,
void *UserData),
void *UserData, pi_context *RetContext) {
if (NumDevices != 1 || !Devices) {
zePrint("piCreateContext: context should have exactly one Device\n");
if (!Devices) {
return PI_INVALID_VALUE;
}

assert(RetContext);

try {
*RetContext = new _pi_context(*Devices);
*RetContext = new _pi_context(NumDevices, Devices);
} catch (const std::bad_alloc &) {
return PI_OUT_OF_HOST_MEMORY;
} catch (...) {
Expand Down Expand Up @@ -1444,9 +1463,10 @@ pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName,
ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
switch (ParamName) {
case PI_CONTEXT_INFO_DEVICES:
return ReturnValue(Context->Device);
return getInfoArray(Context->Devices.size(), ParamValueSize, ParamValue,
ParamValueSizeRet, &Context->Devices[0]);
case PI_CONTEXT_INFO_NUM_DEVICES:
return ReturnValue(pi_uint32{1});
return ReturnValue(pi_uint32(Context->Devices.size()));
case PI_CONTEXT_INFO_REFERENCE_COUNT:
return ReturnValue(pi_uint32{Context->RefCount});
default:
Expand Down Expand Up @@ -1521,7 +1541,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
if (!Context) {
return PI_INVALID_CONTEXT;
}
if (Context->Device != Device) {
if (std::find(Context->Devices.begin(), Context->Devices.end(), Device) ==
Context->Devices.end()) {
return PI_INVALID_DEVICE;
}

Expand Down Expand Up @@ -1628,7 +1649,11 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
assert(Queue);

auto ZeQueue = pi_cast<ze_command_queue_handle_t>(NativeHandle);
*Queue = new _pi_queue(ZeQueue, Context, Context->Device);

// Attach the queue to the "0" device.
// TODO: see if we need to let user choose the device.
pi_device Device = Context->Devices[0];
*Queue = new _pi_queue(ZeQueue, Context, Device);
return PI_SUCCESS;
}

Expand All @@ -1641,14 +1666,24 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
assert(RetMem);

void *Ptr;
ze_device_handle_t ZeDevice = Context->Device->ZeDevice;

ze_device_mem_alloc_desc_t ZeDesc = {};
ZeDesc.flags = 0;
ZeDesc.ordinal = 0;
ZE_CALL(zeMemAllocDevice(Context->ZeContext, &ZeDesc, Size,
1, // TODO: alignment
ZeDevice, &Ptr));
ze_device_mem_alloc_desc_t ZeDeviceMemDesc = {};
ZeDeviceMemDesc.flags = 0;
ZeDeviceMemDesc.ordinal = 0;

if (Context->Devices.size() == 1) {
ZE_CALL(zeMemAllocDevice(Context->ZeContext, &ZeDeviceMemDesc, Size,
1, // TODO: alignment
Context->Devices[0]->ZeDevice, &Ptr));
} else {
ze_host_mem_alloc_desc_t ZeHostMemDesc = {};
ZeHostMemDesc.flags = 0;
ZE_CALL(zeMemAllocShared(Context->ZeContext, &ZeDeviceMemDesc,
&ZeHostMemDesc, Size,
1, // TODO: alignment
nullptr, // not bound to any device
&Ptr));
}

if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
(Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
Expand Down Expand Up @@ -1837,9 +1872,17 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags,
ZeImageDesc.arraylevels = pi_cast<uint32_t>(ImageDesc->image_array_size);
ZeImageDesc.miplevels = ImageDesc->num_mip_levels;

// Have the "0" device in context to own the image. Rely on Level-Zero
// drivers to perform migration as necessary for sharing it across multiple
// devices in the context.
//
// TODO: figure out if we instead need explicit copying for accessing
// the image from other devices in the context.
//
pi_device Device = Context->Devices[0];
ze_image_handle_t ZeHImage;
ZE_CALL(zeImageCreate(Context->ZeContext, Context->Device->ZeDevice,
&ZeImageDesc, &ZeHImage));
ZE_CALL(zeImageCreate(Context->ZeContext, Device->ZeDevice, &ZeImageDesc,
&ZeHImage));

auto HostPtrOrNull =
(Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast<char *>(HostPtr) : nullptr;
Expand Down Expand Up @@ -1926,7 +1969,7 @@ pi_result piProgramCreateWithBinary(pi_context Context, pi_uint32 NumDevices,
*BinaryStatus = PI_INVALID_VALUE;
return PI_INVALID_VALUE;
}
if (DeviceList[0] != Context->Device)
if (DeviceList[0] != Context->Devices[0])
return PI_INVALID_DEVICE;

size_t Length = Lengths[0];
Expand Down Expand Up @@ -1975,10 +2018,11 @@ pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName,
case PI_PROGRAM_INFO_REFERENCE_COUNT:
return ReturnValue(pi_uint32{Program->RefCount});
case PI_PROGRAM_INFO_NUM_DEVICES:
// Level Zero Module is always for a single device.
// TODO: return true number of devices this program exists for.
return ReturnValue(pi_uint32{1});
case PI_PROGRAM_INFO_DEVICES:
return ReturnValue(Program->Context->Device);
// TODO: return all devices this program exists for.
return ReturnValue(Program->Context->Devices[0]);
case PI_PROGRAM_INFO_BINARY_SIZES: {
size_t SzBinary;
if (Program->State == _pi_program::IL ||
Expand Down Expand Up @@ -2105,9 +2149,10 @@ pi_result piProgramLink(pi_context Context, pi_uint32 NumDevices,
void (*PFnNotify)(pi_program Program, void *UserData),
void *UserData, pi_program *RetProgram) {

// We only support one device with Level Zero.
// We only support one device with Level Zero currently.
pi_device Device = Context->Devices[0];
assert(NumDevices == 1);
assert(DeviceList && DeviceList[0] == Context->Device);
assert(DeviceList && DeviceList[0] == Device);
assert(!PFnNotify && !UserData);

// Validate input parameters.
Expand Down Expand Up @@ -2170,9 +2215,8 @@ pi_result piProgramLink(pi_context Context, pi_uint32 NumDevices,
// only export symbols.
Guard.unlock();
ze_module_handle_t ZeModule;
pi_result res =
copyModule(Context->ZeContext, Context->Device->ZeDevice,
Input->ZeModule, &ZeModule);
pi_result res = copyModule(Context->ZeContext, Device->ZeDevice,
Input->ZeModule, &ZeModule);
if (res != PI_SUCCESS) {
return res;
}
Expand Down Expand Up @@ -2270,7 +2314,9 @@ static pi_result compileOrBuild(pi_program Program, pi_uint32 NumDevices,
if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList))
return PI_INVALID_VALUE;

// We only support one device with Level Zero.
// We only support build to one device with Level Zero now.
// TODO: we should eventually build to the possibly multiple root
// devices in the context.
assert(NumDevices == 1 && DeviceList);

// We should have either IL or native device code.
Expand Down Expand Up @@ -2307,7 +2353,7 @@ static pi_result compileOrBuild(pi_program Program, pi_uint32 NumDevices,
ZeModuleDesc.pBuildFlags = Options;
ZeModuleDesc.pConstants = &ZeSpecConstants;

ze_device_handle_t ZeDevice = Program->Context->Device->ZeDevice;
ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice;
ze_context_handle_t ZeContext = Program->Context->ZeContext;
ze_module_handle_t ZeModule;
ze_module_build_log_handle_t ZeBuildLog;
Expand Down Expand Up @@ -2905,7 +2951,8 @@ pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
ze_event_handle_t ZeEvent;
ze_event_desc_t ZeEventDesc = {};
// We have to set the SIGNAL & WAIT flags as HOST scope because the
// L0 plugin implementation waits for the events to complete on the host.
// Level-Zero plugin implementation waits for the events to complete
// on the host.
ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
ZeEventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST;
ZeEventDesc.index = Index;
Expand Down Expand Up @@ -3111,7 +3158,14 @@ pi_result piSamplerCreate(pi_context Context,
assert(Context);
assert(RetSampler);

ze_device_handle_t ZeDevice = Context->Device->ZeDevice;
// Have the "0" device in context to own the sampler. Rely on Level-Zero
// drivers to perform migration as necessary for sharing it across multiple
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@smaslov-intel : What do you mean by rely on Level-Zero for migration across devices?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jandres742 , I mean that the image/sampler created on one device with zeImageCreate/zeSamplerCreate are made available for access in other devices of this SYCL context without any explicit transfers in the Level-Zero plugin. Note that for buffers I use zeMemAllocShared but image/sampler don't have shared counterparts.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

L0 does not implicitly migrate between devices, nor for images nor buffers. From here, https://spec.oneapi.com/level-zero/latest/core/PROG.html#memory:

Shared allocations: Migratable to: Another Device = Optional

Additionally. Access from one device to another device's buffers and images are not supported at all moments. Proper way of confirming such access exist is using zeDeviceCanAccessPeer() https://spec.oneapi.com/level-zero/latest/core/api.html?highlight=canaccess#zedevicecanaccesspeer

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @jandres742 !

L0 does not implicitly migrate between devices, nor for images nor buffers.

Not even between sub-devices of the same root device?

Shared allocations: Migratable to: Another Device = Optional

So what Optional really means? Is this an option of what?

not supported at all moments

At what circumstances can we rely on it?

Proper way of confirming such access exist is using zeDeviceCanAccessPeer()

But we need a way to guarantee such sharing between devices in the same SYCL context.
Is there a way to force the migration that is optional?
Should we just copy explicitly if no?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jandres742 , ping

I've added TODO comments for now and would like to proceed with this PR, please approve if you are OK with that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not even between sub-devices of the same root device?

There's no need for migrations between sub-devices, as they are part of the same device and each sub-device can access another sub-device's memory. The issue arises when we are referring to migrations between two separate physical devices.

So what Optional really means? Is this an option of what?

That means that it depends on HW and SW (L0 driver and kernel driver) support. It will not always be present on all platforms.

But we need a way to guarantee such sharing between devices in the same SYCL context.
Is there a way to force the migration that is optional?
Should we just copy explicitly if no?

Exactly. The only way of enforcing migrations between devices is to implement a SW fallback with bounce buffering in the host.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks good 👍

// devices in the context.
//
// TODO: figure out if we instead need explicit copying for accessing
// the sampler from other devices in the context.
//
pi_device Device = Context->Devices[0];

ze_sampler_handle_t ZeSampler;
ze_sampler_desc_t ZeSamplerDesc = {};
Expand Down Expand Up @@ -3199,7 +3253,7 @@ pi_result piSamplerCreate(pi_context Context,
}
}

ZE_CALL(zeSamplerCreate(Context->ZeContext, ZeDevice,
ZE_CALL(zeSamplerCreate(Context->ZeContext, Device->ZeDevice,
&ZeSamplerDesc, // TODO: translate properties
&ZeSampler));

Expand Down Expand Up @@ -4241,49 +4295,44 @@ pi_result piextUSMFree(pi_context Context, void *Ptr) {
ze_memory_allocation_properties_t ZeMemoryAllocationProperties = {};

// Query memory type of the pointer we're freeing to determine the correct
// way to do it(directly or via the allocator)
// way to do it(directly or via an allocator)
ZE_CALL(zeMemGetAllocProperties(
Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, &ZeDeviceHandle));

// TODO: when support for multiple devices is implemented, here
// we should do the following:
// - Find pi_device instance corresponding to ZeDeviceHandle we've just got if
// exist
// - Use that pi_device to find the right allocator context and free the
// pointer.

// The allocation doesn't belong to any device for which USM allocator is
// enabled.
if (Context->Device->ZeDevice != ZeDeviceHandle) {
return USMFreeImpl(Context, Ptr);
}

auto DeallocationHelper =
[Context,
Ptr](std::unordered_map<pi_device, USMAllocContext> &AllocContextMap) {
try {
auto It = AllocContextMap.find(Context->Device);
if (It == AllocContextMap.end())
return PI_INVALID_VALUE;

// The right context is found, deallocate the pointer
It->second.deallocate(Ptr);
} catch (const UsmAllocationException &Ex) {
return Ex.getError();
}
if (ZeDeviceHandle) {
// All devices in the context are of the same platform.
auto Platform = Context->Devices[0]->Platform;
auto Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle);
assert(Device);

auto DeallocationHelper =
[Context, Device,
Ptr](std::unordered_map<pi_device, USMAllocContext> &AllocContextMap) {
try {
auto It = AllocContextMap.find(Device);
if (It == AllocContextMap.end())
return PI_INVALID_VALUE;

// The right context is found, deallocate the pointer
It->second.deallocate(Ptr);
} catch (const UsmAllocationException &Ex) {
return Ex.getError();
}

return PI_SUCCESS;
};
return PI_SUCCESS;
};

switch (ZeMemoryAllocationProperties.type) {
case ZE_MEMORY_TYPE_SHARED:
return DeallocationHelper(Context->SharedMemAllocContexts);
case ZE_MEMORY_TYPE_DEVICE:
return DeallocationHelper(Context->DeviceMemAllocContexts);
default:
// Handled below
break;
switch (ZeMemoryAllocationProperties.type) {
case ZE_MEMORY_TYPE_SHARED:
return DeallocationHelper(Context->SharedMemAllocContexts);
case ZE_MEMORY_TYPE_DEVICE:
return DeallocationHelper(Context->DeviceMemAllocContexts);
default:
// Handled below
break;
}
}

return USMFreeImpl(Context, Ptr);
}

Expand Down Expand Up @@ -4519,14 +4568,15 @@ pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr,
}
return ReturnValue(MemAllocaType);
}
case PI_MEM_ALLOC_DEVICE: {
case PI_MEM_ALLOC_DEVICE:
if (ZeDeviceHandle) {
if (Context->Device->ZeDevice == ZeDeviceHandle) {
return ReturnValue(Context->Device);
}
// All devices in the context are of the same platform.
auto Platform = Context->Devices[0]->Platform;
auto Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle);
return Device ? ReturnValue(Device) : PI_INVALID_VALUE;
} else {
return PI_INVALID_VALUE;
}
return PI_INVALID_VALUE;
}
case PI_MEM_ALLOC_BASE_PTR: {
void *Base;
ZE_CALL(zeMemGetAddressRange(Context->ZeContext, Ptr, &Base, nullptr));
Expand Down
Loading