Skip to content

Commit 43ba606

Browse files
authored
[SYCL][L0] Implement pi_device and pi_platform cache (#2227)
The current implementation piDevicesGet and piPlatformsGet always create new pi_device and pi_platform object even if the low-level ze_handles are the same. This makes SYCL RT difficult to determine whether sycl::device is the same. Same issue applies to sycl::platform. By implementing cache, it can avoid calling expensive L0 RT and return the saved pi_device and pi_platform from the cache. This should help remove the memory leak and improve the overall performance of plugins.
1 parent e6fd911 commit 43ba606

File tree

3 files changed

+56
-14
lines changed

3 files changed

+56
-14
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
#include <string>
2121
#include <thread>
2222
#include <utility>
23-
#include <vector>
2423

2524
#include <level_zero/zet_api.h>
2625

@@ -487,6 +486,13 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
487486
return PI_INVALID_VALUE;
488487
}
489488

489+
// Cache pi_platforms for reuse in the future
490+
// It solves two problems;
491+
// 1. sycl::device equality issue; we always return the same pi_device.
492+
// 2. performance; we can save time by immediately return from cache.
493+
static std::vector<pi_platform> PiPlatformsCache;
494+
static std::mutex PiPlatformsCacheMutex;
495+
490496
// This is a good time to initialize Level Zero.
491497
// TODO: We can still safely recover if something goes wrong during the init.
492498
// Implement handling segfault using sigaction.
@@ -521,6 +527,18 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
521527
assert(ZeDriverCount == 1);
522528
ZE_CALL(zeDriverGet(&ZeDriverCount, &ZeDriver));
523529

530+
std::lock_guard<std::mutex> Lock(PiPlatformsCacheMutex);
531+
for (const pi_platform CachedPlatform : PiPlatformsCache) {
532+
if (CachedPlatform->ZeDriver == ZeDriver) {
533+
Platforms[0] = CachedPlatform;
534+
// if the caller sent a valid NumPlatforms pointer, set it here
535+
if (NumPlatforms)
536+
*NumPlatforms = 1;
537+
538+
return PI_SUCCESS;
539+
}
540+
}
541+
524542
try {
525543
// TODO: figure out how/when to release this memory
526544
*Platforms = new _pi_platform(ZeDriver);
@@ -546,6 +564,9 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
546564
Platforms[0]->ZeDriverApiVersion =
547565
std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + std::string(".") +
548566
std::to_string(ZE_MINOR_VERSION(ZeApiVersion));
567+
568+
// save a copy in the cache for future uses
569+
PiPlatformsCache.push_back(Platforms[0]);
549570
} catch (const std::bad_alloc &) {
550571
return PI_OUT_OF_HOST_MEMORY;
551572
} catch (...) {
@@ -639,9 +660,16 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
639660

640661
// Get number of devices supporting Level Zero
641662
uint32_t ZeDeviceCount = 0;
663+
std::lock_guard<std::mutex> Lock(Platform->PiDevicesCacheMutex);
664+
ZeDeviceCount = Platform->PiDevicesCache.size();
665+
642666
const bool AskingForGPU = (DeviceType & PI_DEVICE_TYPE_GPU);
643667
const bool AskingForDefault = (DeviceType == PI_DEVICE_TYPE_DEFAULT);
644-
ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, nullptr));
668+
669+
if (ZeDeviceCount == 0) {
670+
ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, nullptr));
671+
}
672+
645673
if (ZeDeviceCount == 0 || !(AskingForGPU || AskingForDefault)) {
646674
if (NumDevices)
647675
*NumDevices = 0;
@@ -657,6 +685,14 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
657685
return PI_SUCCESS;
658686
}
659687

688+
// if devices are already captured in cache, return them from the cache.
689+
for (const pi_device CachedDevice : Platform->PiDevicesCache) {
690+
*Devices++ = CachedDevice;
691+
}
692+
if (!Platform->PiDevicesCache.empty()) {
693+
return PI_SUCCESS;
694+
}
695+
660696
try {
661697
std::vector<ze_device_handle_t> ZeDevices(ZeDeviceCount);
662698
ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, ZeDevices.data()));
@@ -668,6 +704,8 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
668704
if (Result != PI_SUCCESS) {
669705
return Result;
670706
}
707+
// save a copy in the cache for future uses.
708+
Platform->PiDevicesCache.push_back(Devices[I]);
671709
}
672710
}
673711
} catch (const std::bad_alloc &) {
@@ -680,7 +718,6 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
680718

681719
pi_result piDeviceRetain(pi_device Device) {
682720
assert(Device);
683-
684721
// The root-device ref-count remains unchanged (always 1).
685722
if (Device->IsSubDevice) {
686723
++(Device->RefCount);
@@ -690,14 +727,16 @@ pi_result piDeviceRetain(pi_device Device) {
690727

691728
pi_result piDeviceRelease(pi_device Device) {
692729
assert(Device);
693-
730+
assert(Device->RefCount > 0 && "Device is already released.");
694731
// TODO: OpenCL says root-device ref-count remains unchanged (1),
695732
// but when would we free the device's data?
696-
if (--(Device->RefCount) == 0) {
697-
// Destroy the command list used for initializations
698-
ZE_CALL(zeCommandListDestroy(Device->ZeCommandListInit));
699-
delete Device;
700-
}
733+
if (Device->IsSubDevice)
734+
--(Device->RefCount);
735+
// TODO: All cached pi_devices live until the program ends.
736+
// If L0 RT does not do its own cleanup for Ze_Device_Handle upon tear-down,
737+
// we need to figure out a way to call here
738+
// ZE_CALL(zeCommandListDestroy(Device->ZeCommandListInit)); and,
739+
// in piDevicesGet(), we need to call initialize for each cached pi_device.
701740

702741
return PI_SUCCESS;
703742
}

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ struct _pi_platform {
7272
// Cache versions info from zeDriverGetProperties.
7373
std::string ZeDriverVersion;
7474
std::string ZeDriverApiVersion;
75+
76+
// Cache pi_devices for reuse
77+
std::vector<pi_device> PiDevicesCache;
78+
std::mutex PiDevicesCacheMutex;
7579
};
7680

7781
struct _pi_device : _pi_object {

sycl/source/detail/device_impl.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ device_impl::device_impl(pi_native_handle InteropDeviceHandle,
5858
nullptr);
5959

6060
MIsRootDevice = (nullptr == parent);
61-
if (!MIsRootDevice && !InteroperabilityConstructor) {
61+
if (!InteroperabilityConstructor) {
6262
// TODO catch an exception and put it to list of asynchronous exceptions
6363
// Interoperability Constructor already calls DeviceRetain in
6464
// piextDeviceFromNative.
@@ -98,10 +98,9 @@ cl_device_id device_impl::get() const {
9898
PI_INVALID_DEVICE);
9999

100100
const detail::plugin &Plugin = getPlugin();
101-
if (!MIsRootDevice) {
102-
// TODO catch an exception and put it to list of asynchronous exceptions
103-
Plugin.call<PiApiKind::piDeviceRetain>(MDevice);
104-
}
101+
102+
// TODO catch an exception and put it to list of asynchronous exceptions
103+
Plugin.call<PiApiKind::piDeviceRetain>(MDevice);
105104
return pi::cast<cl_device_id>(getNative());
106105
}
107106

0 commit comments

Comments
 (0)