Skip to content

Commit 6d34b95

Browse files
authored
[SYCL][Deps]Uplift GPURT to 20.34.17727 with Level Zero plugin to specification v1.0 (#2409)
1 parent 174fd16 commit 6d34b95

File tree

13 files changed

+640
-356
lines changed

13 files changed

+640
-356
lines changed

buildbot/dependency.conf

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ ocl_cpu_rt_ver=2020.11.8.0.27
44
# https://github.com/intel/llvm/releases/download/2020-WW36/win-oclcpuexp-2020.11.8.0.27_rel.zip
55
ocl_cpu_rt_ver_win=2020.11.8.0.27
66
# Same GPU driver supports Level Zero and OpenCL:
7-
# https://github.com/intel/compute-runtime/releases/tag/20.29.17408
8-
ocl_gpu_rt_ver=20.29.17408
7+
# https://github.com/intel/compute-runtime/releases/tag/20.34.17727
8+
ocl_gpu_rt_ver=20.34.17727
99
# Same GPU driver supports Level Zero and OpenCL:
10-
# https://downloadmirror.intel.com/29674/a08/igfx_win10_100.8336.zip
11-
ocl_gpu_rt_ver_win=27.20.100.8336
10+
# https://downloadmirror.intel.com/29817/a08/igfx_win10_100.8673.zip
11+
ocl_gpu_rt_ver_win=27.20.100.8673
1212
intel_sycl_ver=build
1313
# https://github.com/oneapi-src/oneTBB/releases/download/v2021.1-beta08/oneapi-tbb-2021.1-beta08-lin.tgz
1414
tbb_ver=2021.1.9.636
@@ -24,8 +24,8 @@ fpga_ver_win=20200811_000006
2424
[DRIVER VERSIONS]
2525
cpu_driver_lin=2020.11.8.0.27
2626
cpu_driver_win=2020.11.8.0.27
27-
gpu_driver_lin=20.29.17408
28-
gpu_driver_win=27.20.100.8336
27+
gpu_driver_lin=20.34.17727
28+
gpu_driver_win=27.20.100.8673
2929
fpga_driver_lin=2020.11.8.0.27
3030
fpga_driver_win=2020.11.8.0.27
3131
# NVidia CUDA driver

sycl/doc/EnvironmentVariables.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ subject to change. Do not rely on these variables in production code.
2525
| SYCL_DEVICE_ALLOWLIST | A list of devices and their minimum driver version following the pattern: DeviceName:{{XXX}},DriverVersion:{{X.Y.Z.W}}. Also may contain PlatformName and PlatformVersion | Filter out devices that do not match the pattern specified. Regular expression can be passed and the DPC++ runtime will select only those devices which satisfy the regex. |
2626
| SYCL_QUEUE_THREAD_POOL_SIZE | Positive integer | Number of threads in thread pool of queue. |
2727
| SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
28+
| SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
2829

2930
`(*) Note: Any means this environment variable is effective when set to any non-null value.`
3031

sycl/include/CL/sycl/backend/level_zero.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ template <> struct interop<backend::level_zero, device> {
2323
using type = ze_device_handle_t;
2424
};
2525

26+
template <> struct interop<backend::level_zero, context> {
27+
using type = ze_context_handle_t;
28+
};
29+
2630
template <> struct interop<backend::level_zero, queue> {
2731
using type = ze_command_queue_handle_t;
2832
};

sycl/plugins/level_zero/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR)
2323
endif()
2424
ExternalProject_Add(level-zero-loader
2525
GIT_REPOSITORY https://github.com/oneapi-src/level-zero.git
26-
GIT_TAG v0.91.21
26+
GIT_TAG v1.0
2727
UPDATE_DISCONNECTED ${SYCL_EP_LEVEL_ZERO_LOADER_SKIP_AUTO_UPDATE}
2828
SOURCE_DIR ${LEVEL_ZERO_LOADER_SOURCE_DIR}
2929
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader_build"

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 493 additions & 296 deletions
Large diffs are not rendered by default.

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 83 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
#include <cassert>
2424
#include <cstring>
2525
#include <iostream>
26+
#include <list>
27+
#include <map>
2628
#include <memory>
2729
#include <mutex>
2830
#include <unordered_map>
@@ -76,18 +78,33 @@ struct _pi_platform {
7678
// Cache pi_devices for reuse
7779
std::vector<pi_device> PiDevicesCache;
7880
std::mutex PiDevicesCacheMutex;
81+
// Maximum Number of Command Lists that can be created.
82+
// This Value is initialized to 20000, but can be changed by the user
83+
// thru the environment variable SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE
84+
// ie SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE =10000.
85+
int ZeMaxCommandListCache = 0;
86+
87+
// Current number of L0 Command Lists created on this platform.
88+
// this number must not exceed ZeMaxCommandListCache.
89+
std::atomic<int> ZeGlobalCommandListCount{0};
7990
};
8091

8192
struct _pi_device : _pi_object {
8293
_pi_device(ze_device_handle_t Device, pi_platform Plt,
8394
bool isSubDevice = false)
84-
: ZeDevice{Device}, Platform{Plt}, ZeCommandListInit{nullptr},
85-
IsSubDevice{isSubDevice}, ZeDeviceProperties{},
86-
ZeDeviceComputeProperties{} {
95+
: ZeDevice{Device}, Platform{Plt}, IsSubDevice{isSubDevice},
96+
ZeDeviceProperties{}, ZeDeviceComputeProperties{} {
8797
// NOTE: one must additionally call initialize() to complete
8898
// PI device creation.
8999
}
90100

101+
// Keep the ordinal of a "compute" commands group, where we send all
102+
// commands currently.
103+
// TODO[1.0]: discover "copy" command group as well to use for memory
104+
// copying operations exclusively.
105+
//
106+
uint32_t ZeComputeQueueGroupIndex;
107+
91108
// Initialize the entire PI device.
92109
pi_result initialize();
93110

@@ -97,23 +114,28 @@ struct _pi_device : _pi_object {
97114
// PI platform to which this device belongs.
98115
pi_platform Platform;
99116

100-
// Immediate Level Zero command list for this device, to be used for
101-
// initializations. To be created as:
102-
// - Immediate command list: So any command appended to it is immediately
103-
// offloaded to the device.
104-
// - Synchronous: So implicit synchronization is made inside the level-zero
105-
// driver.
106-
ze_command_list_handle_t ZeCommandListInit;
117+
// Mutex Lock for the Command List Cache
118+
std::mutex ZeCommandListCacheMutex;
119+
// Cache of all currently Available Command Lists for use by PI APIs
120+
std::list<ze_command_list_handle_t> ZeCommandListCache;
107121

108122
// Indicates if this is a root-device or a sub-device.
109123
// Technically this information can be queried from a device handle, but it
110124
// seems better to just keep it here.
111125
bool IsSubDevice;
112126

113-
// Create a new command list for executing on this device.
114-
// It's caller's responsibility to remember and destroy the created
115-
// command list when no longer needed.
116-
pi_result createCommandList(ze_command_list_handle_t *ze_command_list);
127+
// Retrieves a command list for executing on this device along with
128+
// a fence to be used in tracking the execution of this command list.
129+
// If a command list has been created on this device which has
130+
// completed its commands, then that command list and its associated fence
131+
// will be reused. Otherwise, a new command list and fence will be created for
132+
// running on this device. L0 fences are created on a L0 command queue so the
133+
// caller must pass a command queue to create a new fence for the new command
134+
// list if a command list/fence pair is not available. All Command Lists &
135+
// associated fences are destroyed at Device Release.
136+
pi_result getAvailableCommandList(pi_queue Queue,
137+
ze_command_list_handle_t *ZeCommandList,
138+
ze_fence_handle_t *ZeFence);
117139

118140
// Cache of the immutable device properties.
119141
ze_device_properties_t ZeDeviceProperties;
@@ -122,14 +144,27 @@ struct _pi_device : _pi_object {
122144

123145
struct _pi_context : _pi_object {
124146
_pi_context(pi_device Device)
125-
: Device{Device}, ZeEventPool{nullptr}, NumEventsAvailableInEventPool{},
126-
NumEventsLiveInEventPool{} {}
147+
: Device{Device}, ZeCommandListInit{nullptr}, ZeEventPool{nullptr},
148+
NumEventsAvailableInEventPool{}, NumEventsLiveInEventPool{} {}
149+
150+
// A L0 context handle is primarily used during creation and management of
151+
// resources that may be used by multiple devices.
152+
ze_context_handle_t ZeContext;
127153

128-
// Level Zero does not have notion of contexts.
129154
// Keep the device here (must be exactly one) to return it when PI context
130155
// is queried for devices.
131156
pi_device Device;
132157

158+
// Immediate Level Zero command list for the device in this context, to be
159+
// used for initializations. To be created as:
160+
// - Immediate command list: So any command appended to it is immediately
161+
// offloaded to the device.
162+
// - Synchronous: So implicit synchronization is made inside the level-zero
163+
// driver.
164+
// There will be a list of immediate command lists (for each device) when
165+
// support of the multiple devices per context will be added.
166+
ze_command_list_handle_t ZeCommandListInit;
167+
133168
// Get index of the free slot in the available pool. If there is no avialble
134169
// pool then create new one.
135170
ze_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &,
@@ -169,25 +204,46 @@ struct _pi_context : _pi_object {
169204
};
170205

171206
struct _pi_queue : _pi_object {
172-
_pi_queue(ze_command_queue_handle_t Queue, pi_context Context)
173-
: ZeCommandQueue{Queue}, Context{Context} {}
207+
_pi_queue(ze_command_queue_handle_t Queue, pi_context Context,
208+
pi_device Device)
209+
: ZeCommandQueue{Queue}, Context{Context}, Device{Device} {}
174210

175211
// Level Zero command queue handle.
176212
ze_command_queue_handle_t ZeCommandQueue;
177213

178214
// Keeps the PI context to which this queue belongs.
179215
pi_context Context;
180216

217+
// Mutex Lock for the Command List, Fence Map
218+
std::mutex ZeCommandListFenceMapMutex;
219+
// Map of all Command lists created with their associated Fence used for
220+
// tracking when the command list is available for use again.
221+
std::map<ze_command_list_handle_t, ze_fence_handle_t> ZeCommandListFenceMap;
222+
223+
// Resets the Command List and Associated fence in the ZeCommandListFenceMap.
224+
// If the reset command list should be made available, then MakeAvailable
225+
// needs to be set to true. The caller must verify that this command list and
226+
// fence have been signalled and call while holding the
227+
// ZeCommandListFenceMapMutex lock.
228+
pi_result resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
229+
bool MakeAvailable);
230+
231+
// Keeps the PI device to which this queue belongs.
232+
pi_device Device;
233+
181234
// Attach a command list to this queue, close, and execute it.
182235
// Note that this command list cannot be appended to after this.
183236
// The "is_blocking" tells if the wait for completion is requested.
237+
// The "ZeFence" passed is used to track when the command list passed
238+
// has completed execution on the device and can be reused.
184239
pi_result executeCommandList(ze_command_list_handle_t ZeCommandList,
240+
ze_fence_handle_t ZeFence,
185241
bool is_blocking = false);
186242
};
187243

188244
struct _pi_mem : _pi_object {
189-
// Keeps the PI platform of this memory handle.
190-
pi_platform Platform;
245+
// Keeps the PI context of this memory handle.
246+
pi_context Context;
191247

192248
// Keeps the host pointer where the buffer will be mapped to,
193249
// if created with PI_MEM_FLAGS_HOST_PTR_USE (see
@@ -221,8 +277,8 @@ struct _pi_mem : _pi_object {
221277
pi_result removeMapping(void *MappedTo, Mapping &MapInfo);
222278

223279
protected:
224-
_pi_mem(pi_platform Plt, char *HostPtr)
225-
: Platform{Plt}, MapHostPtr{HostPtr}, Mappings{} {}
280+
_pi_mem(pi_context Ctx, char *HostPtr)
281+
: Context{Ctx}, MapHostPtr{HostPtr}, Mappings{} {}
226282

227283
private:
228284
// The key is the host pointer representing an active mapping.
@@ -237,9 +293,9 @@ struct _pi_mem : _pi_object {
237293

238294
struct _pi_buffer final : _pi_mem {
239295
// Buffer/Sub-buffer constructor
240-
_pi_buffer(pi_platform Plt, char *Mem, char *HostPtr,
296+
_pi_buffer(pi_context Ctx, char *Mem, char *HostPtr,
241297
_pi_mem *Parent = nullptr, size_t Origin = 0, size_t Size = 0)
242-
: _pi_mem(Plt, HostPtr), ZeMem{Mem}, SubBuffer{Parent, Origin, Size} {}
298+
: _pi_mem(Ctx, HostPtr), ZeMem{Mem}, SubBuffer{Parent, Origin, Size} {}
243299

244300
void *getZeHandle() override { return ZeMem; }
245301

@@ -262,8 +318,8 @@ struct _pi_buffer final : _pi_mem {
262318

263319
struct _pi_image final : _pi_mem {
264320
// Image constructor
265-
_pi_image(pi_platform Plt, ze_image_handle_t Image, char *HostPtr)
266-
: _pi_mem(Plt, HostPtr), ZeImage{Image} {}
321+
_pi_image(pi_context Ctx, ze_image_handle_t Image, char *HostPtr)
322+
: _pi_mem(Ctx, HostPtr), ZeImage{Image} {}
267323

268324
void *getZeHandle() override { return ZeImage; }
269325

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -777,12 +777,15 @@ ProgramManager::ProgramPtr ProgramManager::build(
777777
LinkOpts = LinkOptions.c_str();
778778
}
779779

780-
// Level-Zero plugin doesn't support piProgramCompile/piProgramLink commands,
781-
// program is built during piProgramCreate.
782-
// TODO: remove this check as soon as piProgramCompile/piProgramLink will be
783-
// implemented in Level-Zero plugin.
784-
if (Context->getPlugin().getBackend() == backend::level_zero) {
785-
LinkDeviceLibs = false;
780+
// The Level Zero driver support for online linking currently has bugs, but
781+
// we think the DPC++ runtime support is ready. This environment variable
782+
// gates the runtime support for online linking, so we can try enabling if a
783+
// new driver is released before the next DPC++ release.
784+
static bool EnableLevelZeroLink = std::getenv("SYCL_ENABLE_LEVEL_ZERO_LINK");
785+
if (!EnableLevelZeroLink) {
786+
if (Context->getPlugin().getBackend() == backend::level_zero) {
787+
LinkDeviceLibs = false;
788+
}
786789
}
787790

788791
// TODO: this is a temporary workaround for GPU tests for ESIMD compiler.

sycl/test/basic_tests/buffer/buffer_full_copy.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// XFAIL: windows && level_zero
12
// RUN: %clangxx %s -o %t1.out -lsycl -I %sycl_include
23
// RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
34
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t2.out
@@ -6,8 +7,6 @@
67
// RUN: %GPU_RUN_PLACEHOLDER %t2.out
78
// RUN: %ACC_RUN_PLACEHOLDER %t2.out
89

9-
// XFAIL: level_zero
10-
1110
//==------------- buffer_full_copy.cpp - SYCL buffer basic test ------------==//
1211
//
1312
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

sycl/test/basic_tests/event_profiling_info.cpp

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1+
// XFAIL: *
12
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
23
//
3-
// Profiling info is not supported on host device so far.
4-
//
54
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
65
// RUN: %CPU_RUN_PLACEHOLDER %t.out
76
// RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -17,16 +16,9 @@
1716
#include <CL/sycl.hpp>
1817
#include <cassert>
1918

20-
using namespace cl;
21-
22-
// The test checks that get_profiling_info waits for command asccociated with
23-
// event to complete execution.
24-
int main() {
25-
sycl::queue Q{sycl::property::queue::enable_profiling()};
26-
sycl::event Event = Q.submit([&](sycl::handler &CGH) {
27-
CGH.single_task<class EmptyKernel>([=]() {});
28-
});
19+
using namespace cl::sycl;
2920

21+
bool verifyProfiling(event Event) {
3022
auto Submit =
3123
Event.get_profiling_info<sycl::info::event_profiling::command_submit>();
3224
auto Start =
@@ -42,3 +34,39 @@ int main() {
4234

4335
return Fail;
4436
}
37+
38+
// The test checks that get_profiling_info waits for command asccociated with
39+
// event to complete execution.
40+
int main() {
41+
const size_t Size = 10000;
42+
int Data[Size] = {0};
43+
for (size_t I = 0; I < Size; ++I) {
44+
Data[I] = I;
45+
}
46+
int Values[Size] = {0};
47+
48+
buffer<int, 1> BufferFrom(Data, range<1>(Size));
49+
buffer<int, 1> BufferTo(Values, range<1>(Size));
50+
51+
// buffer copy
52+
queue copyQueue{sycl::property::queue::enable_profiling()};
53+
event copyEvent = copyQueue.submit([&](sycl::handler &Cgh) {
54+
accessor<int, 1, access::mode::read, access::target::global_buffer>
55+
AccessorFrom(BufferFrom, Cgh, range<1>(Size));
56+
accessor<int, 1, access::mode::write, access::target::global_buffer>
57+
AccessorTo(BufferTo, Cgh, range<1>(Size));
58+
Cgh.copy(AccessorFrom, AccessorTo);
59+
});
60+
61+
for (size_t I = 0; I < Size; ++I) {
62+
assert(Data[I] == Values[I]);
63+
}
64+
65+
// kernel launch
66+
queue kernelQueue{sycl::property::queue::enable_profiling()};
67+
event kernelEvent = kernelQueue.submit([&](sycl::handler &CGH) {
68+
CGH.single_task<class EmptyKernel>([=]() {});
69+
});
70+
71+
return verifyProfiling(copyEvent) || verifyProfiling(kernelEvent);
72+
}

sycl/test/basic_tests/image_accessor_readwrite.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
66
// RUN: %CPU_RUN_PLACEHOLDER %t.out
77
// RUN: %GPU_RUN_PLACEHOLDER %t.out
8-
//
9-
// XFAIL: windows && level_zero
108

119
//==--------------------image_accessor_readwrite.cpp ----------------------==//
1210
//==----------image_accessor read without sampler & write API test---------==//

sycl/test/basic_tests/image_accessor_readwrite_half.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
66
// RUN: %CPU_RUN_PLACEHOLDER %t.out
77
// RUN: %GPU_RUN_PLACEHOLDER %t.out
8-
//
9-
// XFAIL: windows && level_zero
108

119
//==--------------------image_accessor_readwrite_half.cpp -------------------==//
1210
//==-image_accessor read (without sampler)& write API test for half datatype-==//

sycl/test/plugins/sycl-ls-gpu-default.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
// RUN: sycl-ls --verbose >%t.default.out
44
// RUN: FileCheck %s --check-prefixes=CHECK-GPU-BUILTIN,CHECK-GPU-CUSTOM --input-file %t.default.out
55

6-
// CHECK-GPU-BUILTIN: gpu_selector(){{.*}}GPU : 0.91
7-
// CHECK-GPU-CUSTOM: custom_selector(gpu){{.*}}GPU : 0.91
6+
// CHECK-GPU-BUILTIN: gpu_selector(){{.*}}GPU : 1.0
7+
// CHECK-GPU-CUSTOM: custom_selector(gpu){{.*}}GPU : 1.0
88

99
//==-- sycl-ls-gpu-default.cpp - SYCL test for default selected gpu device -==//
1010
//

sycl/tools/get_device_count_by_type.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ static bool queryOpenCL(cl_device_type deviceType, cl_uint &deviceCount,
117117
static bool queryLevelZero(cl_device_type deviceType, cl_uint &deviceCount,
118118
std::string &msg) {
119119
deviceCount = 0u;
120-
ze_result_t zeResult = zeInit(ZE_INIT_FLAG_NONE);
120+
ze_result_t zeResult = zeInit(ZE_INIT_FLAG_GPU_ONLY);
121121
if (zeResult != ZE_RESULT_SUCCESS) {
122122
msg = "ERROR: Level Zero initialization error";
123123
return true;

0 commit comments

Comments
 (0)