From ba2ecb4b25a5e4e887e0b828f8f35cc56fc0ef32 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Tue, 19 May 2020 09:21:39 +0300 Subject: [PATCH 01/21] [SYCL] Introduce the Level Zero plugin Plugin itself consists of the header and the source file plus cmake file to build the plugin. Also the following changes were made to support the Level Zero plugin in SYCL RT: * New level0 value was added to backend enum * New PI_LEVEL0 value support was added to SYCL_BE config. * Docs were updated. Mentioned Level Zero backend and provided the link to the Level Zero runtime for Intel GPU. * Changes in sycl cmake file to build level0 plugin by default and to install it with sycl toolchain. LIT testing with PI_LEVEL0 backend will be enabled in the following commits. This commit introduces the plugin and makes it buildable. Signed-off-by: Artur Gainullin --- sycl/CMakeLists.txt | 1 + sycl/doc/EnvironmentVariables.md | 2 +- sycl/doc/GetStartedGuide.md | 8 +- sycl/include/CL/sycl/backend_types.hpp | 2 +- sycl/include/CL/sycl/detail/pi.hpp | 2 + sycl/plugins/CMakeLists.txt | 1 + sycl/plugins/Intel_level0/CMakeLists.txt | 61 + sycl/plugins/Intel_level0/pi_level0.cpp | 3723 ++++++++++++++++++++++ sycl/plugins/Intel_level0/pi_level0.hpp | 346 ++ sycl/source/detail/config.hpp | 8 +- sycl/source/detail/pi.cpp | 2 + 11 files changed, 4149 insertions(+), 7 deletions(-) create mode 100755 sycl/plugins/Intel_level0/CMakeLists.txt create mode 100755 sycl/plugins/Intel_level0/pi_level0.cpp create mode 100755 sycl/plugins/Intel_level0/pi_level0.hpp diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 0b3cb1e94acfb..fc0d81428c4b4 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -304,6 +304,7 @@ set( SYCL_TOOLCHAIN_DEPLOY_COMPONENTS sycl-headers-extras sycl pi_opencl + pi_level0 libsycldevice ) if(OpenCL_INSTALL_KHRONOS_ICD_LOADER AND TARGET ocl-icd) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 91b2b0e22eb1c..f69635c0d4bdd 100644 
--- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -12,7 +12,7 @@ subject to change. Do not rely on these variables in production code. | Environment variable | Values | Description | | -------------------- | ------ | ----------- | | SYCL_PI_TRACE | Described [below](#sycl_pi_trace-options) | Enable specified level of tracing for PI. | -| SYCL_BE | PI_OPENCL, PI_CUDA | Force SYCL RT to consider only devices of the specified backend during the device selection. | +| SYCL_BE | PI_OPENCL, PI_LEVEL0, PI_CUDA | Force SYCL RT to consider only devices of the specified backend during the device selection. | | SYCL_DEVICE_TYPE | One of: CPU, GPU, ACC, HOST | Force SYCL to use the specified device type. If unset, default selection rules are applied. If set to any unlisted value, this control has no effect. If the requested device type is not found, a `cl::sycl::runtime_error` exception is thrown. If a non-default device selector is used, a device must satisfy both the selector and this control to be chosen. This control only has effect on devices created with a selector. | | SYCL_PROGRAM_COMPILE_OPTIONS | String of valid OpenCL compile options | Override compile options for all programs. | | SYCL_PROGRAM_LINK_OPTIONS | String of valid OpenCL link options | Override link options for all programs. | diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 5242929b0f967..e7ddd196aed80 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -164,11 +164,15 @@ which contains all the symbols required. To run DPC++ applications on OpenCL devices, OpenCL implementation(s) must be present in the system. +To run DPC++ applications on Level Zero devices, Level Zero implementation(s) +must be present in the system. + Please, refer to [the Release Notes](../ReleaseNotes.md) for recommended Intel runtime versions. 
-The `GPU` runtime that is needed to run DPC++ application on Intel `GPU` devices -can be downloaded from the following web pages: +To run DPC++ application on Intel `GPU` devices the OpenCL `GPU` runtime or the +Level Zero `GPU` runtime is needed. They can be downloaded from the following web +pages: * Linux: [Intel® Graphics Compute Runtime for OpenCL™](https://github.com/intel/compute-runtime/releases) diff --git a/sycl/include/CL/sycl/backend_types.hpp b/sycl/include/CL/sycl/backend_types.hpp index 5e8e57ea70b2e..9411cc982393c 100644 --- a/sycl/include/CL/sycl/backend_types.hpp +++ b/sycl/include/CL/sycl/backend_types.hpp @@ -13,7 +13,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -enum class backend { host, opencl, cuda }; +enum class backend { host, opencl, level0, cuda }; template struct interop; diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp index d94ccaa3960fb..e608929487dd6 100644 --- a/sycl/include/CL/sycl/detail/pi.hpp +++ b/sycl/include/CL/sycl/detail/pi.hpp @@ -57,9 +57,11 @@ bool trace(TraceLevel level); #ifdef SYCL_RT_OS_WINDOWS #define OPENCL_PLUGIN_NAME "pi_opencl.dll" +#define LEVEL0_PLUGIN_NAME "pi_level0.dll" #define CUDA_PLUGIN_NAME "pi_cuda.dll" #else #define OPENCL_PLUGIN_NAME "libpi_opencl.so" +#define LEVEL0_PLUGIN_NAME "libpi_level0.so" #define CUDA_PLUGIN_NAME "libpi_cuda.so" #endif diff --git a/sycl/plugins/CMakeLists.txt b/sycl/plugins/CMakeLists.txt index 7dbd3d76a0842..b4aafc80eaa13 100644 --- a/sycl/plugins/CMakeLists.txt +++ b/sycl/plugins/CMakeLists.txt @@ -5,3 +5,4 @@ if(SYCL_BUILD_PI_CUDA) endif() add_subdirectory(opencl) +add_subdirectory(Intel_level0) diff --git a/sycl/plugins/Intel_level0/CMakeLists.txt b/sycl/plugins/Intel_level0/CMakeLists.txt new file mode 100755 index 0000000000000..1517bffb9f494 --- /dev/null +++ b/sycl/plugins/Intel_level0/CMakeLists.txt @@ -0,0 +1,61 @@ +# PI Level0 plugin library + +message(STATUS "Download Level Zero loader and headers from github.com") 
+if(MSVC) + set(L0_LIBRARY + "${LLVM_LIBRARY_OUTPUT_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}ze_loader${CMAKE_STATIC_LIBRARY_SUFFIX}") +else() + set(L0_LIBRARY + "${LLVM_LIBRARY_OUTPUT_INTDIR}/${CMAKE_SHARED_LIBRARY_PREFIX}ze_loader${CMAKE_SHARED_LIBRARY_SUFFIX}") +endif() +if (CMAKE_C_COMPILER) + list(APPEND AUX_CMAKE_FLAGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}) +endif() +if (CMAKE_CXX_COMPILER) + list(APPEND AUX_CMAKE_FLAGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}) +endif() +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/l0_loader_build) +ExternalProject_Add(l0-loader + GIT_REPOSITORY https://github.com/oneapi-src/level-zero.git + GIT_TAG origin/master + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/Level0/l0_loader" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/l0_loader_build" + INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/l0_loader_install" + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} + -DOpenCL_INCLUDE_DIR=${OpenCL_INCLUDE_DIRS} + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_INSTALL_LIBDIR:PATH=lib${LLVM_LIBDIR_SUFFIX} + ${AUX_CMAKE_FLAGS} + STEP_TARGETS configure,build,install + DEPENDS ocl-headers + BUILD_BYPRODUCTS ${L0_LIBRARY} +) +ExternalProject_Add_Step(l0-loader llvminstall + COMMAND ${CMAKE_COMMAND} -E copy_directory / ${LLVM_BINARY_DIR} + COMMENT "Installing l0-loader into the LLVM binary directory" + DEPENDEES install +) + +include_directories("${sycl_inc_dir}") +include_directories(${OPENCL_INCLUDE}) + +add_library(pi_level0 SHARED + "${sycl_inc_dir}/CL/sycl/detail/pi.h" + "${CMAKE_CURRENT_SOURCE_DIR}/pi_level0.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/pi_level0.hpp" +) + +add_dependencies(pi_level0 l0-loader) +add_dependencies(sycl-toolchain pi_level0) + +target_link_libraries(pi_level0 PRIVATE "${L0_LIBRARY}") +if (UNIX) + target_link_libraries(pi_level0 PRIVATE pthread) +endif() + +add_common_options(pi_level0) + +install(TARGETS pi_level0 + LIBRARY DESTINATION "lib" COMPONENT pi_level0 + RUNTIME DESTINATION "bin" 
COMPONENT pi_level0)
diff --git a/sycl/plugins/Intel_level0/pi_level0.cpp b/sycl/plugins/Intel_level0/pi_level0.cpp
new file mode 100755
index 0000000000000..b84ea5a06d97b
--- /dev/null
+++ b/sycl/plugins/Intel_level0/pi_level0.cpp
@@ -0,0 +1,3723 @@
+#include "pi_level0.hpp"
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+// Controls serialization of L0 calls, to work around the L0 driver not being
+// ready for multi-threaded use.
+// Recognized values (can be used as a bit mask):
+enum {
+  ZeSerializeNone =
+      0, // no locking or blocking (except when SYCL RT requested blocking)
+  ZeSerializeLock = 1, // locking around each ZE_CALL
+  ZeSerializeBlock =
+      2, // blocking ZE calls, where supported (usually in enqueue commands)
+};
+// Current serialization mode; piPlatformsGet overwrites it with the value of
+// the "ZeSerialize" environment variable (0 when the variable is not set).
+pi_uint32 ZeSerialize = 0;
+
+// This class encapsulates actions taken along with a call to L0 API.
+// The constructor takes GlobalLock and the destructor releases it, but only
+// when the ZeSerializeLock bit is set in ZeSerialize. The ZE_CALL and
+// ZE_CALL_NOCHECK macros create a temporary ZeCall around the wrapped call.
+class ZeCall {
+private:
+  // The global mutex that is used for total serialization of L0 calls.
+  static std::mutex GlobalLock;
+
+public:
+  ZeCall() {
+    if ((ZeSerialize & ZeSerializeLock) != 0) {
+      GlobalLock.lock();
+    }
+  }
+  ~ZeCall() {
+    // NOTE(review): ZeSerialize is re-read here, so this assumes the mode did
+    // not change since the constructor ran -- confirm nothing modifies it
+    // while a call is in flight.
+    if ((ZeSerialize & ZeSerializeLock) != 0) {
+      GlobalLock.unlock();
+    }
+  }
+
+  // Traces the call text through zePrint and, when ZeResult is non-zero and
+  // TraceError is set, the symbolic error name as well. Returns ZeResult
+  // unchanged.
+  static ze_result_t check(ze_result_t ZeResult, const char *CallStr,
+                           bool TraceError = true);
+
+  // The non-static version just calls static one.
+  ze_result_t checkThis(ze_result_t ZeResult, const char *CallStr,
+                        bool TraceError = true) {
+    return ZeCall::check(ZeResult, CallStr, TraceError);
+  }
+};
+std::mutex ZeCall::GlobalLock;
+
+// Controls L0 calls tracing in zePrint. piPlatformsGet sets it to true when
+// the "ZeDebug" environment variable is present.
+bool ZeDebug = false;
+
+// printf-style trace helper: writes the formatted message to stderr when
+// ZeDebug is set and does nothing otherwise.
+static void zePrint(const char *Format, ...) {
+  if (ZeDebug) {
+    va_list Args;
+    va_start(Args, Format);
+    vfprintf(stderr, Format, Args);
+    va_end(Args);
+  }
+}
+
+// TODO: In the following 4 methods we may want to distinguish read access vs.
+// write (as it is OK for multiple threads to read the map without locking it).
+
+// Records that host pointer MappedTo maps the [Offset, Offset + Size) range
+// of this memory object. Returns PI_INVALID_OPERATION if MappedTo is already
+// recorded, PI_SUCCESS otherwise. Access to Mappings is guarded by
+// MappingsMutex.
+pi_result _pi_mem::addMapping(void *MappedTo, size_t Offset, size_t Size) {
+  std::lock_guard Lock(MappingsMutex);
+  auto It = Mappings.find(MappedTo);
+  if (It != Mappings.end()) {
+    zePrint("piEnqueueMemBufferMap: duplicate mapping detected\n");
+    return PI_INVALID_OPERATION;
+  } else {
+    Mappings.insert({MappedTo, {Offset, Size}});
+  }
+  return PI_SUCCESS;
+}
+
+// Removes the record for host pointer MappedTo and returns its data through
+// MapInfo. Returns PI_INVALID_VALUE (leaving MapInfo untouched) if MappedTo
+// was never recorded by addMapping.
+pi_result _pi_mem::removeMapping(void *MappedTo, Mapping &MapInfo) {
+  std::lock_guard Lock(MappingsMutex);
+  auto It = Mappings.find(MappedTo);
+  if (It == Mappings.end()) {
+    zePrint("piEnqueueMemUnmap: unknown memory mapping\n");
+    return PI_INVALID_VALUE;
+  }
+  MapInfo = It->second;
+  Mappings.erase(It);
+  return PI_SUCCESS;
+}
+
+// Hands out the next free event slot of this context.
+// On success ZePool receives the event pool to create the event in and Index
+// the slot inside that pool. A new pool is created when there is no current
+// pool or the current one has no slots left. Returns the L0 error of
+// zeEventPoolCreate if pool creation fails.
+ze_result_t
+_pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool,
+                                            size_t &Index) {
+  // Maximum number of events that can be present in an event ZePool is captured
+  // here. Setting it to 256 gave best possible performance for several
+  // benchmarks.
+  static const char *MaxNumEventsPerPoolEnv =
+      std::getenv("MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL");
+  // A zero, negative or unparsable value would make "MaxNumEventsPerPool - 1"
+  // below wrap around, so fall back to the default in that case.
+  static const int MaxNumEventsPerPoolEnvValue =
+      MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 0;
+  static const pi_uint32 MaxNumEventsPerPool =
+      (MaxNumEventsPerPoolEnvValue > 0)
+          ? static_cast<pi_uint32>(MaxNumEventsPerPoolEnvValue)
+          : 256;
+
+  // ZeEventPool and both bookkeeping maps are shared state, so they must not
+  // be inspected before the locks are held. Both locks are taken together
+  // because creation of a new ZePool with its record in
+  // NumEventsAvailableInEventPool and initialization of the record in
+  // NumEventsLiveInEventPool must be done atomically. Otherwise it is possible
+  // that decrementAliveEventsInPool will be called for the record in
+  // NumEventsLiveInEventPool before its initialization.
+  std::lock(NumEventsAvailableInEventPoolMutex,
+            NumEventsLiveInEventPoolMutex);
+  std::lock_guard NumEventsAvailableInEventPoolGuard(
+      NumEventsAvailableInEventPoolMutex, std::adopt_lock);
+  std::lock_guard NumEventsLiveInEventPoolGuard(
+      NumEventsLiveInEventPoolMutex, std::adopt_lock);
+
+  Index = 0;
+  // Create one event ZePool per MaxNumEventsPerPool events
+  if ((ZeEventPool == nullptr) ||
+      (NumEventsAvailableInEventPool[ZeEventPool] == 0)) {
+    ze_event_pool_desc_t ZeEventPoolDesc;
+    ZeEventPoolDesc.count = MaxNumEventsPerPool;
+    ZeEventPoolDesc.flags = ZE_EVENT_POOL_FLAG_TIMESTAMP;
+    ZeEventPoolDesc.version = ZE_EVENT_POOL_DESC_VERSION_CURRENT;
+
+    ze_device_handle_t ZeDevice = Device->ZeDevice;
+    if (ze_result_t ZeRes =
+            zeEventPoolCreate(Device->Platform->ZeDriver, &ZeEventPoolDesc, 1,
+                              &ZeDevice, &ZeEventPool))
+      return ZeRes;
+    // Slot 0 of the fresh pool is handed out by this very call.
+    NumEventsAvailableInEventPool[ZeEventPool] = MaxNumEventsPerPool - 1;
+    NumEventsLiveInEventPool[ZeEventPool] = MaxNumEventsPerPool;
+  } else {
+    Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[ZeEventPool];
+    --NumEventsAvailableInEventPool[ZeEventPool];
+  }
+  ZePool = ZeEventPool;
+  return ZE_RESULT_SUCCESS;
+}
+
+// Decrements the live-event counter of ZePool and destroys the pool once the
+// counter reaches zero. Returns the result of zeEventPoolDestroy in that case
+// and ZE_RESULT_SUCCESS otherwise.
+ze_result_t
+_pi_context::decrementAliveEventsInPool(ze_event_pool_handle_t ZePool) {
+  std::lock_guard Lock(NumEventsLiveInEventPoolMutex);
+  --NumEventsLiveInEventPool[ZePool];
+  if (NumEventsLiveInEventPool[ZePool] == 0) {
+    return zeEventPoolDestroy(ZePool);
+  }
+  return ZE_RESULT_SUCCESS;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#include
+#include
+#include
+
+// Some opencl extensions we know are supported by all Level0 devices.
+#define ZE_SUPPORTED_EXTENSIONS                                                \
+  "cl_khr_il_program cl_khr_subgroups cl_intel_subgroups "                     \
+  "cl_intel_subgroups_short cl_intel_required_subgroup_size "
+
+// Map L0 runtime error code to PI error code.
+// Codes without an explicit mapping are reported as PI_ERROR_UNKNOWN.
+static pi_result mapError(ze_result_t ZeResult) {
+  // TODO: these mapping need to be clarified and synced with the PI API return
+  // values, which is TBD.
+  switch (ZeResult) {
+  case ZE_RESULT_SUCCESS:
+    return PI_SUCCESS;
+  case ZE_RESULT_ERROR_DEVICE_LOST:
+    return PI_DEVICE_NOT_FOUND;
+  case ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS:
+    return PI_INVALID_OPERATION;
+  case ZE_RESULT_ERROR_NOT_AVAILABLE:
+    return PI_INVALID_OPERATION;
+  case ZE_RESULT_ERROR_UNINITIALIZED:
+    return PI_INVALID_PLATFORM;
+  case ZE_RESULT_ERROR_INVALID_ARGUMENT:
+    return PI_INVALID_VALUE;
+  case ZE_RESULT_ERROR_INVALID_NULL_POINTER:
+    return PI_INVALID_VALUE;
+  case ZE_RESULT_ERROR_INVALID_SIZE:
+    return PI_INVALID_VALUE;
+  case ZE_RESULT_ERROR_UNSUPPORTED_SIZE:
+    return PI_INVALID_VALUE;
+  case ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT:
+    return PI_INVALID_VALUE;
+  case ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT:
+    return PI_INVALID_EVENT;
+  case ZE_RESULT_ERROR_INVALID_ENUMERATION:
+    return PI_INVALID_VALUE;
+  case ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION:
+    return PI_INVALID_VALUE;
+  case ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT:
+    return PI_INVALID_VALUE;
+  case ZE_RESULT_ERROR_INVALID_NATIVE_BINARY:
+    return PI_INVALID_BINARY;
+  case ZE_RESULT_ERROR_INVALID_KERNEL_NAME:
+    return PI_INVALID_KERNEL_NAME;
+  case ZE_RESULT_ERROR_INVALID_FUNCTION_NAME:
+    return PI_BUILD_PROGRAM_FAILURE;
+  case ZE_RESULT_ERROR_MODULE_BUILD_FAILURE:
+    // A failed module build is a program build failure, not an unknown error.
+    return PI_BUILD_PROGRAM_FAILURE;
+  case ZE_RESULT_ERROR_OVERLAPPING_REGIONS:
+    return PI_INVALID_OPERATION;
+  default:
+    return PI_ERROR_UNKNOWN;
+  }
+}
+
+// Forward declarations
+static pi_result
+enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst,
+                     pi_bool BlockingWrite, size_t Size, const void *Src,
+                     pi_uint32 NumEventsInWaitList,
+                     const pi_event *EventWaitList, pi_event *Event);
+
+static pi_result enqueueMemCopyRectHelper(
+    pi_command_type CommandType, pi_queue Queue, void *SrcBuffer,
+    void *DstBuffer, const size_t *SrcOrigin, const size_t *DstOrigin,
+    const size_t *Region, size_t SrcRowPitch, size_t SrcSlicePitch,
+    size_t DstRowPitch, size_t DstSlicePitch, pi_bool Blocking,
+    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
+    pi_event *Event);
+
+// Stores the symbolic name of ZeError in ErrorString. Codes that are not
+// listed below produce a generic placeholder instead of an empty string.
+inline void zeParseError(ze_result_t ZeError, std::string &ErrorString) {
+  switch (ZeError) {
+// Each case stores the stringified enumerator name.
+#define ZE_ERRCASE(ERR)                                                        \
+  case ERR:                                                                    \
+    ErrorString = #ERR;                                                        \
+    break;
+
+    ZE_ERRCASE(ZE_RESULT_SUCCESS)
+    ZE_ERRCASE(ZE_RESULT_NOT_READY)
+    ZE_ERRCASE(ZE_RESULT_ERROR_DEVICE_LOST)
+    ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY)
+    ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY)
+    ZE_ERRCASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS)
+    ZE_ERRCASE(ZE_RESULT_ERROR_NOT_AVAILABLE)
+    ZE_ERRCASE(ZE_RESULT_ERROR_UNINITIALIZED)
+    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION)
+    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ARGUMENT)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE)
+    ZE_ERRCASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SIZE)
+    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE)
+    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ENUMERATION)
+    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION)
+    ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE)
+    ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE)
+    ZE_ERRCASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS)
+    ZE_ERRCASE(ZE_RESULT_ERROR_UNKNOWN)
+
+#undef ZE_ERRCASE
+  default:
+    // The previous assert("Unexpected Error code") tested a non-null string
+    // literal, so it could never fire and the trace printed an empty name.
+    // Report the unexpected code explicitly instead; this helper is only used
+    // for tracing, so it does not abort.
+    ErrorString = "Unexpected ze_result_t value";
+  }
+}
+
+// Traces CallStr and, if ZeResult is an error and TraceError is set, the
+// symbolic error name. Returns ZeResult unchanged.
+ze_result_t ZeCall::check(ze_result_t ZeResult, const char *CallStr,
+                          bool TraceError) {
+  zePrint("ZE ---> %s\n", CallStr);
+
+  if (ZeResult && TraceError) {
+    std::string ErrorString;
+    zeParseError(ZeResult, ErrorString);
+    zePrint("Error (%s) in %s\n", ErrorString.c_str(), CallStr);
+  }
+  return ZeResult;
+}
+
+// ZE_CALL runs an L0 call under a temporary ZeCall and, on failure, returns
+// the mapped PI error from the enclosing function (which therefore must
+// return pi_result). ZE_CALL_NOCHECK evaluates to the raw ze_result_t and
+// does not trace errors.
+#define ZE_CALL(Call)                                                          \
+  if (auto Result = ZeCall().checkThis(Call, #Call, true))                     \
+    return mapError(Result);
+#define ZE_CALL_NOCHECK(Call) ZeCall().checkThis(Call, #Call, false)
+
+// Creates the per-device immediate command list and caches the device and
+// compute properties. Returns the mapped error of the first failing L0 call.
+pi_result _pi_device::initialize() {
+  // Create the immediate command list to be used for initializations
+  // Created as synchronous so level-zero performs implicit synchronization and
+  // there is no need to query for completion in the plugin
+  ze_command_queue_desc_t ZeCommandQueueDesc = {};
+  ZeCommandQueueDesc.version = ZE_COMMAND_QUEUE_DESC_VERSION_CURRENT;
+  ZeCommandQueueDesc.ordinal = 0;
+  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
+  ZE_CALL(zeCommandListCreateImmediate(ZeDevice, &ZeCommandQueueDesc,
+                                       &ZeCommandListInit));
+  // Cache device properties
+  ZeDeviceProperties.version = ZE_DEVICE_PROPERTIES_VERSION_CURRENT;
+  ZE_CALL(zeDeviceGetProperties(ZeDevice, &ZeDeviceProperties));
+  ZeDeviceComputeProperties.version =
+      ZE_DEVICE_COMPUTE_PROPERTIES_VERSION_CURRENT;
+  ZE_CALL(zeDeviceGetComputeProperties(ZeDevice, &ZeDeviceComputeProperties));
+  return PI_SUCCESS;
+}
+
+// Create a new command list to be used in a PI call
+pi_result
+_pi_device::createCommandList(ze_command_list_handle_t *ZeCommandList) {
+  // Create the command list, because in L0 commands are added to
+  // the command lists, and later are then added to the command queue.
+  //
+  // TODO: Figure out how to lower the overhead of creating a new list
+  // for each PI command, if that appears to be important.
+  //
+  ze_command_list_desc_t ZeCommandListDesc = {};
+  ZeCommandListDesc.version = ZE_COMMAND_LIST_DESC_VERSION_CURRENT;
+
+  // TODO: can we just reset the command-list created when an earlier
+  // command was submitted to the queue?
+  //
+  ZE_CALL(zeCommandListCreate(ZeDevice, &ZeCommandListDesc, ZeCommandList));
+
+  return PI_SUCCESS;
+}
+
+// Closes ZeCommandList and submits it to this queue. Waits for the queue to
+// drain when IsBlocking is set or the ZeSerializeBlock bit is set in
+// ZeSerialize.
+pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
+                                        bool IsBlocking) {
+  // Close the command list and have it ready for dispatch.
+  ZE_CALL(zeCommandListClose(ZeCommandList));
+  // Offload command list to the GPU for asynchronous execution
+  ZE_CALL(zeCommandQueueExecuteCommandLists(ZeCommandQueue, 1, &ZeCommandList,
+                                            nullptr));
+
+  // Check global control to make every command blocking for debugging.
+  if (IsBlocking || (ZeSerialize & ZeSerializeBlock) != 0) {
+    // Wait until command lists attached to the command queue are executed.
+    ZE_CALL(zeCommandQueueSynchronize(ZeCommandQueue, UINT32_MAX));
+  }
+  return PI_SUCCESS;
+}
+
+// Returns a new[]-allocated array holding the L0 handles of the first
+// EventListLength events of EventList. The caller must free it with
+// deleteZeEventList.
+ze_event_handle_t *_pi_event::createZeEventList(pi_uint32 EventListLength,
+                                                const pi_event *EventList) {
+  ze_event_handle_t *ZeEventList = new ze_event_handle_t[EventListLength];
+
+  for (pi_uint32 I = 0; I < EventListLength; I++) {
+    ZeEventList[I] = EventList[I]->ZeEvent;
+  }
+  return ZeEventList;
+}
+
+// Frees an array obtained from createZeEventList.
+void _pi_event::deleteZeEventList(ze_event_handle_t *ZeEventList) {
+  delete[] ZeEventList;
+}
+
+// Forward declarations
+decltype(piEventCreate) piEventCreate;
+
+// No generic lambdas in C++11, so use this convenience macro.
+// NOTE: to be used in API returning "ParamValue".
+// NOTE: memset is used to clear all bytes in the memory allocated by SYCL RT
+// for value. This is a workaround for the problem when return type of the
+// parameter is incorrect in L0 plugin which can result in bad value. This
+// memset can be removed if it is necessary.
+// Writes Value into ParamValue (if non-null) and its size into
+// ParamValueSizeRet (if non-null).
+// NOTE(review): the write does not check ParamValueSize against
+// sizeof(Value); callers are assumed to pass a large enough buffer.
+#define SET_PARAM_VALUE(Value)                                                 \
+  {                                                                            \
+    typedef decltype(Value) T;                                                 \
+    if (ParamValue) {                                                          \
+      memset(ParamValue, 0, ParamValueSize);                                   \
+      *(T *)ParamValue = Value;                                                \
+    }                                                                          \
+    if (ParamValueSizeRet)                                                     \
+      *ParamValueSizeRet = sizeof(T);                                          \
+  }
+// Copies the NUL-terminated string Value into ParamValue (if non-null) and
+// reports the full size including the terminator through ParamValueSizeRet
+// (if non-null). At most ParamValueSize bytes are written and never more than
+// the string actually holds: copying ParamValueSize bytes unconditionally
+// would read past the end of Value whenever the caller's buffer is larger
+// than the string.
+#define SET_PARAM_VALUE_STR(Value)                                             \
+  {                                                                            \
+    const size_t ParamStrSize = strlen(Value) + 1;                             \
+    if (ParamValue)                                                            \
+      memcpy(ParamValue, Value,                                                \
+             ParamValueSize < ParamStrSize ? ParamValueSize : ParamStrSize);   \
+    if (ParamValueSizeRet)                                                     \
+      *ParamValueSizeRet = ParamStrSize;                                       \
+  }
+
+// Writes NumValues elements of Value, each converted to RetType, into
+// ParamValue (if non-null) and reports the total byte size through
+// ParamValueSizeRet (if non-null).
+#define SET_PARAM_VALUE_VLA(Value, NumValues, RetType)                         \
+  {                                                                            \
+    if (ParamValue) {                                                          \
+      memset(ParamValue, 0, ParamValueSize);                                   \
+      for (uint32_t I = 0; I < NumValues; I++)                                 \
+        ((RetType *)ParamValue)[I] = (RetType)Value[I];                        \
+    }                                                                          \
+    if (ParamValueSizeRet)                                                     \
+      *ParamValueSizeRet = NumValues * sizeof(RetType);                        \
+  }
+
+#ifndef _WIN32
+// Recover from Linux SIGSEGV signal.
+// We can't reliably catch C++ exceptions thrown from signal
+// handler so use sigsetjmp/siglongjmp.
+//
+// The sig* variants are used on purpose: SIGSEGV is blocked while its handler
+// runs, and jumping out with plain longjmp is not guaranteed to restore the
+// signal mask, which would leave SIGSEGV blocked so that only the first fault
+// can be recovered. sigsetjmp(..., 1) saves the mask and siglongjmp restores
+// it.
+//
+#include <setjmp.h>
+#include <signal.h>
+sigjmp_buf ReturnHere;
+static void piSignalHandler(int SigNum) {
+  (void)SigNum;
+  // We are somewhere the signal was raised, so go back to
+  // where we started tracking.
+  siglongjmp(ReturnHere, 1);
+}
+// Only handle segfault now, but can be extended.
+#define __TRY()                                                                \
+  signal(SIGSEGV, &piSignalHandler);                                           \
+  if (!sigsetjmp(ReturnHere, 1)) {
+#define __CATCH()                                                              \
+  }                                                                            \
+  else {
+#define __FINALLY()                                                            \
+  }                                                                            \
+  signal(SIGSEGV, SIG_DFL);
+
+#else // _WIN32
+// TODO: on Windows we could use structured exception handling.
+// Just dummy implementation now (meaning no error handling).
+#define __TRY() if (true) {
+#define __CATCH()                                                              \
+  }                                                                            \
+  else {
+#define __FINALLY() }
+#endif // _WIN32
+
+// Initializes L0 and reports the available platforms (L0 drivers).
+//   NumEntries   - capacity of Platforms; must be non-zero if Platforms is
+//                  given.
+//   Platforms    - optional output; receives a newly allocated platform.
+//   NumPlatforms - optional output; receives the number of platforms.
+// At least one of Platforms and NumPlatforms must be non-null, otherwise
+// PI_INVALID_VALUE is returned. An uninitialized L0 (including a crash inside
+// zeInit) is reported as success with zero platforms.
+pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
+                         pi_uint32 *NumPlatforms) {
+
+  static const char *DebugMode = std::getenv("ZeDebug");
+  if (DebugMode)
+    ZeDebug = true;
+
+  static const char *SerializeMode = std::getenv("ZeSerialize");
+  static const pi_uint32 SerializeModeValue =
+      SerializeMode ? std::atoi(SerializeMode) : 0;
+  ZeSerialize = SerializeModeValue;
+
+  if (NumEntries == 0 && Platforms != nullptr) {
+    return PI_INVALID_VALUE;
+  }
+  if (Platforms == nullptr && NumPlatforms == nullptr) {
+    return PI_INVALID_VALUE;
+  }
+
+  ze_result_t ZeResult;
+  // This is a good time to initialize L0.
+  // We can still safely recover if something goes wrong during the init.
+  //
+  // NOTE: once zeInit has crashed it is not called again; the failure is
+  // remembered and reported as "no platforms" from then on.
+  //
+  // TODO: we should not call zeInit multiples times ever, so
+  // this code should be changed.
+  //
+  static bool SegFault = false;
+  __TRY() {
+    ZeResult = SegFault ? ZE_RESULT_ERROR_UNINITIALIZED
+                        : ZE_CALL_NOCHECK(zeInit(ZE_INIT_FLAG_NONE));
+  }
+  __CATCH() {
+    SegFault = true;
+    zePrint("L0 raised segfault: assume no Platforms\n");
+    ZeResult = ZE_RESULT_ERROR_UNINITIALIZED;
+  }
+  __FINALLY()
+
+  // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms.
+  if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) {
+    // NumPlatforms is optional (only one of the two outputs is required), so
+    // it must be tested rather than asserted: an assert disappears in release
+    // builds and would leave a null dereference behind.
+    if (NumPlatforms)
+      *NumPlatforms = 0;
+    return PI_SUCCESS;
+  }
+
+  if (auto Res = ZeCall::check(ZeResult, "zeInit")) {
+    return mapError(Res);
+  }
+
+  // L0 does not have concept of Platforms, but L0 driver is the
+  // closest match.
+  //
+  if (Platforms && NumEntries > 0) {
+    uint32_t ZeDriverCount = 0;
+    ZE_CALL(zeDriverGet(&ZeDriverCount, nullptr));
+    if (ZeDriverCount == 0) {
+      if (NumPlatforms)
+        *NumPlatforms = 0;
+      return PI_SUCCESS;
+    }
+
+    ze_driver_handle_t ZeDriver;
+    assert(ZeDriverCount == 1);
+    // Only a single handle fits into ZeDriver, so never ask for more even if
+    // the assert above is compiled out and several drivers are present.
+    ZeDriverCount = 1;
+    ZE_CALL(zeDriverGet(&ZeDriverCount, &ZeDriver));
+
+    // Run every query that can fail before allocating the platform, so that
+    // an early error return neither leaks it nor hands a half-initialized
+    // object to the caller.
+    ze_driver_properties_t ZeDriverProperties = {};
+    ZE_CALL(zeDriverGetProperties(ZeDriver, &ZeDriverProperties));
+    ze_api_version_t ZeApiVersion;
+    ZE_CALL(zeDriverGetApiVersion(ZeDriver, &ZeApiVersion));
+
+    // TODO: figure out how/when to release this memory
+    *Platforms = new _pi_platform(ZeDriver);
+
+    // Cache driver properties
+    uint32_t ZeDriverVersion = ZeDriverProperties.driverVersion;
+    // Intel Level-Zero GPU driver stores version as:
+    // | 31 - 24 | 23 - 16 | 15 - 0 |
+    // |  Major  |  Minor  | Build  |
+    std::string VersionMajor =
+        std::to_string((ZeDriverVersion & 0xFF000000) >> 24);
+    std::string VersionMinor =
+        std::to_string((ZeDriverVersion & 0x00FF0000) >> 16);
+    std::string VersionBuild = std::to_string(ZeDriverVersion & 0x0000FFFF);
+    Platforms[0]->ZeDriverVersion = VersionMajor + std::string(".") +
+                                    VersionMinor + std::string(".") +
+                                    VersionBuild;
+
+    Platforms[0]->ZeDriverApiVersion =
+        std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + std::string(".") +
+        std::to_string(ZE_MINOR_VERSION(ZeApiVersion));
+  }
+
+  if (NumPlatforms)
+    *NumPlatforms = 1;
+
+  return PI_SUCCESS;
+}
+
+// Returns the platform property selected by ParamName through the usual
+// ParamValue / ParamValueSizeRet pair. All supported properties are strings.
+// An unsupported ParamName ends in die().
+pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName,
+                            size_t ParamValueSize, void *ParamValue,
+                            size_t *ParamValueSizeRet) {
+
+  assert(Platform);
+  zePrint("==========================\n");
+  zePrint("SYCL over Level-Zero %s\n", Platform->ZeDriverVersion.c_str());
+  zePrint("==========================\n");
+
+  switch (ParamName) {
+  case PI_PLATFORM_INFO_NAME:
+    // TODO: Query L0 driver when relevant info is added there.
+    SET_PARAM_VALUE_STR("Intel(R) Level-Zero");
+    break;
+  case PI_PLATFORM_INFO_VENDOR:
+    // TODO: Query L0 driver when relevant info is added there.
+    SET_PARAM_VALUE_STR("Intel(R) Corporation");
+    break;
+  case PI_PLATFORM_INFO_EXTENSIONS:
+    // Convention adopted from OpenCL:
+    //     "Returns a space-separated list of extension names (the extension
+    // names themselves do not contain any spaces) supported by the platform.
+    // Extensions defined here must be supported by all devices associated
+    // with this platform."
+    //
+    // TODO: Check the common extensions supported by all connected devices and
+    // return them. For now, hardcoding some extensions we know are supported by
+    // all Level0 devices.
+    SET_PARAM_VALUE_STR(ZE_SUPPORTED_EXTENSIONS);
+    break;
+  case PI_PLATFORM_INFO_PROFILE:
+    // TODO: figure out what this means and how is this used
+    SET_PARAM_VALUE_STR("FULL_PROFILE");
+    break;
+  case PI_PLATFORM_INFO_VERSION:
+    // TODO: this should query to zeDriverGetDriverVersion
+    // but we don't yet have the driver handle here.
+    //
+    // From OpenCL 2.1: "This version string has the following format:
+    // OpenCL<space><major_version.minor_version><space><platform-specific
+    // information>. Follow the same notation here.
+    //
+    SET_PARAM_VALUE_STR(Platform->ZeDriverApiVersion.c_str());
+    break;
+  default:
+    // TODO: implement other parameters
+    die("Unsupported ParamName in piPlatformGetInfo");
+  }
+
+  return PI_SUCCESS;
+}
+
+// Reports the L0 devices of Platform.
+//   DeviceType - only requests that include PI_DEVICE_TYPE_GPU yield devices.
+//   NumEntries - capacity of Devices.
+//   Devices    - optional output; receives up to NumEntries newly created and
+//                initialized devices.
+//   NumDevices - optional output; receives the total number of devices.
+pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,
+                       pi_uint32 NumEntries, pi_device *Devices,
+                       pi_uint32 *NumDevices) {
+
+  assert(Platform);
+  ze_driver_handle_t ZeDriver = Platform->ZeDriver;
+
+  // Get number of devices supporting L0
+  uint32_t ZeDeviceCount = 0;
+  const bool AskingForGPU = (DeviceType & PI_DEVICE_TYPE_GPU);
+  ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, nullptr));
+  if (ZeDeviceCount == 0 || !AskingForGPU) {
+    if (NumDevices)
+      *NumDevices = 0;
+    return PI_SUCCESS;
+  }
+
+  if (NumDevices)
+    *NumDevices = ZeDeviceCount;
+
+  // A pure count query has nothing to fill in; returning here also avoids
+  // dereferencing a null Devices pointer below.
+  if (Devices == nullptr || NumEntries == 0)
+    return PI_SUCCESS;
+
+  // The handle array is only needed inside this function, so it is released
+  // on every path below instead of being leaked.
+  ze_device_handle_t *ZeDevices = new ze_device_handle_t[ZeDeviceCount];
+  // ZE_CALL cannot be used here because its early return would skip the
+  // delete[]; this is the same traced call with manual cleanup.
+  ze_result_t ZeResult = ZeCall().checkThis(
+      zeDeviceGet(ZeDriver, &ZeDeviceCount, ZeDevices),
+      "zeDeviceGet(ZeDriver, &ZeDeviceCount, ZeDevices)", true);
+  if (ZeResult != ZE_RESULT_SUCCESS) {
+    delete[] ZeDevices;
+    return mapError(ZeResult);
+  }
+
+  pi_result Result = PI_SUCCESS;
+  for (uint32_t I = 0; I < ZeDeviceCount && I < NumEntries; ++I) {
+    // TODO: add check for device type
+    Devices[I] = new _pi_device(ZeDevices[I], Platform);
+    Result = Devices[I]->initialize();
+    if (Result != PI_SUCCESS)
+      break;
+  }
+  delete[] ZeDevices;
+  return Result;
+}
+
+// Increments the reference count of a sub-device; root devices are left
+// untouched.
+pi_result piDeviceRetain(pi_device Device) {
+  assert(Device);
+
+  // The root-device ref-count remains unchanged (always 1).
+  if (Device->IsSubDevice) {
+    ++(Device->RefCount);
+  }
+  return PI_SUCCESS;
+}
+
+// Decrements the reference count of a sub-device and destroys it when the
+// count drops to zero; root devices are left untouched.
+pi_result piDeviceRelease(pi_device Device) {
+  assert(Device);
+
+  // This must mirror piDeviceRetain: retain never increments the root-device
+  // ref-count, so decrementing it here would delete a root device on the very
+  // first release, no matter how many times it had been retained.
+  //
+  // TODO: OpenCL says root-device ref-count remains unchanged (1),
+  // but when would we free the device's data?
+  //
+  if (Device->IsSubDevice && --(Device->RefCount) == 0) {
+    // Destroy the command list used for initializations
+    ZE_CALL(zeCommandListDestroy(Device->ZeCommandListInit));
+    delete Device;
+  }
+
+  return PI_SUCCESS;
+}
+
+pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
+                          size_t ParamValueSize, void *ParamValue,
+                          size_t *ParamValueSizeRet) {
+
+  assert(Device != nullptr);
+
+  ze_device_handle_t ZeDevice = Device->ZeDevice;
+
+  uint32_t ZeAvailMemCount = 0;
+  ZE_CALL(zeDeviceGetMemoryProperties(ZeDevice, &ZeAvailMemCount, nullptr));
+  // Confirm at least one memory is available in the device
+  assert(ZeAvailMemCount > 0);
+  ze_device_memory_properties_t *ZeDeviceMemoryProperties =
+      new ze_device_memory_properties_t[ZeAvailMemCount]();
+  for (uint32_t I = 0; I < ZeAvailMemCount; I++) {
+    ZeDeviceMemoryProperties[I].version =
+        ZE_DEVICE_MEMORY_PROPERTIES_VERSION_CURRENT;
+  }
+  // TODO: cache various device properties in the PI device object,
+  // and initialize them only upon they are first requested.
+ // + ZE_CALL(zeDeviceGetMemoryProperties(ZeDevice, &ZeAvailMemCount, + ZeDeviceMemoryProperties)); + + ze_device_image_properties_t ZeDeviceImageProperties; + ZeDeviceImageProperties.version = ZE_DEVICE_IMAGE_PROPERTIES_VERSION_CURRENT; + ZE_CALL(zeDeviceGetImageProperties(ZeDevice, &ZeDeviceImageProperties)); + + ze_device_kernel_properties_t ZeDeviceKernelProperties; + ZeDeviceKernelProperties.version = + ZE_DEVICE_KERNEL_PROPERTIES_VERSION_CURRENT; + ZE_CALL(zeDeviceGetKernelProperties(ZeDevice, &ZeDeviceKernelProperties)); + + ze_device_cache_properties_t ZeDeviceCacheProperties; + ZeDeviceCacheProperties.version = ZE_DEVICE_CACHE_PROPERTIES_VERSION_CURRENT; + ZE_CALL(zeDeviceGetCacheProperties(ZeDevice, &ZeDeviceCacheProperties)); + + switch (ParamName) { + case PI_DEVICE_INFO_TYPE: { + if (Device->ZeDeviceProperties.type == ZE_DEVICE_TYPE_GPU) { + SET_PARAM_VALUE(PI_DEVICE_TYPE_GPU); + } else { // ZE_DEVICE_TYPE_FPGA + zePrint("FPGA not supported\n"); + return PI_INVALID_VALUE; + } + break; + } + case PI_DEVICE_INFO_PARENT_DEVICE: + // TODO: all L0 devices are parent ? + SET_PARAM_VALUE(pi_device{0}); + break; + case PI_DEVICE_INFO_PLATFORM: + SET_PARAM_VALUE(Device->Platform); + break; + case PI_DEVICE_INFO_VENDOR_ID: + SET_PARAM_VALUE(pi_uint32{Device->ZeDeviceProperties.vendorId}); + break; + case PI_DEVICE_INFO_EXTENSIONS: { + // Convention adopted from OpenCL: + // "Returns a space separated list of extension names (the extension + // names themselves do not contain any spaces) supported by the device." + // + // TODO: Use proper mechanism to get this information from Level0 after + // it is added to Level0. + // Hardcoding the few we know are supported by the current hardware. + // + // + std::string SupportedExtensions; + + // cl_khr_il_program - OpenCL 2.0 KHR extension for SPIRV support. Core + // feature in >OpenCL 2.1 + // cl_khr_subgroups - Extension adds support for implementation-controlled + // subgroups. 
+ // cl_intel_subgroups - Extension adds subgroup features, defined by + // Intel. cl_intel_subgroups_short - Extension adds subgroup functions + // described in + // the cl_intel_subgroups extension to support 16-bit integer data types + // for performance. + // cl_intel_required_subgroup_size - Extension to allow programmers to + // optionally specify the required subgroup size for a kernel function. + // cl_khr_fp16 - Optional half floating-point support. + // cl_khr_fp64 - Support for double floating-point precision. + // cl_khr_int64_base_atomics, cl_khr_int64_extended_atomics - Optional + // extensions that implement atomic operations on 64-bit signed and + // unsigned integers to locations in __global and __local memory. + // cl_khr_3d_image_writes - Extension to enable writes to 3D image memory + // objects. + // + // Hardcoding some extensions we know are supported by all Level0 devices. + SupportedExtensions += (ZE_SUPPORTED_EXTENSIONS); + if (ZeDeviceKernelProperties.fp16Supported) + SupportedExtensions += ("cl_khr_fp16 "); + if (ZeDeviceKernelProperties.fp64Supported) + SupportedExtensions += ("cl_khr_fp64 "); + if (ZeDeviceKernelProperties.int64AtomicsSupported) + // int64AtomicsSupported indicates support for both. + SupportedExtensions += + ("cl_khr_int64_base_atomics cl_khr_int64_extended_atomics "); + if (ZeDeviceImageProperties.supported) + // Supports reading and writing of images. 
+ SupportedExtensions += ("cl_khr_3d_image_writes "); + + SET_PARAM_VALUE_STR(SupportedExtensions.c_str()); + break; + } + case PI_DEVICE_INFO_NAME: + SET_PARAM_VALUE_STR(Device->ZeDeviceProperties.name); + break; + case PI_DEVICE_INFO_COMPILER_AVAILABLE: + SET_PARAM_VALUE(pi_bool{1}); + break; + case PI_DEVICE_INFO_LINKER_AVAILABLE: + SET_PARAM_VALUE(pi_bool{1}); + break; + case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: { + pi_uint32 MaxComputeUnits = + Device->ZeDeviceProperties.numEUsPerSubslice * + Device->ZeDeviceProperties.numSubslicesPerSlice * + Device->ZeDeviceProperties.numSlices; + SET_PARAM_VALUE(pi_uint32{MaxComputeUnits}); + break; + } + case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: + // L0 spec defines only three dimensions + SET_PARAM_VALUE(pi_uint32{3}); + break; + case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE: + SET_PARAM_VALUE( + pi_uint64{Device->ZeDeviceComputeProperties.maxTotalGroupSize}); + break; + case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { + struct { + size_t Arr[3]; + } MaxGroupSize = {{Device->ZeDeviceComputeProperties.maxGroupSizeX, + Device->ZeDeviceComputeProperties.maxGroupSizeY, + Device->ZeDeviceComputeProperties.maxGroupSizeZ}}; + SET_PARAM_VALUE(MaxGroupSize); + break; + } + case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: + SET_PARAM_VALUE(pi_uint32{Device->ZeDeviceProperties.coreClockRate}); + break; + case PI_DEVICE_INFO_ADDRESS_BITS: { + // TODO: To confirm with spec. + SET_PARAM_VALUE(pi_uint32{64}); + break; + } + case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { + // TODO: To confirm with spec. 
+    // Accumulate in 64 bits: the result is reported as pi_uint64, and a
+    // 32-bit accumulator wraps as soon as the memories total 4 GiB or more.
+    pi_uint64 MaxMemAllocSize = 0;
+    for (uint32_t I = 0; I < ZeAvailMemCount; I++) {
+      MaxMemAllocSize += ZeDeviceMemoryProperties[I].totalSize;
+    }
+    SET_PARAM_VALUE(pi_uint64{MaxMemAllocSize});
+    break;
+  }
+  case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: {
+    // Sum of all device memories; 64-bit for the same reason as above.
+    pi_uint64 GlobalMemSize = 0;
+    for (uint32_t I = 0; I < ZeAvailMemCount; I++) {
+      GlobalMemSize += ZeDeviceMemoryProperties[I].totalSize;
+    }
+    SET_PARAM_VALUE(pi_uint64{GlobalMemSize});
+    break;
+  }
+  case PI_DEVICE_INFO_LOCAL_MEM_SIZE:
+    SET_PARAM_VALUE(
+        pi_uint64{Device->ZeDeviceComputeProperties.maxSharedLocalMemory});
+    break;
+  case PI_DEVICE_INFO_IMAGE_SUPPORT:
+    SET_PARAM_VALUE(pi_bool{ZeDeviceImageProperties.supported});
+    break;
+  case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY:
+    SET_PARAM_VALUE(pi_bool{Device->ZeDeviceProperties.unifiedMemorySupported});
+    break;
+  case PI_DEVICE_INFO_AVAILABLE:
+    // A device counts as available whenever it has a non-null L0 handle.
+    SET_PARAM_VALUE(pi_bool{ZeDevice ? true : false});
+    break;
+  case PI_DEVICE_INFO_VENDOR:
+    // TODO: Level-Zero does not return vendor's name at the moment
+    // only the ID.
+    SET_PARAM_VALUE_STR("Intel(R) Corporation");
+    break;
+  case PI_DEVICE_INFO_DRIVER_VERSION:
+    SET_PARAM_VALUE_STR(Device->Platform->ZeDriverVersion.c_str());
+    break;
+  case PI_DEVICE_INFO_VERSION:
+    SET_PARAM_VALUE_STR(Device->Platform->ZeDriverApiVersion.c_str());
+    break;
+  case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
+    // Count-only query: a null array asks L0 just for the number of
+    // sub-devices.
+    uint32_t ZeSubDeviceCount = 0;
+    ZE_CALL(zeDeviceGetSubDevices(ZeDevice, &ZeSubDeviceCount, nullptr));
+    SET_PARAM_VALUE(pi_uint32{ZeSubDeviceCount});
+    break;
+  }
+  case PI_DEVICE_INFO_REFERENCE_COUNT:
+    SET_PARAM_VALUE(pi_uint32{Device->RefCount});
+    break;
+  case PI_DEVICE_INFO_PARTITION_PROPERTIES: {
+    //
+    // It is debatable if SYCL sub-device and partitioning APIs sufficient to
+    // expose Level0 sub-devices? We start with support of
+    // "partition_by_affinity_domain" and "numa" but if that doesn't seem to
+    // be a good fit we could look at adding a more descriptive partitioning
+    // type.
+ // + struct { + pi_device_partition_property Arr[2]; + } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, 0}}; + SET_PARAM_VALUE(PartitionProperties); + break; + } + case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: + SET_PARAM_VALUE(pi_device_affinity_domain{ + PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE}); + break; + case PI_DEVICE_INFO_PARTITION_TYPE: { + if (Device->IsSubDevice) { + struct { + pi_device_partition_property Arr[3]; + } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, + 0}}; + SET_PARAM_VALUE(PartitionProperties); + } else { + // For root-device there is no partitioning to report. + SET_PARAM_VALUE(pi_device_partition_property{0}); + } + break; + } + + // Everything under here is not supported yet + + case PI_DEVICE_INFO_OPENCL_C_VERSION: + SET_PARAM_VALUE_STR(""); + break; + case PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: + SET_PARAM_VALUE(pi_bool{true}); + break; + case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: + SET_PARAM_VALUE(size_t{ZeDeviceKernelProperties.printfBufferSize}); + break; + case PI_DEVICE_INFO_PROFILE: + SET_PARAM_VALUE_STR("FULL_PROFILE"); + break; + case PI_DEVICE_INFO_BUILT_IN_KERNELS: + // TODO: To find out correct value + SET_PARAM_VALUE_STR(""); + break; + case PI_DEVICE_INFO_QUEUE_PROPERTIES: + SET_PARAM_VALUE(pi_queue_properties{PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | + PI_QUEUE_PROFILING_ENABLE}); + break; + case PI_DEVICE_INFO_EXECUTION_CAPABILITIES: + SET_PARAM_VALUE( + pi_device_exec_capabilities{PI_DEVICE_EXEC_CAPABILITIES_NATIVE_KERNEL}); + break; + case PI_DEVICE_INFO_ENDIAN_LITTLE: + SET_PARAM_VALUE(pi_bool{true}); + break; + case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: + SET_PARAM_VALUE(pi_bool{Device->ZeDeviceProperties.eccMemorySupported}); + break; + case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: + SET_PARAM_VALUE(size_t{Device->ZeDeviceProperties.timerResolution}); + break; + case PI_DEVICE_INFO_LOCAL_MEM_TYPE: + 
SET_PARAM_VALUE(PI_DEVICE_LOCAL_MEM_TYPE_LOCAL); + break; + case PI_DEVICE_INFO_MAX_CONSTANT_ARGS: + SET_PARAM_VALUE(pi_uint32{64}); + break; + case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: + SET_PARAM_VALUE(pi_uint64{ZeDeviceImageProperties.maxImageBufferSize}); + break; + case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: + SET_PARAM_VALUE(PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); + break; + case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: + SET_PARAM_VALUE(pi_uint32{ZeDeviceCacheProperties.lastLevelCachelineSize}); + break; + case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: + SET_PARAM_VALUE(pi_uint64{ZeDeviceCacheProperties.lastLevelCacheSize}); + break; + case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: + SET_PARAM_VALUE(size_t{ZeDeviceKernelProperties.maxArgumentsSize}); + break; + case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: + // SYCL/OpenCL spec is vague on what this means exactly, but seems to + // be for "alignment requirement (in bits) for sub-buffer offsets." + // An OpenCL implementation returns 8*128, but L0 can do just 8, + // meaning unaligned access for values of types larger than 8 bits. 
+ // + SET_PARAM_VALUE(pi_uint32{8}); + break; + case PI_DEVICE_INFO_MAX_SAMPLERS: + SET_PARAM_VALUE(pi_uint32{ZeDeviceImageProperties.maxSamplers}); + break; + case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS: + SET_PARAM_VALUE(pi_uint32{ZeDeviceImageProperties.maxReadImageArgs}); + break; + case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: + SET_PARAM_VALUE(pi_uint32{ZeDeviceImageProperties.maxWriteImageArgs}); + break; + case PI_DEVICE_INFO_SINGLE_FP_CONFIG: { + uint64_t SingleFPValue = 0; + ze_fp_capabilities_t ZeSingleFPCapabilities = + ZeDeviceKernelProperties.singleFpCapabilities; + if (ZE_FP_CAPS_DENORM & ZeSingleFPCapabilities) { + SingleFPValue |= PI_FP_DENORM; + } + if (ZE_FP_CAPS_INF_NAN & ZeSingleFPCapabilities) { + SingleFPValue |= PI_FP_INF_NAN; + } + if (ZE_FP_CAPS_ROUND_TO_NEAREST & ZeSingleFPCapabilities) { + SingleFPValue |= PI_FP_ROUND_TO_NEAREST; + } + if (ZE_FP_CAPS_ROUND_TO_ZERO & ZeSingleFPCapabilities) { + SingleFPValue |= PI_FP_ROUND_TO_ZERO; + } + if (ZE_FP_CAPS_ROUND_TO_INF & ZeSingleFPCapabilities) { + SingleFPValue |= PI_FP_ROUND_TO_INF; + } + if (ZE_FP_CAPS_FMA & ZeSingleFPCapabilities) { + SingleFPValue |= PI_FP_FMA; + } + SET_PARAM_VALUE(pi_uint64{SingleFPValue}); + break; + } + case PI_DEVICE_INFO_HALF_FP_CONFIG: { + uint64_t HalfFPValue = 0; + ze_fp_capabilities_t ZeHalfFPCapabilities = + ZeDeviceKernelProperties.halfFpCapabilities; + if (ZE_FP_CAPS_DENORM & ZeHalfFPCapabilities) { + HalfFPValue |= PI_FP_DENORM; + } + if (ZE_FP_CAPS_INF_NAN & ZeHalfFPCapabilities) { + HalfFPValue |= PI_FP_INF_NAN; + } + if (ZE_FP_CAPS_ROUND_TO_NEAREST & ZeHalfFPCapabilities) { + HalfFPValue |= PI_FP_ROUND_TO_NEAREST; + } + if (ZE_FP_CAPS_ROUND_TO_ZERO & ZeHalfFPCapabilities) { + HalfFPValue |= PI_FP_ROUND_TO_ZERO; + } + if (ZE_FP_CAPS_ROUND_TO_INF & ZeHalfFPCapabilities) { + HalfFPValue |= PI_FP_ROUND_TO_INF; + } + if (ZE_FP_CAPS_FMA & ZeHalfFPCapabilities) { + HalfFPValue |= PI_FP_FMA; + } + SET_PARAM_VALUE(pi_uint64{HalfFPValue}); + break; + } + case 
PI_DEVICE_INFO_DOUBLE_FP_CONFIG: { + uint64_t DoubleFPValue = 0; + ze_fp_capabilities_t ZeDoubleFPCapabilities = + ZeDeviceKernelProperties.doubleFpCapabilities; + if (ZE_FP_CAPS_DENORM & ZeDoubleFPCapabilities) { + DoubleFPValue |= PI_FP_DENORM; + } + if (ZE_FP_CAPS_INF_NAN & ZeDoubleFPCapabilities) { + DoubleFPValue |= PI_FP_INF_NAN; + } + if (ZE_FP_CAPS_ROUND_TO_NEAREST & ZeDoubleFPCapabilities) { + DoubleFPValue |= PI_FP_ROUND_TO_NEAREST; + } + if (ZE_FP_CAPS_ROUND_TO_ZERO & ZeDoubleFPCapabilities) { + DoubleFPValue |= PI_FP_ROUND_TO_ZERO; + } + if (ZE_FP_CAPS_ROUND_TO_INF & ZeDoubleFPCapabilities) { + DoubleFPValue |= PI_FP_ROUND_TO_INF; + } + if (ZE_FP_CAPS_FMA & ZeDoubleFPCapabilities) { + DoubleFPValue |= PI_FP_FMA; + } + SET_PARAM_VALUE(pi_uint64{DoubleFPValue}); + break; + } + case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: + // Until L0 provides needed info, hardcode default minimum values required + // by the SYCL specification. + // + SET_PARAM_VALUE(size_t{8192}); + break; + case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: + // Until L0 provides needed info, hardcode default minimum values required + // by the SYCL specification. + // + SET_PARAM_VALUE(size_t{8192}); + break; + case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: + // Until L0 provides needed info, hardcode default minimum values required + // by the SYCL specification. + // + SET_PARAM_VALUE(size_t{2048}); + break; + case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: + // Until L0 provides needed info, hardcode default minimum values required + // by the SYCL specification. + // + SET_PARAM_VALUE(size_t{2048}); + break; + case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: + // Until L0 provides needed info, hardcode default minimum values required + // by the SYCL specification. 
+ // + SET_PARAM_VALUE(size_t{2048}); + break; + case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: + SET_PARAM_VALUE(size_t{ZeDeviceImageProperties.maxImageBufferSize}); + break; + case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: + SET_PARAM_VALUE(size_t{ZeDeviceImageProperties.maxImageArraySlices}); + break; + // + // Handle SIMD widths. + // TODO: can we do better than this? + // + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: + SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 1); + break; + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: + SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 2); + break; + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: + SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 4); + break; + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: + SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 8); + break; + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: + SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 4); + break; + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: + SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 8); + break; + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: + SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 2); + break; + case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Max_num_sub_Groups = + // maxTotalGroupSize/min(set + // of subGroupSizes); + uint32_t MinSubGroupSize = + Device->ZeDeviceComputeProperties.subGroupSizes[0]; + for (uint32_t I = 1; I < Device->ZeDeviceComputeProperties.numSubGroupSizes; + I++) { + if (MinSubGroupSize > 
Device->ZeDeviceComputeProperties.subGroupSizes[I]) + MinSubGroupSize = Device->ZeDeviceComputeProperties.subGroupSizes[I]; + } + SET_PARAM_VALUE(Device->ZeDeviceComputeProperties.maxTotalGroupSize / + MinSubGroupSize); + break; + } + case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { + // TODO: Not supported yet. Needs to be updated after support is added. + SET_PARAM_VALUE(pi_bool{false}); + break; + } + case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the + // expected return is size_t datatype. size_t can be 8 bytes of data. + SET_PARAM_VALUE_VLA(Device->ZeDeviceComputeProperties.subGroupSizes, + Device->ZeDeviceComputeProperties.numSubGroupSizes, + size_t); + break; + } + case PI_DEVICE_INFO_IL_VERSION: { + // Set to a space separated list of IL version strings of the form + // _.. + // "SPIR-V" is a required IL prefix when cl_khr_il_progam extension is + // reported. + uint32_t SpirvVersion = ZeDeviceKernelProperties.spirvVersionSupported; + uint32_t SpirvVersionMajor = ZE_MAJOR_VERSION(SpirvVersion); + uint32_t SpirvVersionMinor = ZE_MINOR_VERSION(SpirvVersion); + + char SpirvVersionString[50]; + int Len = sprintf(SpirvVersionString, "SPIR-V_%d.%d ", SpirvVersionMajor, + SpirvVersionMinor); + // returned string to contain only len number of characters. 
+ std::string ILVersion(SpirvVersionString, Len); + SET_PARAM_VALUE_STR(ILVersion.c_str()); + break; + } + case PI_DEVICE_INFO_USM_HOST_SUPPORT: + case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: + case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: + case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: + case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { + pi_uint64 Supported = 0; + if (Device->ZeDeviceProperties.unifiedMemorySupported) { + // TODO: Use + // ze_memory_access_capabilities_t + Supported = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | + PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; + } + SET_PARAM_VALUE(Supported); + break; + } + default: + zePrint("Unsupported ParamName in piGetDeviceInfo\n"); + zePrint("ParamName=%d(0x%x)\n", ParamName, ParamName); + return PI_INVALID_VALUE; + } + + return PI_SUCCESS; +} + +pi_result piDevicePartition(pi_device Device, + const pi_device_partition_property *Properties, + pi_uint32 NumDevices, pi_device *OutDevices, + pi_uint32 *OutNumDevices) { + // Other partitioning ways are not supported by L0 + if (Properties[0] != PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN || + Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE) { + return PI_INVALID_VALUE; + } + + assert(Device); + // Get the number of subdevices available. + // TODO: maybe add interface to create the specified # of subdevices. + uint32_t Count = 0; + ZE_CALL(zeDeviceGetSubDevices(Device->ZeDevice, &Count, nullptr)); + + // Check that the requested/allocated # of sub-devices is the same + // as was reported by the above call. + // TODO: we may want to support smaller/larger # devices too. + if (Count != NumDevices) { + zePrint("piDevicePartition: unsupported # of sub-devices requested\n"); + return PI_INVALID_OPERATION; + } + + if (OutNumDevices) { + *OutNumDevices = Count; + } + + if (!OutDevices) { + // If we are not given the buffer, we are done. 
+    return PI_SUCCESS;
+  }
+
+  auto ZeSubdevices = new ze_device_handle_t[Count];
+  // Scope guard: frees the temporary handle array on every exit, including
+  // an early exit from ZE_CALL (defined outside this view; presumably it
+  // returns on failure), which previously leaked it.
+  struct ZeSubdevicesGuard {
+    ze_device_handle_t *Ptr;
+    ~ZeSubdevicesGuard() { delete[] Ptr; }
+  } ZeSubdevicesOwner{ZeSubdevices};
+  ZE_CALL(zeDeviceGetSubDevices(Device->ZeDevice, &Count, ZeSubdevices));
+
+  // Wrap the L0 sub-devices into PI sub-devices, and write them out.
+  for (uint32_t I = 0; I < Count; ++I) {
+    OutDevices[I] = new _pi_device(ZeSubdevices[I], Device->Platform,
+                                   true /* isSubDevice */);
+    pi_result Result = OutDevices[I]->initialize();
+    if (Result != PI_SUCCESS) {
+      return Result;
+    }
+  }
+  return PI_SUCCESS;
+}
+
+// Picks which of the offered device binaries to use for Device.
+//
+// Placeholder policy: the first binary is always selected. When no binary is
+// offered, *SelectedBinaryInd is set to the maximum pi_uint32 value as an
+// "invalid index" marker. Device and Binaries are not inspected.
+// Always returns PI_SUCCESS.
+pi_result
+piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context?
+                        pi_device_binary *Binaries, pi_uint32 NumBinaries,
+                        pi_uint32 *SelectedBinaryInd) {
+
+  // TODO dummy implementation.
+  // Real implementation will use the same mechanism OpenCL ICD dispatcher
+  // uses. Something like:
+  //   PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_INVALID_CONTEXT);
+  //     return context->dispatch->piextDeviceSelectIR(
+  //       ctx, images, num_images, selected_image);
+  // where context->dispatch is set to the dispatch table provided by PI
+  // plugin for platform/device the ctx was created for.
+
+  // The output pointer is written unconditionally below.
+  assert(SelectedBinaryInd);
+
+  constexpr pi_uint32 InvalidInd = std::numeric_limits<pi_uint32>::max();
+  *SelectedBinaryInd = NumBinaries > 0 ? 0 : InvalidInd;
+  return PI_SUCCESS;
+}
+
+// Exposes the underlying L0 device handle of a PI device.
+//
+// The storage behind NativeHandle is reinterpreted as a ze_device_handle_t
+// slot and receives Device->ZeDevice. Always returns PI_SUCCESS.
+pi_result piextDeviceGetNativeHandle(pi_device Device,
+                                     pi_native_handle *NativeHandle) {
+  assert(Device);
+  assert(NativeHandle);
+
+  auto ZeDevice = pi_cast<ze_device_handle_t *>(NativeHandle);
+  // Extract the L0 device handle from the given PI device
+  *ZeDevice = Device->ZeDevice;
+  return PI_SUCCESS;
+}
+
+pi_result piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle,
+                                            pi_device *Device) {
+  // Create PI device from the given L0 device handle.
+  die("piextDeviceCreateWithNativeHandle: not supported");
+  return PI_SUCCESS;
+}
+
+// Creates a PI context for exactly one device.
+//
+// Returns PI_INVALID_VALUE when NumDevices is not 1; otherwise stores a new
+// _pi_context built from the first entry of Devices in *RetContext and
+// returns PI_SUCCESS. Properties, PFnNotify and UserData are not used.
+pi_result piContextCreate(const pi_context_properties *Properties,
+                          pi_uint32 NumDevices, const pi_device *Devices,
+                          void (*PFnNotify)(const char *ErrInfo,
+                                            const void *PrivateInfo, size_t CB,
+                                            void *UserData),
+                          void *UserData, pi_context *RetContext) {
+
+  // L0 does not have notion of contexts.
+  // Return the device handle (only single device is allowed) as a context
+  // handle.
+  //
+  if (NumDevices != 1) {
+    // NOTE(review): the message names "piCreateContext", but this function
+    // is piContextCreate.
+    zePrint("piCreateContext: context should have exactly one Device\n");
+    return PI_INVALID_VALUE;
+  }
+
+  assert(Devices);
+  assert(RetContext);
+
+  *RetContext = new _pi_context(*Devices);
+  return PI_SUCCESS;
+}
+
+// Reports one property of a context: its device, the device count (always
+// 1), or its reference count. Any other ParamName ends up in die().
+pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName,
+                           size_t ParamValueSize, void *ParamValue,
+                           size_t *ParamValueSizeRet) {
+
+  assert(Context);
+
+  if (ParamName == PI_CONTEXT_INFO_DEVICES) {
+    SET_PARAM_VALUE(Context->Device);
+  } else if (ParamName == PI_CONTEXT_INFO_NUM_DEVICES) {
+    SET_PARAM_VALUE(pi_uint32{1});
+  } else if (ParamName == PI_CONTEXT_INFO_REFERENCE_COUNT) {
+    SET_PARAM_VALUE(pi_uint32{Context->RefCount});
+  } else {
+    // TODO: implement other parameters
+    // NOTE(review): die() is defined outside this view; presumably it does
+    // not return. The message also misnames this function and misspells
+    // "unsupported".
+    die("piGetContextInfo: unsuppported ParamName.");
+  }
+
+  return PI_SUCCESS;
+}
+
+// FIXME: Dummy implementation to prevent link fail
+// Not supported: unconditionally calls die(); no parameter is used.
+pi_result piextContextSetExtendedDeleter(pi_context Context,
+                                         pi_context_extended_deleter Function,
+                                         void *UserData) {
+  die("piextContextSetExtendedDeleter: not supported");
+  return PI_SUCCESS;
+}
+
+// Not supported: unconditionally calls die(); no parameter is used.
+pi_result piextContextGetNativeHandle(pi_context Context,
+                                      pi_native_handle *NativeHandle) {
+  die("piextContextGetNativeHandle: not supported");
+  return PI_SUCCESS;
+}
+
+// Not supported: unconditionally calls die(); no parameter is used.
+pi_result piextContextCreateWithNativeHandle(pi_native_handle NativeHandle,
+                                             pi_context *Context) {
+  die("piextContextCreateWithNativeHandle: not supported");
+  return PI_SUCCESS;
+}
+
+// Adds one reference to the context.
+pi_result piContextRetain(pi_context Context) {
+
+  assert(Context);
+  ++(Context->RefCount);
+  return PI_SUCCESS;
+}
+
+// Drops one reference to the context and deletes the context object when
+// the count reaches zero.
+pi_result piContextRelease(pi_context Context) {
+
+  assert(Context);
+  if (--(Context->RefCount) == 0) {
+    delete Context;
+  }
+  return PI_SUCCESS;
+}
+
+// Creates a PI queue backed by a new asynchronous L0 command queue.
+//
+// Returns PI_INVALID_CONTEXT for a null Context and PI_INVALID_DEVICE when
+// Device is not the context's device. On success *Queue receives a new
+// _pi_queue wrapping the L0 queue. Properties is only sanity-checked by the
+// assert; it is not yet translated into the L0 descriptor (see the TODO).
+pi_result piQueueCreate(pi_context Context, pi_device Device,
+                        pi_queue_properties Properties, pi_queue *Queue) {
+
+  // Check that unexpected bits are not set.
+  assert(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                          PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE |
+                          PI_QUEUE_ON_DEVICE_DEFAULT)));
+
+  ze_device_handle_t ZeDevice;
+  ze_command_queue_handle_t ZeCommandQueue;
+
+  if (!Context) {
+    return PI_INVALID_CONTEXT;
+  }
+  if (Context->Device != Device) {
+    return PI_INVALID_DEVICE;
+  }
+
+  assert(Device);
+  ZeDevice = Device->ZeDevice;
+  ze_command_queue_desc_t ZeCommandQueueDesc = {};
+  ZeCommandQueueDesc.version = ZE_COMMAND_QUEUE_DESC_VERSION_CURRENT;
+  ZeCommandQueueDesc.ordinal = 0;
+  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
+
+  ZE_CALL(
+      zeCommandQueueCreate(ZeDevice,
+                           &ZeCommandQueueDesc, // TODO: translate properties
+                           &ZeCommandQueue));
+
+  // NOTE(review): Queue is only checked after the L0 queue already exists.
+  assert(Queue);
+  *Queue = new _pi_queue(ZeCommandQueue, Context);
+  return PI_SUCCESS;
+}
+
+// Reports one property of a queue: its context, its device (taken from the
+// context) or its reference count. The PROPERTIES, SIZE and DEVICE_DEFAULT
+// queries call die(); any other ParamName yields PI_INVALID_VALUE.
+pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName,
+                         size_t ParamValueSize, void *ParamValue,
+                         size_t *ParamValueSizeRet) {
+
+  assert(Queue);
+
+  // TODO: consider support for queue properties and size
+  switch (ParamName) {
+  case PI_QUEUE_INFO_CONTEXT:
+    SET_PARAM_VALUE(Queue->Context);
+    break;
+  case PI_QUEUE_INFO_DEVICE:
+    SET_PARAM_VALUE(Queue->Context->Device);
+    break;
+  case PI_QUEUE_INFO_REFERENCE_COUNT:
+    SET_PARAM_VALUE(pi_uint32{Queue->RefCount});
+    break;
+  case PI_QUEUE_INFO_PROPERTIES:
+    die("PI_QUEUE_INFO_PROPERTIES in piQueueGetInfo not implemented\n");
+    break;
+  case PI_QUEUE_INFO_SIZE:
+    die("PI_QUEUE_INFO_SIZE in piQueueGetInfo not implemented\n");
+    break;
+  case PI_QUEUE_INFO_DEVICE_DEFAULT:
+    die("PI_QUEUE_INFO_DEVICE_DEFAULT in piQueueGetInfo
not implemented\n");
+    break;
+  default:
+    zePrint("Unsupported ParamName in piQueueGetInfo: ParamName=%d(0x%x)\n",
+            ParamName, ParamName);
+    return PI_INVALID_VALUE;
+  }
+
+  return PI_SUCCESS;
+}
+
+// Adds one reference to the queue.
+pi_result piQueueRetain(pi_queue Queue) {
+  // Check the handle before dereferencing it, as the sibling entry points do.
+  assert(Queue);
+  ++(Queue->RefCount);
+  return PI_SUCCESS;
+}
+
+// Drops one reference to the queue. When the count reaches zero the L0
+// command queue is destroyed and the PI queue object is freed.
+pi_result piQueueRelease(pi_queue Queue) {
+  assert(Queue);
+  if (--(Queue->RefCount) == 0) {
+    ZE_CALL(zeCommandQueueDestroy(Queue->ZeCommandQueue));
+    // The _pi_queue was allocated with new in piQueueCreate and was never
+    // freed; delete it here, matching what the context, device and memory
+    // release functions do for their objects.
+    // NOTE(review): assumes nothing dereferences the queue after its last
+    // release (e.g. events keeping a raw queue pointer) - confirm.
+    delete Queue;
+  }
+  return PI_SUCCESS;
+}
+
+// Blocks until the work submitted to the queue has completed; UINT32_MAX is
+// passed as the synchronization timeout.
+pi_result piQueueFinish(pi_queue Queue) {
+  // Wait until command lists attached to the command queue are executed.
+  assert(Queue);
+  ZE_CALL(zeCommandQueueSynchronize(Queue->ZeCommandQueue, UINT32_MAX));
+  return PI_SUCCESS;
+}
+
+// Not supported: unconditionally calls die(); no parameter is used.
+pi_result piextQueueGetNativeHandle(pi_queue Queue,
+                                    pi_native_handle *NativeHandle) {
+  die("piextQueueGetNativeHandle: not supported");
+  return PI_SUCCESS;
+}
+
+// Not supported: unconditionally calls die(); no parameter is used.
+pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
+                                           pi_queue *Queue) {
+  die("piextQueueCreateWithNativeHandle: not supported");
+  return PI_SUCCESS;
+}
+
+// Allocates Size bytes of device memory for the context's device and wraps
+// it in a PI buffer returned through *RetMem. When Flags request
+// HOST_PTR_USE or HOST_PTR_COPY, a copy from HostPtr is appended to the
+// device's initialization command list.
+pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
+                            void *HostPtr, pi_mem *RetMem) {
+
+  // TODO: implement read-only, write-only
+  assert((Flags & PI_MEM_FLAGS_ACCESS_RW) != 0);
+  assert(Context);
+  assert(RetMem);
+
+  void *Ptr;
+  ze_device_handle_t ZeDevice = Context->Device->ZeDevice;
+
+  ze_device_mem_alloc_desc_t ZeDesc = {};
+  ZeDesc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_DEFAULT;
+  ZeDesc.ordinal = 0;
+  ZE_CALL(zeDriverAllocDeviceMem(Context->Device->Platform->ZeDriver, &ZeDesc,
+                                 Size,
+                                 1, // TODO: alignment
+                                 ZeDevice, &Ptr));
+
+  if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
+      (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
+    // Initialize the buffer synchronously with immediate offload
+    ZE_CALL(zeCommandListAppendMemoryCopy(Context->Device->ZeCommandListInit,
+                                          Ptr, HostPtr, Size, nullptr));
+  } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
+    // Nothing more to do.
+  } else {
+    die("piMemBufferCreate: not implemented");
+  }
+
+  // The host pointer is only remembered when the caller asked for it to be
+  // used (HOST_PTR_USE); for HOST_PTR_COPY the data was already copied.
+  // NOTE(review): pi_cast appears here without template arguments; they
+  // seem to have been lost from this copy of the patch.
+  auto HostPtrOrNull =
+      (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast(HostPtr) : nullptr;
+  *RetMem = new _pi_buffer(Context->Device->Platform,
+                           pi_cast(Ptr) /* L0 Memory Handle */,
+                           HostPtrOrNull);
+
+  return PI_SUCCESS;
+}
+
+// Not implemented: unconditionally calls die(); no parameter is used.
+pi_result piMemGetInfo(pi_mem Mem,
+                       cl_mem_info ParamName, // TODO: untie from OpenCL
+                       size_t ParamValueSize, void *ParamValue,
+                       size_t *ParamValueSizeRet) {
+  die("piMemGetInfo: not implemented");
+  return {};
+}
+
+// Adds one reference to the memory object.
+pi_result piMemRetain(pi_mem Mem) {
+  assert(Mem);
+  ++(Mem->RefCount);
+  return PI_SUCCESS;
+}
+
+// Drops one reference to the memory object. At zero, an image is destroyed
+// with zeImageDestroy and a buffer's device memory is freed with
+// zeDriverFreeMem - except for sub-buffers, whose memory is not freed here -
+// and then the PI object itself is deleted.
+pi_result piMemRelease(pi_mem Mem) {
+  assert(Mem);
+  if (--(Mem->RefCount) == 0) {
+    if (Mem->isImage()) {
+      // NOTE(review): pi_cast appears without its template argument here
+      // as well.
+      ZE_CALL(zeImageDestroy(pi_cast(Mem->getZeHandle())));
+    } else {
+      auto Buf = static_cast<_pi_buffer *>(Mem);
+      if (!Buf->isSubBuffer()) {
+        ZE_CALL(zeDriverFreeMem(Mem->Platform->ZeDriver, Mem->getZeHandle()));
+      }
+    }
+    delete Mem;
+  }
+  return PI_SUCCESS;
+}
+
+// Creates an L0 image for the context's device from the OpenCL-style
+// ImageFormat and ImageDesc and returns it as a PI memory object through
+// *RetImage. Returns PI_INVALID_VALUE for a channel data type or image type
+// that has no mapping in the switches of this function.
+pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags,
+                           const pi_image_format *ImageFormat,
+                           const pi_image_desc *ImageDesc, void *HostPtr,
+                           pi_mem *RetImage) {
+
+  // TODO: implement read-only, write-only
+  assert((Flags & PI_MEM_FLAGS_ACCESS_RW) != 0);
+  assert(ImageFormat);
+  assert(Context);
+  assert(RetImage);
+
+  // Map the channel data type onto an L0 format type plus the width of one
+  // channel in bits; the width selects the layout further down.
+  ze_image_format_type_t ZeImageFormatType;
+  size_t ZeImageFormatTypeSize;
+  switch (ImageFormat->image_channel_data_type) {
+  case CL_FLOAT:
+    ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT;
+    ZeImageFormatTypeSize = 32;
+    break;
+  case CL_HALF_FLOAT:
+    ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT;
+    ZeImageFormatTypeSize = 16;
+    break;
+  case CL_UNSIGNED_INT32:
+    ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
+    ZeImageFormatTypeSize = 32;
+    break;
+  case CL_UNSIGNED_INT16:
+    ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
+    ZeImageFormatTypeSize = 16;
+    break;
+  case CL_UNSIGNED_INT8:
+    ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT;
+    ZeImageFormatTypeSize = 8;
+
break; + case CL_UNORM_INT16: + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM; + ZeImageFormatTypeSize = 16; + break; + case CL_UNORM_INT8: + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM; + ZeImageFormatTypeSize = 8; + break; + case CL_SIGNED_INT32: + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; + ZeImageFormatTypeSize = 32; + break; + case CL_SIGNED_INT16: + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; + ZeImageFormatTypeSize = 16; + break; + case CL_SIGNED_INT8: + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; + ZeImageFormatTypeSize = 8; + break; + case CL_SNORM_INT16: + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM; + ZeImageFormatTypeSize = 16; + break; + case CL_SNORM_INT8: + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM; + ZeImageFormatTypeSize = 8; + break; + default: + zePrint("piMemImageCreate: unsupported image data type: data type = %d\n", + ImageFormat->image_channel_data_type); + return PI_INVALID_VALUE; + } + + // TODO: populate the layout mapping + ze_image_format_layout_t ZeImageFormatLayout; + switch (ImageFormat->image_channel_order) { + case CL_RGBA: + switch (ZeImageFormatTypeSize) { + case 8: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8; + break; + case 16: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16; + break; + case 32: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32; + break; + default: + zePrint("piMemImageCreate: unexpected data type Size\n"); + return PI_INVALID_VALUE; + } + break; + default: + zePrint("format layout = %d\n", ImageFormat->image_channel_order); + die("piMemImageCreate: unsupported image format layout\n"); + break; + } + + ze_image_format_desc_t ZeFormatDesc = { + ZeImageFormatLayout, ZeImageFormatType, + // TODO: are swizzles deducted from image_format->image_channel_order? 
+ ZE_IMAGE_FORMAT_SWIZZLE_R, ZE_IMAGE_FORMAT_SWIZZLE_G, + ZE_IMAGE_FORMAT_SWIZZLE_B, ZE_IMAGE_FORMAT_SWIZZLE_A}; + + ze_image_type_t ZeImageType; + switch (ImageDesc->image_type) { + case PI_MEM_TYPE_IMAGE1D: + ZeImageType = ZE_IMAGE_TYPE_1D; + break; + case PI_MEM_TYPE_IMAGE2D: + ZeImageType = ZE_IMAGE_TYPE_2D; + break; + case PI_MEM_TYPE_IMAGE3D: + ZeImageType = ZE_IMAGE_TYPE_3D; + break; + case PI_MEM_TYPE_IMAGE1D_ARRAY: + ZeImageType = ZE_IMAGE_TYPE_1DARRAY; + break; + case PI_MEM_TYPE_IMAGE2D_ARRAY: + ZeImageType = ZE_IMAGE_TYPE_2DARRAY; + break; + default: + zePrint("piMemImageCreate: unsupported image type\n"); + return PI_INVALID_VALUE; + } + + ze_image_desc_t ZeImageDesc = { + ZE_IMAGE_DESC_VERSION_CURRENT, + pi_cast(ZE_IMAGE_FLAG_PROGRAM_READ | + ZE_IMAGE_FLAG_PROGRAM_WRITE), + ZeImageType, + ZeFormatDesc, + pi_cast(ImageDesc->image_width), + pi_cast(ImageDesc->image_height), + pi_cast(ImageDesc->image_depth), + pi_cast(ImageDesc->image_array_size), + ImageDesc->num_mip_levels}; + + ze_image_handle_t ZeHImage; + ZE_CALL(zeImageCreate(Context->Device->ZeDevice, &ZeImageDesc, &ZeHImage)); + + auto HostPtrOrNull = + (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? 
pi_cast(HostPtr) : nullptr;
+  auto ZePIImage =
+      new _pi_image(Context->Device->Platform, ZeHImage, HostPtrOrNull);
+
+  // The descriptor is only kept on the image object in builds with asserts.
+#ifndef NDEBUG
+  ZePIImage->ZeImageDesc = ZeImageDesc;
+#endif // !NDEBUG
+
+  if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
+      (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
+    // Initialize image synchronously with immediate offload
+    ZE_CALL(zeCommandListAppendImageCopyFromMemory(
+        Context->Device->ZeCommandListInit, ZeHImage, HostPtr, nullptr,
+        nullptr));
+  }
+
+  *RetImage = ZePIImage;
+  return PI_SUCCESS;
+}
+
+// Not supported: unconditionally calls die(); no parameter is used.
+pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) {
+  die("piextMemGetNativeHandle: not supported");
+  return PI_SUCCESS;
+}
+
+// Not supported: unconditionally calls die(); no parameter is used.
+pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
+                                         pi_mem *Mem) {
+  die("piextMemCreateWithNativeHandle: not supported");
+  return PI_SUCCESS;
+}
+
+// Creates a PI program from Length bytes of SPIR-V at IL.
+//
+// The L0 module is created for the context's device right here, with no
+// build flags and no build log, and is wrapped in a new _pi_program that is
+// returned through *Program.
+pi_result piProgramCreate(pi_context Context, const void *IL, size_t Length,
+                          pi_program *Program) {
+
+  assert(Context);
+  assert(Program);
+  ze_device_handle_t ZeDevice = Context->Device->ZeDevice;
+
+  ze_module_desc_t ZeModuleDesc = {};
+  ZeModuleDesc.version = ZE_MODULE_DESC_VERSION_CURRENT;
+  ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV;
+  ZeModuleDesc.inputSize = Length;
+  // NOTE(review): pi_cast appears without template arguments here and
+  // below; they seem to have been lost from this copy of the patch.
+  ZeModuleDesc.pInputModule = pi_cast(IL);
+  ZeModuleDesc.pBuildFlags = nullptr;
+
+  ze_module_handle_t ZeModule;
+  ZE_CALL(zeModuleCreate(ZeDevice, &ZeModuleDesc, &ZeModule,
+                         0)); // TODO: handle build log
+
+  auto ZePiProgram = new _pi_program(ZeModule, Context);
+  *Program = pi_cast(ZePiProgram);
+  return PI_SUCCESS;
+}
+
+// Creates a PI program from a native device binary. Exactly one device -
+// the context's device - and one non-empty binary are expected; this is
+// enforced with asserts only.
+pi_result piclProgramCreateWithBinary(pi_context Context, pi_uint32 NumDevices,
+                                      const pi_device *DeviceList,
+                                      const size_t *Lengths,
+                                      const unsigned char **Binaries,
+                                      pi_int32 *BinaryStatus,
+                                      pi_program *RetProgram) {
+
+  // This must be for the single device in this context.
+  assert(NumDevices == 1);
+  assert(Context);
+  assert(RetProgram);
+  assert(DeviceList && DeviceList[0] == Context->Device);
+  ze_device_handle_t ZeDevice = Context->Device->ZeDevice;
+
+  // Check the binary too.
+  assert(Lengths && Lengths[0] != 0);
+  assert(Binaries && Binaries[0] != nullptr);
+  size_t Length = Lengths[0];
+  // NOTE(review): pi_cast appears without template arguments here and
+  // below; they seem to have been lost from this copy of the patch.
+  auto Binary = pi_cast(Binaries[0]);
+
+  // Same descriptor as for the SPIR-V path, except for the NATIVE format.
+  ze_module_desc_t ZeModuleDesc = {};
+  ZeModuleDesc.version = ZE_MODULE_DESC_VERSION_CURRENT;
+  ZeModuleDesc.format = ZE_MODULE_FORMAT_NATIVE;
+  ZeModuleDesc.inputSize = Length;
+  ZeModuleDesc.pInputModule = Binary;
+  ZeModuleDesc.pBuildFlags = nullptr;
+
+  ze_module_handle_t ZeModule;
+  ZE_CALL(zeModuleCreate(ZeDevice, &ZeModuleDesc, &ZeModule, 0));
+
+  auto ZePiProgram = new _pi_program(ZeModule, Context);
+  *RetProgram = pi_cast(ZePiProgram);
+
+  // The per-binary status is optional for the caller.
+  if (BinaryStatus) {
+    *BinaryStatus = PI_SUCCESS;
+  }
+  return PI_SUCCESS;
+}
+
+// Not supported: prints a diagnostic and returns PI_INVALID_OPERATION
+// without touching any parameter.
+pi_result piclProgramCreateWithSource(pi_context Context, pi_uint32 Count,
+                                      const char **Strings,
+                                      const size_t *Lengths,
+                                      pi_program *RetProgram) {
+
+  zePrint("piclProgramCreateWithSource: not supported in L0\n");
+  return PI_INVALID_OPERATION;
+}
+
+// Reports one property of a program: reference count, device count (always
+// 1), the device, native binary size, the native binary itself, the number
+// of kernels, or the ';'-separated kernel names. Any other ParamName ends up
+// in die().
+pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName,
+                           size_t ParamValueSize, void *ParamValue,
+                           size_t *ParamValueSizeRet) {
+
+  assert(Program);
+  switch (ParamName) {
+  case PI_PROGRAM_INFO_REFERENCE_COUNT:
+    SET_PARAM_VALUE(pi_uint32{Program->RefCount});
+    break;
+  case PI_PROGRAM_INFO_NUM_DEVICES:
+    // L0 Module is always for a single device.
+    SET_PARAM_VALUE(pi_uint32{1});
+    break;
+  case PI_PROGRAM_INFO_DEVICES:
+    SET_PARAM_VALUE(Program->Context->Device);
+    break;
+  case PI_PROGRAM_INFO_BINARY_SIZES: {
+    // Size-only query: a null buffer makes L0 report just the byte count.
+    size_t SzBinary = 0;
+    ZE_CALL(zeModuleGetNativeBinary(Program->ZeModule, &SzBinary, nullptr));
+    // This is an array of 1 element, initialize if it were scalar.
+ SET_PARAM_VALUE(size_t{SzBinary}); + break; + } + case PI_PROGRAM_INFO_BINARIES: { + size_t SzBinary = 0; + uint8_t **PBinary = pi_cast(ParamValue); + ZE_CALL(zeModuleGetNativeBinary(Program->ZeModule, &SzBinary, PBinary[0])); + break; + } + case PI_PROGRAM_INFO_NUM_KERNELS: { + uint32_t NumKernels = 0; + ZE_CALL(zeModuleGetKernelNames(Program->ZeModule, &NumKernels, nullptr)); + SET_PARAM_VALUE(size_t{NumKernels}); + break; + } + case PI_PROGRAM_INFO_KERNEL_NAMES: { + // There are extra allocations/copying here dictated by the difference + // in L0 and PI interfaces. + // + uint32_t Count = 0; + ZE_CALL(zeModuleGetKernelNames(Program->ZeModule, &Count, nullptr)); + char **PNames = new char *[Count]; + ZE_CALL(zeModuleGetKernelNames(Program->ZeModule, &Count, + const_cast(PNames))); + std::string PINames{""}; + for (uint32_t I = 0; I < Count; ++I) { + PINames += (I > 0 ? ";" : ""); + PINames += PNames[I]; + } + delete[] PNames; + SET_PARAM_VALUE_STR(PINames.c_str()); + break; + } + default: + die("piProgramGetInfo: not implemented"); + } + + return PI_SUCCESS; +} + +pi_result piProgramLink(pi_context Context, pi_uint32 NumDevices, + const pi_device *DeviceList, const char *Options, + pi_uint32 NumInputPrograms, + const pi_program *InputPrograms, + void (*PFnNotify)(pi_program Program, void *UserData), + void *UserData, pi_program *RetProgram) { + + // TODO: L0 builds the program at the time of piProgramCreate. + // But build options are not available at that time, so we must + // stop building it there, but move it here. The problem though + // is that this would mean moving zeModuleCreate here entirely, + // and so L0 module creation would be deferred until + // piProgramCompile/piProgramLink/piProgramBuild. 
+ // + assert(NumInputPrograms == 1 && InputPrograms); + assert(RetProgram); + *RetProgram = InputPrograms[0]; + return PI_SUCCESS; +} + +pi_result piProgramCompile( + pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList, + const char *Options, pi_uint32 NumInputHeaders, + const pi_program *InputHeaders, const char **HeaderIncludeNames, + void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) { + + // TODO: L0 builds the program at the time of piProgramCreate. + // But build options are not available at that time, so we must + // stop building it there, but move it here. The problem though + // is that this would mean moving zeModuleCreate here entirely, + // and so L0 module creation would be deferred until + // piProgramCompile/piProgramLink/piProgramBuild. + // + return PI_SUCCESS; +} + +pi_result piProgramBuild(pi_program Program, pi_uint32 NumDevices, + const pi_device *DeviceList, const char *Options, + void (*PFnNotify)(pi_program Program, void *UserData), + void *UserData) { + + // TODO: L0 builds the program at the time of piProgramCreate. + // But build options are not available at that time, so we must + // stop building it there, but move it here. The problem though + // is that this would mean moving zeModuleCreate here entirely, + // and so L0 module creation would be deferred until + // piProgramCompile/piProgramLink/piProgramBuild. + // + return PI_SUCCESS; +} + +pi_result piProgramGetBuildInfo(pi_program Program, pi_device Device, + cl_program_build_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + if (ParamName == CL_PROGRAM_BINARY_TYPE) { + // TODO: is this the only supported binary type in L0? + // We should probably return CL_PROGRAM_BINARY_TYPE_NONE if asked + // before the program was compiled. 
+ // + SET_PARAM_VALUE(cl_program_binary_type{CL_PROGRAM_BINARY_TYPE_EXECUTABLE}); + } else if (ParamName == CL_PROGRAM_BUILD_OPTIONS) { + // TODO: how to get module build options out of L0? + // For the programs that we compiled we can remember the options + // passed with piProgramCompile/piProgramBuild, but what can we + // return for programs that were built outside and registered + // with piProgramRegister? + // + SET_PARAM_VALUE_STR(""); + } else { + zePrint("piProgramGetBuildInfo: unsupported ParamName\n"); + return PI_INVALID_VALUE; + } + return PI_SUCCESS; +} + +pi_result piProgramRetain(pi_program Program) { + assert(Program); + ++(Program->RefCount); + return PI_SUCCESS; +} + +pi_result piProgramRelease(pi_program Program) { + assert(Program); + if (--(Program->RefCount) == 0) { + // TODO: call zeModuleDestroy for non-interop L0 modules + delete Program; + } + return PI_SUCCESS; +} + +pi_result piextProgramGetNativeHandle(pi_program Program, + pi_native_handle *NativeHandle) { + die("piextProgramGetNativeHandle: not supported"); + return PI_SUCCESS; +} + +pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_program *Program) { + die("piextProgramCreateWithNativeHandle: not supported"); + return PI_SUCCESS; +} + +pi_result piKernelCreate(pi_program Program, const char *KernelName, + pi_kernel *RetKernel) { + + assert(Program); + assert(RetKernel); + assert(KernelName); + ze_kernel_desc_t ZeKernelDesc = {}; + ZeKernelDesc.version = ZE_KERNEL_DESC_VERSION_CURRENT; + ZeKernelDesc.flags = ZE_KERNEL_FLAG_NONE; + ZeKernelDesc.pKernelName = KernelName; + + ze_kernel_handle_t ZeKernel; + ZE_CALL(zeKernelCreate(pi_cast(Program->ZeModule), + &ZeKernelDesc, &ZeKernel)); + + auto ZePiKernel = new _pi_kernel(ZeKernel, Program); + *RetKernel = pi_cast(ZePiKernel); + return PI_SUCCESS; +} + +pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize, + const void *ArgValue) { + + // OpenCL: "the arg_value pointer can be 
NULL or point to a NULL value + // in which case a NULL value will be used as the value for the argument + // declared as a pointer to global or constant memory in the kernel" + // + // We don't know the type of the argument but it seems that the only time + // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument + // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL. + // + if (ArgSize == sizeof(void *) && ArgValue && + *(void **)(const_cast(ArgValue)) == nullptr) { + ArgValue = nullptr; + } + + assert(Kernel); + ZE_CALL(zeKernelSetArgumentValue( + pi_cast(Kernel->ZeKernel), + pi_cast(ArgIndex), pi_cast(ArgSize), + pi_cast(ArgValue))); + + return PI_SUCCESS; +} + +// Special version of piKernelSetArg to accept pi_mem and pi_sampler. +pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, + const pi_mem *ArgValue) { + // TODO: the better way would probably be to add a new PI API for + // extracting native PI object from PI handle, and have SYCL + // RT pass that directly to the regular piKernelSetArg (and + // then remove this piextKernelSetArgMemObj). 
+ // + + assert(Kernel); + ZE_CALL( + zeKernelSetArgumentValue(pi_cast(Kernel->ZeKernel), + pi_cast(ArgIndex), sizeof(void *), + (*ArgValue)->getZeHandlePtr())); + + return PI_SUCCESS; +} + +pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + assert(Kernel); + ze_kernel_properties_t ZeKernelProperties; + ZeKernelProperties.version = ZE_KERNEL_PROPERTIES_VERSION_CURRENT; + ZE_CALL(zeKernelGetProperties(Kernel->ZeKernel, &ZeKernelProperties)); + + switch (ParamName) { + case PI_KERNEL_INFO_CONTEXT: + SET_PARAM_VALUE(pi_context{Kernel->Program->Context}); + break; + case PI_KERNEL_INFO_PROGRAM: + SET_PARAM_VALUE(pi_program{Kernel->Program}); + break; + case PI_KERNEL_INFO_FUNCTION_NAME: + SET_PARAM_VALUE_STR(ZeKernelProperties.name); + break; + case PI_KERNEL_INFO_NUM_ARGS: + SET_PARAM_VALUE(pi_uint32{ZeKernelProperties.numKernelArgs}); + break; + case PI_KERNEL_INFO_REFERENCE_COUNT: + SET_PARAM_VALUE(pi_uint32{Kernel->RefCount}); + break; + case PI_KERNEL_INFO_ATTRIBUTES: { + uint32_t Size; + ZE_CALL(zeKernelGetAttribute( + Kernel->ZeKernel, ZE_KERNEL_ATTR_SOURCE_ATTRIBUTE, &Size, nullptr)); + char *attributes = new char[Size]; + ZE_CALL(zeKernelGetAttribute( + Kernel->ZeKernel, ZE_KERNEL_ATTR_SOURCE_ATTRIBUTE, &Size, attributes)); + SET_PARAM_VALUE_STR(attributes); + delete[] attributes; + break; + } + default: + zePrint("Unsupported ParamName in piKernelGetInfo: ParamName=%d(0x%x)\n", + ParamName, ParamName); + return PI_INVALID_VALUE; + } + + return PI_SUCCESS; +} + +pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, + pi_kernel_group_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + assert(Kernel); + assert(Device); + ze_device_handle_t ZeDevice = Device->ZeDevice; + ze_device_compute_properties_t ZeDeviceComputeProperties; + ZeDeviceComputeProperties.version = + ZE_DEVICE_COMPUTE_PROPERTIES_VERSION_CURRENT; + 
ZE_CALL(zeDeviceGetComputeProperties(ZeDevice, &ZeDeviceComputeProperties)); + + ze_kernel_properties_t ZeKernelProperties; + ZeKernelProperties.version = ZE_KERNEL_PROPERTIES_VERSION_CURRENT; + ZE_CALL(zeKernelGetProperties(Kernel->ZeKernel, &ZeKernelProperties)); + + switch (ParamName) { + case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + // TODO: To revisit after level_zero/issues/262 is resolved + struct { + size_t Arr[3]; + } WorkSize = {{ZeDeviceComputeProperties.maxGroupSizeX, + ZeDeviceComputeProperties.maxGroupSizeY, + ZeDeviceComputeProperties.maxGroupSizeZ}}; + SET_PARAM_VALUE(WorkSize); + break; + } + case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + uint32_t X, Y, Z; + ZE_CALL(zeKernelSuggestGroupSize(Kernel->ZeKernel, 10000, 10000, 10000, &X, + &Y, &Z)); + SET_PARAM_VALUE(size_t{X * Y * Z}); + break; + } + case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + struct { + size_t Arr[3]; + } WgSize = {{ZeKernelProperties.requiredGroupSizeX, + ZeKernelProperties.requiredGroupSizeY, + ZeKernelProperties.requiredGroupSizeZ}}; + SET_PARAM_VALUE(WgSize); + break; + } + case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // TODO: Assume 0 for now, replace with ze_kernel_properties_t::localMemSize + // once released in RT. + SET_PARAM_VALUE(pi_uint32{0}); + break; + } + case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + ze_device_properties_t ZeDeviceProperties; + ZeDeviceProperties.version = ZE_DEVICE_PROPERTIES_VERSION_CURRENT; + ZE_CALL(zeDeviceGetProperties(ZeDevice, &ZeDeviceProperties)); + + SET_PARAM_VALUE(size_t{ZeDeviceProperties.physicalEUSimdWidth}); + break; + } + case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: + // TODO: Assume 0 for now, replace with + // ze_kernel_properties_t::privateMemSize once released in RT. 
+ SET_PARAM_VALUE(pi_uint32{0}); + break; + default: + zePrint("Unknown ParamName in piKernelGetGroupInfo: ParamName=%d(0x%x)\n", + ParamName, ParamName); + return PI_INVALID_VALUE; + } + return PI_SUCCESS; +} + +pi_result piKernelGetSubGroupInfo( + pi_kernel Kernel, pi_device Device, + pi_kernel_sub_group_info ParamName, // TODO: untie from OpenCL + size_t InputValueSize, const void *InputValue, size_t ParamValueSize, + void *ParamValue, size_t *ParamValueSizeRet) { + + die("piKernelGetSubGroupInfo: not implemented"); + return {}; +} + +pi_result piKernelRetain(pi_kernel Kernel) { + + assert(Kernel); + ++(Kernel->RefCount); + return PI_SUCCESS; +} + +pi_result piKernelRelease(pi_kernel Kernel) { + + assert(Kernel); + if (--(Kernel->RefCount) == 0) { + delete Kernel; + } + return PI_SUCCESS; +} + +pi_result +piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + assert(Kernel); + assert(Queue); + assert(WorkDim > 0); + assert(WorkDim < 4); + + ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]; + + // global_work_size of unused dimensions must be set to 1 + if (WorkDim < 3) { + assert(GlobalWorkSize[2] == 1); + } + if (WorkDim < 2) { + assert(GlobalWorkSize[1] == 1); + } + if (LocalWorkSize) { + WG[0] = pi_cast(LocalWorkSize[0]); + WG[1] = pi_cast(LocalWorkSize[1]); + WG[2] = pi_cast(LocalWorkSize[2]); + } else { + ZE_CALL(zeKernelSuggestGroupSize(Kernel->ZeKernel, GlobalWorkSize[0], + GlobalWorkSize[1], GlobalWorkSize[2], + &WG[0], &WG[1], &WG[2])); + } + + // TODO: assert if sizes do not fit into 32-bit? 
+ switch (WorkDim) { + case 3: + ZeThreadGroupDimensions.groupCountX = + pi_cast(GlobalWorkSize[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + pi_cast(GlobalWorkSize[1] / WG[1]); + ZeThreadGroupDimensions.groupCountZ = + pi_cast(GlobalWorkSize[2] / WG[2]); + break; + case 2: + ZeThreadGroupDimensions.groupCountX = + pi_cast(GlobalWorkSize[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + pi_cast(GlobalWorkSize[1] / WG[1]); + WG[2] = 1; + break; + case 1: + ZeThreadGroupDimensions.groupCountX = + pi_cast(GlobalWorkSize[0] / WG[0]); + WG[1] = WG[2] = 1; + break; + + default: + zePrint("piEnqueueKernelLaunch: unsupported work_dim\n"); + return PI_INVALID_VALUE; + } + + assert(GlobalWorkSize[0] == (ZeThreadGroupDimensions.groupCountX * WG[0])); + assert(GlobalWorkSize[1] == (ZeThreadGroupDimensions.groupCountY * WG[1])); + assert(GlobalWorkSize[2] == (ZeThreadGroupDimensions.groupCountZ * WG[2])); + + ZE_CALL(zeKernelSetGroupSize(Kernel->ZeKernel, WG[0], WG[1], WG[2])); + + // Get a new command list to be used on this call + ze_command_list_handle_t ZeCommandList = nullptr; + if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) + return Res; + + auto Res = piEventCreate(Kernel->Program->Context, Event); + if (Res != PI_SUCCESS) + return Res; + + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL; + (*Event)->ZeCommandList = ZeCommandList; + + ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + + ze_event_handle_t *ZeEventWaitList = + _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); + + // Add the command to the command list + ZE_CALL(zeCommandListAppendLaunchKernel( + ZeCommandList, Kernel->ZeKernel, &ZeThreadGroupDimensions, ZeEvent, + NumEventsInWaitList, ZeEventWaitList)); + + zePrint("calling zeCommandListAppendLaunchKernel() with" + " ZeEvent %lx\n" + " NumEventsInWaitList %d:", + pi_cast(ZeEvent), NumEventsInWaitList); + for (pi_uint32 I = 0; I < NumEventsInWaitList; I++) { + zePrint(" 
%lx", pi_cast(ZeEventWaitList[I])); + } + zePrint("\n"); + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + // The temporary wait list is deleted before the result is checked so + // that it is not leaked when the submission fails. + auto ExecRes = Queue->executeCommandList(ZeCommandList); + _pi_event::deleteZeEventList(ZeEventWaitList); + if (ExecRes) + return ExecRes; + + return PI_SUCCESS; +} + +// +// Events +// +pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { + size_t Index = 0; + ze_event_pool_handle_t ZeEventPool = {}; + ZE_CALL(Context->getFreeSlotInExistingOrNewPool(ZeEventPool, Index)); + ze_event_handle_t ZeEvent; + ze_event_desc_t ZeEventDesc = {}; + ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_NONE; + ZeEventDesc.wait = ZE_EVENT_SCOPE_FLAG_NONE; + ZeEventDesc.version = ZE_EVENT_DESC_VERSION_CURRENT; + ZeEventDesc.index = Index; + + ZE_CALL(zeEventCreate(ZeEventPool, &ZeEventDesc, &ZeEvent)); + + *RetEvent = + new _pi_event(ZeEvent, ZeEventPool, Context, PI_COMMAND_TYPE_USER); + return PI_SUCCESS; +} + +pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + assert(Event); + switch (ParamName) { + case PI_EVENT_INFO_COMMAND_QUEUE: + SET_PARAM_VALUE(pi_queue{Event->Queue}); + break; + case PI_EVENT_INFO_CONTEXT: + // Read the context stored on the event itself: piEventCreate records a + // context but sets no queue, so going through Event->Queue may + // dereference a null queue for user events. + SET_PARAM_VALUE(pi_context{Event->Context}); + break; + case PI_EVENT_INFO_COMMAND_TYPE: + SET_PARAM_VALUE(pi_cast(Event->CommandType)); + break; + case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: { + ze_result_t ZeResult; + ZeResult = ZE_CALL_NOCHECK(zeEventQueryStatus(Event->ZeEvent)); + if (ZeResult == ZE_RESULT_SUCCESS) { + SET_PARAM_VALUE(pi_int32{CL_COMPLETE}); // Untie from OpenCL + } else { + // TODO: We don't know if the status is queued, submitted or running. + // For now return "running", as others are unlikely to be of + // interest.
+ SET_PARAM_VALUE(pi_int32{CL_RUNNING}); + } + break; + } + case PI_EVENT_INFO_REFERENCE_COUNT: + SET_PARAM_VALUE(pi_uint32{Event->RefCount}); + break; + default: + zePrint("Unsupported ParamName in piEventGetInfo: ParamName=%d(%x)\n", + ParamName, ParamName); + return PI_INVALID_VALUE; + } + + return PI_SUCCESS; +} + +pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + assert(Event); + uint64_t ZeTimerResolution = + Event->Queue->Context->Device->ZeDeviceProperties.timerResolution; + + if (ParamName == PI_PROFILING_INFO_COMMAND_START) { + uint64_t ContextStart; + ZE_CALL(zeEventGetTimestamp( + Event->ZeEvent, ZE_EVENT_TIMESTAMP_CONTEXT_START, &ContextStart)); + ContextStart *= ZeTimerResolution; + SET_PARAM_VALUE(uint64_t{ContextStart}); + } else if (ParamName == PI_PROFILING_INFO_COMMAND_END) { + uint64_t ContextEnd; + ZE_CALL(zeEventGetTimestamp(Event->ZeEvent, ZE_EVENT_TIMESTAMP_CONTEXT_END, + &ContextEnd)); + ContextEnd *= ZeTimerResolution; + SET_PARAM_VALUE(uint64_t{ContextEnd}); + } else if (ParamName == PI_PROFILING_INFO_COMMAND_QUEUED || + ParamName == PI_PROFILING_INFO_COMMAND_SUBMIT) { + // TODO: Support these when L0 supported is added. + SET_PARAM_VALUE(uint64_t{0}); + } else { + zePrint("piEventGetProfilingInfo: not supported ParamName\n"); + return PI_INVALID_VALUE; + } + + return PI_SUCCESS; +} + +pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { + ze_result_t ZeResult; + + if (NumEvents && !EventList) { + return PI_INVALID_EVENT; + } + + for (uint32_t I = 0; I < NumEvents; I++) { + ze_event_handle_t ZeEvent = EventList[I]->ZeEvent; + zePrint("ZeEvent = %lx\n", pi_cast(ZeEvent)); + // TODO: Using UINT32_MAX for timeout should have the desired + // effect of waiting until the event is trigerred, but it seems that + // it is causing an OS crash, so use an interruptable loop for now. 
+ // + do { + ZeResult = ZE_CALL_NOCHECK(zeEventHostSynchronize(ZeEvent, 100000)); + } while (ZeResult == ZE_RESULT_NOT_READY); + + // Check the result to be success. + ZE_CALL(ZeResult); + + // NOTE: we are destroying associated command lists here to free + // resources sooner in case RT is not calling piEventRelease soon enough. + // + if (EventList[I]->ZeCommandList) { + // Event has been signaled: Destroy the command list associated with the + // call that generated the event. + ZE_CALL(zeCommandListDestroy(EventList[I]->ZeCommandList)); + EventList[I]->ZeCommandList = nullptr; + } + } + return PI_SUCCESS; +} + +pi_result piEventSetCallback(pi_event Event, pi_int32 CommandExecCallbackType, + void (*PFnNotify)(pi_event Event, + pi_int32 EventCommandStatus, + void *UserData), + void *UserData) { + + // Increment the pi_event's reference counter to avoid destroying the event + // before all callbacks are executed. + piEventRetain(Event); + + // TODO: Can we support CL_SUBMITTED and CL_RUNNING? + // + if (CommandExecCallbackType != CL_COMPLETE) { + zePrint("piEventSetCallback: unsupported callback type\n"); + return PI_INVALID_VALUE; + } + + // Execute the wait and callback trigger in a side thread to not + // block the main host thread. + // TODO: We should use a single thread to serve all callbacks. + // + std::thread WaitThread( + [](pi_event Event, pi_int32 CommandExecCallbackType, + void (*PFnNotify)(pi_event Event, pi_int32 EventCommandStatus, + void *UserData), + void *UserData) { + // Implements the wait for the event to complete. + assert(CommandExecCallbackType == CL_COMPLETE); + assert(Event); + ze_result_t ZeResult; + do { + ZeResult = + ZE_CALL_NOCHECK(zeEventHostSynchronize(Event->ZeEvent, 10000)); + } while (ZeResult == ZE_RESULT_NOT_READY); + + // Call the callback. 
+ PFnNotify(Event, CommandExecCallbackType, UserData); + piEventRelease(Event); + }, + Event, CommandExecCallbackType, PFnNotify, UserData); + + WaitThread.detach(); + return PI_SUCCESS; +} + +pi_result piEventSetStatus(pi_event Event, pi_int32 ExecutionStatus) { + if (ExecutionStatus != CL_COMPLETE) { + die("piEventSetStatus: not implemented"); + } + + assert(Event); + ze_result_t ZeResult; + ze_event_handle_t ZeEvent = Event->ZeEvent; + + ZeResult = ZE_CALL_NOCHECK(zeEventQueryStatus(ZeEvent)); + // It can be that the status is already what we need it to be. + if (ZeResult != ZE_RESULT_SUCCESS) { + ZE_CALL(zeEventHostSignal(ZeEvent)); + ZE_CALL(zeEventQueryStatus(ZeEvent)); // double check + } + return PI_SUCCESS; +} + +// Adds one reference to the event; paired with piEventRelease. +pi_result piEventRetain(pi_event Event) { + // Same null-handle check as the other retain/release entry points. + assert(Event); + ++(Event->RefCount); + return PI_SUCCESS; +} + +pi_result piEventRelease(pi_event Event) { + assert(Event); + if (--(Event->RefCount) == 0) { + if (Event->ZeCommandList) { + // Destroy the command list associated with the call that generated + // the event. + // + ZE_CALL(zeCommandListDestroy(Event->ZeCommandList)); + Event->ZeCommandList = nullptr; + } + if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP && + Event->CommandData) { + // Free the memory allocated in the piEnqueueMemBufferMap.
+ ZE_CALL(zeDriverFreeMem(Event->Queue->Context->Device->Platform->ZeDriver, + Event->CommandData)); + Event->CommandData = nullptr; + } + ZE_CALL(zeEventDestroy(Event->ZeEvent)); + + auto Context = Event->Context; + ZE_CALL(Context->decrementAliveEventsInPool(Event->ZeEventPool)); + + delete Event; + } + return PI_SUCCESS; +} + +pi_result piextEventGetNativeHandle(pi_event Event, + pi_native_handle *NativeHandle) { + die("piextEventGetNativeHandle: not supported"); + return PI_SUCCESS; +} + +pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_event *Event) { + die("piextEventCreateWithNativeHandle: not supported"); + return PI_SUCCESS; +} + +// +// Sampler +// +pi_result piSamplerCreate(pi_context Context, + const pi_sampler_properties *SamplerProperties, + pi_sampler *RetSampler) { + + assert(Context); + assert(RetSampler); + + ze_device_handle_t ZeDevice = Context->Device->ZeDevice; + + ze_sampler_handle_t ZeSampler; + ze_sampler_desc_t ZeSamplerDesc = {}; + ZeSamplerDesc.version = ZE_SAMPLER_DESC_VERSION_CURRENT; + + // Set the default values for the ZeSamplerDesc. + ZeSamplerDesc.isNormalized = PI_TRUE; + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP; + ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; + + // Update the values of the ZeSamplerDesc from the pi_sampler_properties list. 
+ // Default values will be used if any of the following is true: + // a) SamplerProperties list is NULL + // b) SamplerProperties list is missing any properties + + if (SamplerProperties) { + const pi_sampler_properties *CurProperty = SamplerProperties; + + while (*CurProperty != 0) { + switch (*CurProperty) { + case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { + pi_bool CurValueBool = pi_cast(*(++CurProperty)); + + if (CurValueBool == PI_TRUE) + ZeSamplerDesc.isNormalized = PI_TRUE; + else if (CurValueBool == PI_FALSE) + ZeSamplerDesc.isNormalized = PI_FALSE; + else { + zePrint("piSamplerCreate: unsupported " + "PI_SAMPLER_NORMALIZED_COORDS value\n"); + return PI_INVALID_VALUE; + } + } break; + + case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: { + pi_sampler_addressing_mode CurValueAddressingMode = + pi_cast( + pi_cast(*(++CurProperty))); + + // TODO: add support for PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE + switch (CurValueAddressingMode) { + case PI_SAMPLER_ADDRESSING_MODE_NONE: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE; + break; + case PI_SAMPLER_ADDRESSING_MODE_REPEAT: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT; + break; + case PI_SAMPLER_ADDRESSING_MODE_CLAMP: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP; + break; + case PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + break; + case PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR; + break; + default: + zePrint("piSamplerCreate: unsupported PI_SAMPLER_ADDRESSING_MODE " + "value\n"); + zePrint("PI_SAMPLER_ADDRESSING_MODE=%d\n", CurValueAddressingMode); + return PI_INVALID_VALUE; + } + } break; + + case PI_SAMPLER_PROPERTIES_FILTER_MODE: { + pi_sampler_filter_mode CurValueFilterMode = + pi_cast( + pi_cast(*(++CurProperty))); + + if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_NEAREST) + ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; 
+ else if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_LINEAR) + ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR; + else { + zePrint("PI_SAMPLER_FILTER_MODE=%d\n", CurValueFilterMode); + zePrint( + "piSamplerCreate: unsupported PI_SAMPLER_FILTER_MODE value\n"); + return PI_INVALID_VALUE; + } + } break; + + default: + break; + } + CurProperty++; + } + } + + ZE_CALL(zeSamplerCreate(ZeDevice, + &ZeSamplerDesc, // TODO: translate properties + &ZeSampler)); + + *RetSampler = new _pi_sampler(ZeSampler); + return PI_SUCCESS; +} + +pi_result piSamplerGetInfo(pi_sampler Sampler, pi_sampler_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + die("piSamplerGetInfo: not implemented"); + return {}; +} + +pi_result piSamplerRetain(pi_sampler Sampler) { + assert(Sampler); + ++(Sampler->RefCount); + return PI_SUCCESS; +} + +pi_result piSamplerRelease(pi_sampler Sampler) { + assert(Sampler); + if (--(Sampler->RefCount) == 0) { + ZE_CALL(zeSamplerDestroy(Sampler->ZeSampler)); + delete Sampler; + } + return PI_SUCCESS; +} + +// +// Queue Commands +// +pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + die("piEnqueueEventsWait: not implemented"); + return {}; +} + +pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, + pi_bool BlockingRead, size_t Offset, + size_t Size, void *Dst, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + assert(Src); + return enqueueMemCopyHelper(PI_COMMAND_TYPE_MEM_BUFFER_READ, Queue, Dst, + BlockingRead, Size, + pi_cast(Src->getZeHandle()) + Offset, + NumEventsInWaitList, EventWaitList, Event); +} + +pi_result piEnqueueMemBufferReadRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingRead, + const size_t *BufferOffset, const size_t *HostOffset, const size_t *Region, + size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, + size_t HostSlicePitch, void *Ptr, 
pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + assert(Buffer); + return enqueueMemCopyRectHelper( + PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, Queue, Buffer->getZeHandle(), + static_cast(Ptr), BufferOffset, HostOffset, Region, + BufferRowPitch, HostRowPitch, BufferSlicePitch, HostSlicePitch, + BlockingRead, NumEventsInWaitList, EventWaitList, Event); +} + +// Shared by all memory read/write/copy PI interfaces. +static pi_result +enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, + pi_bool BlockingWrite, size_t Size, const void *Src, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + assert(Queue); + // Get a new command list to be used on this call + ze_command_list_handle_t ZeCommandList = nullptr; + if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) + return Res; + + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; + + (*Event)->Queue = Queue; + (*Event)->CommandType = CommandType; + (*Event)->ZeCommandList = ZeCommandList; + + ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + + ze_event_handle_t *ZeEventWaitList = + _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); + + ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList, + ZeEventWaitList)); + + ZE_CALL( + zeCommandListAppendMemoryCopy(ZeCommandList, Dst, Src, Size, ZeEvent)); + + if (auto Res = Queue->executeCommandList(ZeCommandList, BlockingWrite)) + return Res; + + zePrint("calling zeCommandListAppendMemoryCopy() with\n" + " xe_event %lx\n" + " NumEventsInWaitList %d:", + pi_cast(ZeEvent), NumEventsInWaitList); + for (pi_uint32 I = 0; I < NumEventsInWaitList; I++) { + zePrint(" %lx", pi_cast(ZeEventWaitList[I])); + } + zePrint("\n"); + + _pi_event::deleteZeEventList(ZeEventWaitList); + + return PI_SUCCESS; +} + +// Shared by all memory read/write/copy rect PI interfaces. 
+static pi_result enqueueMemCopyRectHelper( + pi_command_type CommandType, pi_queue Queue, void *SrcBuffer, + void *DstBuffer, const size_t *SrcOrigin, const size_t *DstOrigin, + const size_t *Region, size_t SrcRowPitch, size_t DstRowPitch, + size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { + + assert(Region); + assert(SrcOrigin); + assert(DstOrigin); + assert(Queue); + + // Get a new command list to be used on this call + ze_command_list_handle_t ZeCommandList = nullptr; + if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) + return Res; + + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; + + (*Event)->Queue = Queue; + (*Event)->CommandType = CommandType; + (*Event)->ZeCommandList = ZeCommandList; + + ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + + ze_event_handle_t *ZeEventWaitList = + _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); + + ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList, + ZeEventWaitList)); + + // The format string has a single %d, so pass only the wait-list count; + // the completion event is traced further down with the barrier. + zePrint("calling zeCommandListAppendWaitOnEvents() with\n" + " NumEventsInWaitList %d:", + NumEventsInWaitList); + for (pi_uint32 I = 0; I < NumEventsInWaitList; I++) { + zePrint(" %lx", pi_cast(ZeEventWaitList[I])); + } + zePrint("\n"); + + uint32_t SrcOriginX = pi_cast(SrcOrigin[0]); + uint32_t SrcOriginY = pi_cast(SrcOrigin[1]); + uint32_t SrcOriginZ = pi_cast(SrcOrigin[2]); + + // A zero row pitch means "tightly packed": fall back to the region width. + uint32_t SrcPitch = SrcRowPitch; + if (SrcPitch == 0) + SrcPitch = pi_cast(Region[0]); + + if (SrcSlicePitch == 0) + SrcSlicePitch = pi_cast(Region[1]) * SrcPitch; + + uint32_t DstOriginX = pi_cast(DstOrigin[0]); + uint32_t DstOriginY = pi_cast(DstOrigin[1]); + uint32_t DstOriginZ = pi_cast(DstOrigin[2]); + + uint32_t DstPitch = DstRowPitch; + if (DstPitch == 0) + DstPitch = pi_cast(Region[0]); + + if (DstSlicePitch == 0) + DstSlicePitch = pi_cast(Region[1])
* DstPitch; + + uint32_t Width = pi_cast(Region[0]); + uint32_t Height = pi_cast(Region[1]); + uint32_t Depth = pi_cast(Region[2]); + + const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ, + Width, Height, Depth}; + const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, + Width, Height, Depth}; + + ZE_CALL(zeCommandListAppendMemoryCopyRegion( + ZeCommandList, DstBuffer, &ZeDstRegion, DstPitch, DstSlicePitch, + SrcBuffer, &ZeSrcRegion, SrcPitch, SrcSlicePitch, nullptr)); + + zePrint("calling zeCommandListAppendMemoryCopyRegion()\n"); + + ZE_CALL(zeCommandListAppendBarrier(ZeCommandList, ZeEvent, 0, nullptr)); + + zePrint("calling zeCommandListAppendBarrier() with Event %lx\n", + pi_cast(ZeEvent)); + + if (auto Res = Queue->executeCommandList(ZeCommandList, Blocking)) + return Res; + + _pi_event::deleteZeEventList(ZeEventWaitList); + + return PI_SUCCESS; +} + +pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, + pi_bool BlockingWrite, size_t Offset, + size_t Size, const void *Ptr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + + assert(Buffer); + return enqueueMemCopyHelper(PI_COMMAND_TYPE_MEM_BUFFER_WRITE, Queue, + pi_cast(Buffer->getZeHandle()) + + Offset, // dst + BlockingWrite, Size, + Ptr, // src + NumEventsInWaitList, EventWaitList, Event); +} + +pi_result piEnqueueMemBufferWriteRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, + const size_t *BufferOffset, const size_t *HostOffset, const size_t *Region, + size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, + size_t HostSlicePitch, const void *Ptr, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + assert(Buffer); + return enqueueMemCopyRectHelper( + PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, Queue, + const_cast(static_cast(Ptr)), Buffer->getZeHandle(), + HostOffset, BufferOffset, Region, HostRowPitch, BufferRowPitch, + HostSlicePitch, BufferSlicePitch, 
+      BlockingWrite, NumEventsInWaitList,
+      EventWaitList, Event);
+}
+
+// Copies Size bytes from SrcBuffer (at SrcOffset) to DstBuffer (at
+// DstOffset) through enqueueMemCopyHelper; always submitted non-blocking.
+pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcBuffer,
+                                 pi_mem DstBuffer, size_t SrcOffset,
+                                 size_t DstOffset, size_t Size,
+                                 pi_uint32 NumEventsInWaitList,
+                                 const pi_event *EventWaitList,
+                                 pi_event *Event) {
+
+  assert(SrcBuffer);
+  assert(DstBuffer);
+  return enqueueMemCopyHelper(
+      PI_COMMAND_TYPE_MEM_BUFFER_COPY, Queue,
+      pi_cast<char *>(DstBuffer->getZeHandle()) + DstOffset,
+      false, // blocking
+      Size, pi_cast<char *>(SrcBuffer->getZeHandle()) + SrcOffset,
+      NumEventsInWaitList, EventWaitList, Event);
+}
+
+// Rectangular buffer-to-buffer copy through enqueueMemCopyRectHelper; always
+// submitted non-blocking.
+pi_result
+piEnqueueMemBufferCopyRect(pi_queue Queue, pi_mem SrcBuffer, pi_mem DstBuffer,
+                           const size_t *SrcOrigin, const size_t *DstOrigin,
+                           const size_t *Region, size_t SrcRowPitch,
+                           size_t SrcSlicePitch, size_t DstRowPitch,
+                           size_t DstSlicePitch, pi_uint32 NumEventsInWaitList,
+                           const pi_event *EventWaitList, pi_event *Event) {
+
+  assert(SrcBuffer);
+  assert(DstBuffer);
+  return enqueueMemCopyRectHelper(
+      PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, Queue, SrcBuffer->getZeHandle(),
+      DstBuffer->getZeHandle(), SrcOrigin, DstOrigin, Region, SrcRowPitch,
+      DstRowPitch, SrcSlicePitch, DstSlicePitch,
+      false, // blocking
+      NumEventsInWaitList, EventWaitList, Event);
+}
+
+// Helper shared by piEnqueueMemBufferFill and piextUSMEnqueueMemset.
+// On a newly created command list it appends a wait on EventWaitList and a
+// zeCommandListAppendMemoryFill of Size bytes at Ptr using Pattern, which
+// signals the event created for this call (returned in *Event). The command
+// list is executed without blocking.
+static pi_result
+enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr,
+                     const void *Pattern, size_t PatternSize, size_t Size,
+                     pi_uint32 NumEventsInWaitList,
+                     const pi_event *EventWaitList, pi_event *Event) {
+
+  assert(Queue);
+  // Get a new command list to be used on this call
+  ze_command_list_handle_t ZeCommandList = nullptr;
+  if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList))
+    return Res;
+
+  // Create the event exactly once: a second piEventCreate on the same
+  // out-parameter would overwrite *Event and leak the first event.
+  auto Res = piEventCreate(Queue->Context, Event);
+  if (Res != PI_SUCCESS)
+    return Res;
+
+  (*Event)->Queue = Queue;
+  (*Event)->CommandType = CommandType;
+  (*Event)->ZeCommandList = ZeCommandList;
+
+  ze_event_handle_t ZeEvent = (*Event)->ZeEvent;
+
+  ze_event_handle_t *ZeEventWaitList =
+      _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
+
+  ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
+                                          ZeEventWaitList));
+
+  // Pattern size must be a power of two
+  assert((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0));
+
+  ZE_CALL(zeCommandListAppendMemoryFill(ZeCommandList, Ptr, Pattern,
+                                        PatternSize, Size, ZeEvent));
+
+  zePrint("calling zeCommandListAppendMemoryFill() with\n"
+          " xe_event %lx\n"
+          " NumEventsInWaitList %d:",
+          pi_cast<std::uintptr_t>(ZeEvent), NumEventsInWaitList);
+  for (pi_uint32 I = 0; I < NumEventsInWaitList; I++) {
+    zePrint(" %lx", pi_cast<std::uintptr_t>(ZeEventWaitList[I]));
+  }
+  zePrint("\n");
+
+  // Execute command list asynchronously, as the event will be used
+  // to track down its completion. The wait list is released on both the
+  // success and the failure path so it is not leaked when submission fails.
+  pi_result ExecRes = Queue->executeCommandList(ZeCommandList);
+  _pi_event::deleteZeEventList(ZeEventWaitList);
+  if (ExecRes != PI_SUCCESS)
+    return ExecRes;
+
+  return PI_SUCCESS;
+}
+
+// Fills Size bytes of Buffer, starting at byte Offset, with Pattern by
+// delegating to enqueueMemFillHelper.
+pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer,
+                                 const void *Pattern, size_t PatternSize,
+                                 size_t Offset, size_t Size,
+                                 pi_uint32 NumEventsInWaitList,
+                                 const pi_event *EventWaitList,
+                                 pi_event *Event) {
+
+  assert(Buffer);
+  return enqueueMemFillHelper(PI_COMMAND_TYPE_MEM_BUFFER_FILL, Queue,
+                              pi_cast<char *>(Buffer->getZeHandle()) + Offset,
+                              Pattern, PatternSize, Size, NumEventsInWaitList,
+                              EventWaitList, Event);
+}
+
+pi_result
+piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
+                      cl_map_flags MapFlags, // TODO: untie from OpenCL
+                      size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList,
+                      const pi_event *EventWaitList, pi_event *Event,
+                      void **RetMap) {
+
+  // TODO: we don't implement read-only or write-only, always read-write.
+ // assert((map_flags & CL_MAP_READ) != 0); + // assert((map_flags & CL_MAP_WRITE) != 0); + assert(Queue); + assert(Buffer); + + // Get a new command list to be used on this call + ze_command_list_handle_t ZeCommandList = nullptr; + if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) + return Res; + + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; + + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_MEM_BUFFER_MAP; + (*Event)->ZeCommandList = ZeCommandList; + + ze_event_handle_t *ZeEventWaitList = + _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); + + ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList, + ZeEventWaitList)); + + // TODO: L0 is missing the memory "mapping" capabilities, so we are left + // to doing new memory allocation and a copy (read). + // + // TODO: check if the input buffer is already allocated in shared + // memory and thus is accessible from the host as is. Can we get SYCL RT + // to predict/allocate in shared memory from the beginning? + // + if (Buffer->MapHostPtr) { + // NOTE: borrowing below semantics from OpenCL as SYCL RT relies on it. + // It is also better for performance. + // + // "If the buffer object is created with CL_MEM_USE_HOST_PTR set in + // mem_flags, the following will be true: + // - The host_ptr specified in clCreateBuffer is guaranteed to contain the + // latest bits in the region being mapped when the clEnqueueMapBuffer + // command has completed. + // - The pointer value returned by clEnqueueMapBuffer will be derived from + // the host_ptr specified when the buffer object is created." 
+ // + *RetMap = Buffer->MapHostPtr + Offset; + } else { + ze_host_mem_alloc_desc_t ZeDesc = {}; + ZeDesc.flags = ZE_HOST_MEM_ALLOC_FLAG_DEFAULT; + ZE_CALL(zeDriverAllocHostMem(Queue->Context->Device->Platform->ZeDriver, + &ZeDesc, Size, + 1, // TODO: alignment + RetMap)); + } + + ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + ZE_CALL(zeCommandListAppendMemoryCopy( + ZeCommandList, *RetMap, pi_cast(Buffer->getZeHandle()) + Offset, + Size, ZeEvent)); + + if (auto Res = Queue->executeCommandList(ZeCommandList, BlockingMap)) + return Res; + + _pi_event::deleteZeEventList(ZeEventWaitList); + + return Buffer->addMapping(*RetMap, Offset, Size); +} + +pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + // Get a new command list to be used on this call + ze_command_list_handle_t ZeCommandList = nullptr; + if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) + return Res; + + // TODO: handle the case when user does not care to follow the event + // of unmap completion. + // + assert(Event); + + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; + + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_MEM_BUFFER_UNMAP; + (*Event)->ZeCommandList = ZeCommandList; + + ze_event_handle_t *ZeEventWaitList = + _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); + + ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList, + ZeEventWaitList)); + + // TODO: L0 is missing the memory "mapping" capabilities, so we are left + // to doing copy (write back to the device). + // + // NOTE: Keep this in sync with the implementation of + // piEnqueueMemBufferMap/piEnqueueMemImageMap. 
+ // + _pi_mem::Mapping MapInfo = {}; + if (pi_result Res = MemObj->removeMapping(MappedPtr, MapInfo)) + return Res; + + ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + ZE_CALL(zeCommandListAppendMemoryCopy( + ZeCommandList, pi_cast(MemObj->getZeHandle()) + MapInfo.Offset, + MappedPtr, MapInfo.Size, ZeEvent)); + + // NOTE: we still have to free the host memory allocated/returned by + // piEnqueueMemBufferMap, but can only do so after the above copy + // is completed. Instead of waiting for It here (blocking), we shall + // do so in piEventRelease called for the pi_event tracking the unmap. + (*Event)->CommandData = MemObj->MapHostPtr ? nullptr : MappedPtr; + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + if (auto Res = Queue->executeCommandList(ZeCommandList)) + return Res; + + _pi_event::deleteZeEventList(ZeEventWaitList); + + return PI_SUCCESS; +} + +pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + die("piMemImageGetInfo: not implemented"); + return {}; +} + +static ze_image_region_t getImageRegionHelper(pi_mem Mem, const size_t *Origin, + const size_t *Region) { + + assert(Mem && Origin); +#ifndef NDEBUG + assert(Mem->isImage()); + auto Image = static_cast<_pi_image *>(Mem); + ze_image_desc_t ZeImageDesc = Image->ZeImageDesc; +#endif // !NDEBUG + + assert((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Origin[1] == 0 && + Origin[2] == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Origin[2] == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Origin[2] == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_3D)); + + uint32_t OriginX = pi_cast(Origin[0]); + uint32_t OriginY = pi_cast(Origin[1]); + uint32_t OriginZ = pi_cast(Origin[2]); + + assert(Region[0] && Region[1] && Region[2]); + assert((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Region[1] == 1 && + Region[2] == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && 
Region[2] == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Region[2] == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_3D)); + + uint32_t Width = pi_cast(Region[0]); + uint32_t Height = pi_cast(Region[1]); + uint32_t Depth = pi_cast(Region[2]); + + const ze_image_region_t ZeRegion = {OriginX, OriginY, OriginZ, + Width, Height, Depth}; + return ZeRegion; +} + +// Helper function to implement image read/write/copy. +static pi_result +enqueueMemImageCommandHelper(pi_command_type CommandType, pi_queue Queue, + const void *Src, // image or ptr + void *Dst, // image or ptr + pi_bool IsBlocking, const size_t *SrcOrigin, + const size_t *DstOrigin, const size_t *Region, + size_t RowPitch, size_t SlicePitch, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + assert(Queue); + // Get a new command list to be used on this call + ze_command_list_handle_t ZeCommandList = nullptr; + if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) + return Res; + + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; + + (*Event)->Queue = Queue; + (*Event)->CommandType = CommandType; + (*Event)->ZeCommandList = ZeCommandList; + + ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + + ze_event_handle_t *ZeEventWaitList = + _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); + + ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList, + ZeEventWaitList)); + + if (CommandType == PI_COMMAND_TYPE_IMAGE_READ) { + pi_mem SrcMem = pi_cast(const_cast(Src)); + + const ze_image_region_t ZeSrcRegion = + getImageRegionHelper(SrcMem, SrcOrigin, Region); + + // TODO: L0 does not support row_pitch/slice_pitch for images yet. + // Check that SYCL RT did not want pitch larger than default. 
+ // +#ifndef NDEBUG + assert(SrcMem->isImage()); + auto SrcImage = static_cast<_pi_image *>(SrcMem); + const ze_image_desc_t &ZeImageDesc = SrcImage->ZeImageDesc; + assert(RowPitch == 0 || + // special case RGBA image pitch equal to region's width + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 && + RowPitch == 4 * 4 * ZeSrcRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 && + RowPitch == 4 * 2 * ZeSrcRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && + RowPitch == 4 * ZeSrcRegion.width)); + assert(SlicePitch == 0 || SlicePitch == RowPitch * ZeSrcRegion.height); +#endif // !NDEBUG + + ZE_CALL(zeCommandListAppendImageCopyToMemory( + ZeCommandList, Dst, pi_cast(SrcMem->getZeHandle()), + &ZeSrcRegion, ZeEvent)); + } else if (CommandType == PI_COMMAND_TYPE_IMAGE_WRITE) { + pi_mem DstMem = pi_cast(Dst); + const ze_image_region_t ZeDstRegion = + getImageRegionHelper(DstMem, DstOrigin, Region); + + // TODO: L0 does not support row_pitch/slice_pitch for images yet. + // Check that SYCL RT did not want pitch larger than default. 
+ // +#ifndef NDEBUG + assert(DstMem->isImage()); + auto DstImage = static_cast<_pi_image *>(DstMem); + const ze_image_desc_t &ZeImageDesc = DstImage->ZeImageDesc; + assert(RowPitch == 0 || + // special case RGBA image pitch equal to region's width + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 && + RowPitch == 4 * 4 * ZeDstRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 && + RowPitch == 4 * 2 * ZeDstRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && + RowPitch == 4 * ZeDstRegion.width)); + assert(SlicePitch == 0 || SlicePitch == RowPitch * ZeDstRegion.height); +#endif // !NDEBUG + + ZE_CALL(zeCommandListAppendImageCopyFromMemory( + ZeCommandList, pi_cast(DstMem->getZeHandle()), Src, + &ZeDstRegion, ZeEvent)); + } else if (CommandType == PI_COMMAND_TYPE_IMAGE_COPY) { + pi_mem SrcImage = pi_cast(const_cast(Src)); + pi_mem DstImage = pi_cast(Dst); + + const ze_image_region_t ZeSrcRegion = + getImageRegionHelper(SrcImage, SrcOrigin, Region); + const ze_image_region_t ZeDstRegion = + getImageRegionHelper(DstImage, DstOrigin, Region); + + ZE_CALL(zeCommandListAppendImageCopyRegion( + ZeCommandList, pi_cast(DstImage->getZeHandle()), + pi_cast(SrcImage->getZeHandle()), &ZeDstRegion, + &ZeSrcRegion, ZeEvent)); + } else { + zePrint("enqueueMemImageUpdate: unsupported image command type\n"); + return PI_INVALID_OPERATION; + } + + if (auto Res = Queue->executeCommandList(ZeCommandList, IsBlocking)) + return Res; + + _pi_event::deleteZeEventList(ZeEventWaitList); + + return PI_SUCCESS; +} + +pi_result piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, + pi_bool BlockingRead, const size_t *Origin, + const size_t *Region, size_t RowPitch, + size_t SlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + + return enqueueMemImageCommandHelper( + PI_COMMAND_TYPE_IMAGE_READ, Queue, + Image, // src + Ptr, // dst + BlockingRead, + Origin, // 
SrcOrigin + nullptr, // DstOrigin + Region, RowPitch, SlicePitch, NumEventsInWaitList, EventWaitList, Event); +} + +pi_result piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, + pi_bool BlockingWrite, const size_t *Origin, + const size_t *Region, size_t InputRowPitch, + size_t InputSlicePitch, const void *Ptr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + + return enqueueMemImageCommandHelper(PI_COMMAND_TYPE_IMAGE_WRITE, Queue, + Ptr, // src + Image, // dst + BlockingWrite, + nullptr, // SrcOrigin + Origin, // DstOrigin + Region, InputRowPitch, InputSlicePitch, + NumEventsInWaitList, EventWaitList, + Event); +} + +pi_result piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, + pi_mem DstImage, const size_t *SrcOrigin, + const size_t *DstOrigin, const size_t *Region, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + + return enqueueMemImageCommandHelper( + PI_COMMAND_TYPE_IMAGE_COPY, Queue, SrcImage, DstImage, + false, // is_blocking + SrcOrigin, DstOrigin, Region, + 0, // row pitch + 0, // slice pitch + NumEventsInWaitList, EventWaitList, Event); +} + +pi_result piEnqueueMemImageFill(pi_queue Queue, pi_mem Image, + const void *FillColor, const size_t *Origin, + const size_t *Region, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + + die("piEnqueueMemImageFill: not implemented"); + return {}; +} + +pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, + pi_buffer_create_type BufferCreateType, + void *BufferCreateInfo, pi_mem *RetMem) { + + assert(Buffer && !Buffer->isImage()); + assert(Flags == PI_MEM_FLAGS_ACCESS_RW); + assert(BufferCreateType == PI_BUFFER_CREATE_TYPE_REGION); + assert(!(static_cast<_pi_buffer *>(Buffer))->isSubBuffer() && + "Sub-buffer cannot be partitioned"); + assert(BufferCreateInfo); + assert(RetMem); + + auto Region = (pi_buffer_region)BufferCreateInfo; + assert(Region->size != 0u && "Invalid size"); + 
assert(Region->origin <= (Region->origin + Region->size) && "Overflow"); + *RetMem = new _pi_buffer( + Buffer->Platform, + pi_cast(Buffer->getZeHandle()) + + Region->origin /* L0 memory handle */, + nullptr /* Host pointer */, Buffer /* Parent buffer */, + Region->origin /* Sub-buffer origin */, Region->size /*Sub-buffer size*/); + + return PI_SUCCESS; +} + +pi_result piEnqueueNativeKernel(pi_queue Queue, void (*UserFunc)(void *), + void *Args, size_t CbArgs, + pi_uint32 NumMemObjects, const pi_mem *MemList, + const void **ArgsMemLoc, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + + die("piEnqueueNativeKernel: not implemented"); + return {}; +} + +// TODO: Check if the function_pointer_ret type can be converted to void**. +pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, + const char *FunctionName, + pi_uint64 *FunctionPointerRet) { + assert(Program != nullptr); + ZE_CALL(zeModuleGetFunctionPointer( + Program->ZeModule, FunctionName, + reinterpret_cast(FunctionPointerRet))); + return PI_SUCCESS; +} + +pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, + pi_usm_mem_properties *Properties, size_t Size, + pi_uint32 Alignment) { + + assert(Context); + // Check that incorrect bits are not set in the properties. + assert(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS))); + + ze_host_mem_alloc_desc_t ZeDesc = {}; + ZeDesc.flags = ZE_HOST_MEM_ALLOC_FLAG_DEFAULT; + // TODO: translate PI properties to L0 flags + ZE_CALL(zeDriverAllocHostMem(Context->Device->Platform->ZeDriver, &ZeDesc, + Size, Alignment, ResultPtr)); + + return PI_SUCCESS; +} + +pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, size_t Size, + pi_uint32 Alignment) { + + assert(Context); + assert(Device); + // Check that incorrect bits are not set in the properties. 
+ assert(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS))); + + // TODO: translate PI properties to L0 flags + ze_device_mem_alloc_desc_t ZeDesc = {}; + ZeDesc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_DEFAULT; + ZeDesc.ordinal = 0; + ZE_CALL(zeDriverAllocDeviceMem(Context->Device->Platform->ZeDriver, &ZeDesc, + Size, Alignment, Device->ZeDevice, ResultPtr)); + + return PI_SUCCESS; +} + +pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, size_t Size, + pi_uint32 Alignment) { + + assert(Context); + assert(Device); + // Check that incorrect bits are not set in the properties. + assert(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS))); + + // TODO: translate PI properties to L0 flags + ze_host_mem_alloc_desc_t ZeHostDesc = {}; + ZeHostDesc.flags = ZE_HOST_MEM_ALLOC_FLAG_DEFAULT; + ze_device_mem_alloc_desc_t ZeDevDesc = {}; + ZeDevDesc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_DEFAULT; + ZeDevDesc.ordinal = 0; + ZE_CALL(zeDriverAllocSharedMem(Context->Device->Platform->ZeDriver, + &ZeDevDesc, &ZeHostDesc, Size, Alignment, + Device->ZeDevice, ResultPtr)); + + return PI_SUCCESS; +} + +pi_result piextUSMFree(pi_context Context, void *Ptr) { + ZE_CALL(zeDriverFreeMem(Context->Device->Platform->ZeDriver, Ptr)); + return PI_SUCCESS; +} + +pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, + size_t ArgSize, const void *ArgValue) { + + return piKernelSetArg(Kernel, ArgIndex, ArgSize, ArgValue); +} + +/// USM Memset API +/// +/// @param Queue is the queue to submit to +/// @param Ptr is the ptr to memset +/// @param Value is value to set. 
It is interpreted as an 8-bit value and the +/// upper +/// 24 bits are ignored +/// @param Count is the size in bytes to memset +/// @param NumEventsInWaitlist is the number of events to wait on +/// @param EventsWaitlist is an array of events to wait on +/// @param Event is the event that represents this operation +pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, + size_t Count, pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, + pi_event *Event) { + if (!Ptr) { + return PI_INVALID_VALUE; + } + + return enqueueMemFillHelper( + // TODO: do we need a new command type for USM memset? + PI_COMMAND_TYPE_MEM_BUFFER_FILL, Queue, Ptr, + &Value, // It will be interpreted as an 8-bit value, + 1, // which is indicated with this pattern_size==1 + Count, NumEventsInWaitlist, EventsWaitlist, Event); +} + +pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, + const void *SrcPtr, size_t Size, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, + pi_event *Event) { + + if (!DstPtr) { + return PI_INVALID_VALUE; + } + + return enqueueMemCopyHelper( + // TODO: do we need a new command type for this? 
+ PI_COMMAND_TYPE_MEM_BUFFER_COPY, Queue, DstPtr, Blocking, Size, SrcPtr, + NumEventsInWaitlist, EventsWaitlist, Event); +} + +/// Hint to migrate memory to the device +/// +/// @param Queue is the queue to submit to +/// @param Ptr points to the memory to migrate +/// @param Size is the number of bytes to migrate +/// @param Flags is a bitfield used to specify memory migration options +/// @param NumEventsInWaitlist is the number of events to wait on +/// @param EventsWaitlist is an array of events to wait on +/// @param Event is the event that represents this operation +pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, + pi_usm_migration_flags Flags, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, + pi_event *Event) { + assert(Queue); + assert(!(Flags & ~PI_USM_MIGRATION_TBD0)); + + // Get a new command list to be used on this call + ze_command_list_handle_t ZeCommandList = nullptr; + if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) + return Res; + + // TODO: do we need to create a unique command type for this? + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; + + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_USER; + (*Event)->ZeCommandList = ZeCommandList; + + ze_event_handle_t *ZeEventWaitList = + _pi_event::createZeEventList(NumEventsInWaitlist, EventsWaitlist); + + ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitlist, + ZeEventWaitList)); + + // TODO: figure out how to translate "flags" + ZE_CALL(zeCommandListAppendMemoryPrefetch(ZeCommandList, Ptr, Size)); + + // TODO: L0 does not have a completion "event" with the prefetch API, + // so manually add command to signal our event. 
+ // + ZE_CALL(zeCommandListAppendSignalEvent(ZeCommandList, (*Event)->ZeEvent)); + + if (auto Res = Queue->executeCommandList(ZeCommandList, false)) + return Res; + + _pi_event::deleteZeEventList(ZeEventWaitList); + + return PI_SUCCESS; +} + +/// USM memadvise API to govern behavior of automatic migration mechanisms +/// +/// @param Queue is the queue to submit to +/// @param Ptr is the data to be advised +/// @param Length is the size in bytes of the meory to advise +/// @param Advice is device specific advice +/// @param Event is the event that represents this operation +/// +pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, + size_t Length, pi_mem_advice Advice, + pi_event *Event) { + assert(Queue); + ze_memory_advice_t ZeAdvice = {}; + switch (Advice) { + case PI_MEM_ADVICE_SET_READ_MOSTLY: + ZeAdvice = ZE_MEMORY_ADVICE_SET_READ_MOSTLY; + break; + case PI_MEM_ADVICE_CLEAR_READ_MOSTLY: + ZeAdvice = ZE_MEMORY_ADVICE_CLEAR_READ_MOSTLY; + break; + case PI_MEM_ADVICE_SET_PREFERRED_LOCATION: + ZeAdvice = ZE_MEMORY_ADVICE_SET_PREFERRED_LOCATION; + break; + case PI_MEM_ADVICE_CLEAR_PREFERRED_LOCATION: + ZeAdvice = ZE_MEMORY_ADVICE_CLEAR_PREFERRED_LOCATION; + break; + case PI_MEM_ADVICE_SET_ACCESSED_BY: + ZeAdvice = ZE_MEMORY_ADVICE_SET_ACCESSED_BY; + break; + case PI_MEM_ADVICE_CLEAR_ACCESSED_BY: + ZeAdvice = ZE_MEMORY_ADVICE_CLEAR_ACCESSED_BY; + break; + case PI_MEM_ADVICE_SET_NON_ATOMIC_MOSTLY: + ZeAdvice = ZE_MEMORY_ADVICE_SET_NON_ATOMIC_MOSTLY; + break; + case PI_MEM_ADVICE_CLEAR_NON_ATOMIC_MOSTLY: + ZeAdvice = ZE_MEMORY_ADVICE_CLEAR_NON_ATOMIC_MOSTLY; + break; + case PI_MEM_ADVICE_BIAS_CACHED: + ZeAdvice = ZE_MEMORY_ADVICE_BIAS_CACHED; + break; + case PI_MEM_ADVICE_BIAS_UNCACHED: + ZeAdvice = ZE_MEMORY_ADVICE_BIAS_UNCACHED; + break; + default: + zePrint("piextUSMEnqueueMemAdvise: unexpected memory advise\n"); + return PI_INVALID_VALUE; + } + + // Get a new command list to be used on this call + ze_command_list_handle_t ZeCommandList = nullptr; + 
if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) + return Res; + + // TODO: do we need to create a unique command type for this? + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; + + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_USER; + (*Event)->ZeCommandList = ZeCommandList; + + ZE_CALL(zeCommandListAppendMemAdvise( + ZeCommandList, Queue->Context->Device->ZeDevice, Ptr, Length, ZeAdvice)); + + // TODO: L0 does not have a completion "event" with the advise API, + // so manually add command to signal our event. + // + ZE_CALL(zeCommandListAppendSignalEvent(ZeCommandList, (*Event)->ZeEvent)); + + Queue->executeCommandList(ZeCommandList, false); + return PI_SUCCESS; +} + +/// API to query information about USM allocated pointers +/// Valid Queries: +/// PI_MEM_ALLOC_TYPE returns host/device/shared pi_usm_type value +/// PI_MEM_ALLOC_BASE_PTR returns the base ptr of an allocation if +/// the queried pointer fell inside an allocation. +/// Result must fit in void * +/// PI_MEM_ALLOC_SIZE returns how big the queried pointer's +/// allocation is in bytes. Result is a size_t. 
+/// PI_MEM_ALLOC_DEVICE returns the pi_device this was allocated against +/// +/// @param Context is the pi_context +/// @param Ptr is the pointer to query +/// @param ParamName is the type of query to perform +/// @param ParamValueSize is the size of the result in bytes +/// @param ParamValue is the result +/// @param ParamValueRet is how many bytes were written +pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, + pi_mem_info ParamName, size_t ParamValueSize, + void *ParamValue, size_t *ParamValueSizeRet) { + assert(Context); + ze_device_handle_t ZeDeviceHandle; + ze_memory_allocation_properties_t ZeMemoryAllocationProperties = { + ZE_MEMORY_ALLOCATION_PROPERTIES_VERSION_CURRENT}; + + ZE_CALL(zeDriverGetMemAllocProperties(Context->Device->Platform->ZeDriver, + Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + switch (ParamName) { + case PI_MEM_ALLOC_TYPE: { + pi_usm_type MemAllocaType; + switch (ZeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_UNKNOWN: + MemAllocaType = PI_MEM_TYPE_UNKNOWN; + break; + case ZE_MEMORY_TYPE_HOST: + MemAllocaType = PI_MEM_TYPE_HOST; + break; + case ZE_MEMORY_TYPE_DEVICE: + MemAllocaType = PI_MEM_TYPE_DEVICE; + break; + case ZE_MEMORY_TYPE_SHARED: + MemAllocaType = PI_MEM_TYPE_SHARED; + break; + default: + zePrint("piextUSMGetMemAllocInfo: unexpected usm memory type\n"); + return PI_INVALID_VALUE; + } + SET_PARAM_VALUE(MemAllocaType); + break; + } + case PI_MEM_ALLOC_DEVICE: { + // TODO: this wants pi_device, but we didn't remember it, and cannot + // deduct from the L0 device. 
+ // + die("piextUSMGetMemAllocInfo: PI_MEM_ALLOC_DEVICE not implemented"); + break; + } + case PI_MEM_ALLOC_BASE_PTR: { + void *Base; + ZE_CALL(zeDriverGetMemAddressRange(Context->Device->Platform->ZeDriver, Ptr, + &Base, nullptr)); + SET_PARAM_VALUE(Base); + break; + } + case PI_MEM_ALLOC_SIZE: { + size_t Size; + ZE_CALL(zeDriverGetMemAddressRange(Context->Device->Platform->ZeDriver, Ptr, + nullptr, &Size)); + SET_PARAM_VALUE(Size); + break; + } + default: + zePrint("piextUSMGetMemAllocInfo: unsupported ParamName\n"); + return PI_INVALID_VALUE; + } + return PI_SUCCESS; +} + +pi_result piKernelSetExecInfo(pi_kernel Kernel, pi_kernel_exec_info ParamName, + size_t ParamValueSize, const void *ParamValue) { + assert(Kernel); + assert(ParamValue); + if (ParamName == PI_USM_INDIRECT_ACCESS && + *(static_cast(ParamValue)) == PI_TRUE) { + // The whole point for users really was to not need to know anything + // about the types of allocations kernel uses. So in DPC++ we always + // just set all 3 modes for each kernel. + // + bool ZeIndirectValue = true; + ZE_CALL(zeKernelSetAttribute(Kernel->ZeKernel, + ZE_KERNEL_ATTR_INDIRECT_SHARED_ACCESS, + sizeof(bool), &ZeIndirectValue)); + ZE_CALL(zeKernelSetAttribute(Kernel->ZeKernel, + ZE_KERNEL_ATTR_INDIRECT_DEVICE_ACCESS, + sizeof(bool), &ZeIndirectValue)); + ZE_CALL(zeKernelSetAttribute(Kernel->ZeKernel, + ZE_KERNEL_ATTR_INDIRECT_HOST_ACCESS, + sizeof(bool), &ZeIndirectValue)); + } else { + zePrint("piKernelSetExecInfo: unsupported ParamName\n"); + return PI_INVALID_VALUE; + } + + return PI_SUCCESS; +} + +pi_result piextProgramSetSpecializationConstant(pi_program Prog, + pi_uint32 SpecID, + size_t SpecSize, + const void *SpecValue) { + // TODO: implement + die("piextProgramSetSpecializationConstant: not implemented"); + return {}; +} + +pi_result piPluginInit(pi_plugin *PluginInit) { + assert(PluginInit); + // TODO: handle versioning/targets properly. 
+ size_t PluginVersionSize = sizeof(PluginInit->PluginVersion); + assert(strlen(_PI_H_VERSION_STRING) < PluginVersionSize); + strncpy(PluginInit->PluginVersion, _PI_H_VERSION_STRING, PluginVersionSize); + +#define _PI_API(api) \ + (PluginInit->PiFunctionTable).api = (decltype(&::api))(&api); +#include + + return PI_SUCCESS; +} + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/sycl/plugins/Intel_level0/pi_level0.hpp b/sycl/plugins/Intel_level0/pi_level0.hpp new file mode 100755 index 0000000000000..f43c518492781 --- /dev/null +++ b/sycl/plugins/Intel_level0/pi_level0.hpp @@ -0,0 +1,346 @@ +#include +#include +#include +#include +#include +#include + +#include + +template To pi_cast(From Value) { + // TODO: see if more sanity checks are possible. + assert(sizeof(From) == sizeof(To)); + return (To)(Value); +} + +template <> uint32_t pi_cast(uint64_t Value) { + // Cast value and check that we don't lose any information. + uint32_t CastedValue = (uint32_t)(Value); + assert((uint64_t)CastedValue == Value); + return CastedValue; +} + +// TODO: Currently die is defined in each plugin. Probably some +// common header file with utilities should be created. +[[noreturn]] void die(const char *Message) { + std::cerr << "die: " << Message << std::endl; + std::terminate(); +} + +// Define the types that are opaque in pi.h in a manner suitabale for L0 plugin + +struct _pi_platform { + _pi_platform(ze_driver_handle_t Driver) : ZeDriver{Driver} {} + + // L0 lacks the notion of a platform, but thert is a driver, which is a + // pretty good fit to keep here. + // + ze_driver_handle_t ZeDriver; + + // Cache versions info from zeDriverGetProperties. 
+ std::string ZeDriverVersion; + std::string ZeDriverApiVersion; +}; + +struct _pi_device { + _pi_device(ze_device_handle_t Device, pi_platform Plt, + bool isSubDevice = false) + : ZeDevice{Device}, Platform{Plt}, ZeCommandListInit{nullptr}, + IsSubDevice{isSubDevice}, RefCount{1}, ZeDeviceProperties{}, + ZeDeviceComputeProperties{} { + // NOTE: one must additionally call initialize() to complete + // PI device creation. + } + + // Initialize the entire PI device. + pi_result initialize(); + + // L0 device handle. + ze_device_handle_t ZeDevice; + + // PI platform to which this device belongs. + pi_platform Platform; + + // Immediate L0 command list for this device, to be used for initializations. + // To be created as: + // - Immediate command list: So any command appended to it is immediately + // offloaded to the device. + // - Synchronous: So implicit synchronization is made inside the level-zero + // driver. + ze_command_list_handle_t ZeCommandListInit; + + // Indicates if this is a root-device or a sub-device. + // Technically this information can be queried from a device handle, but it + // seems better to just keep it here. + // + bool IsSubDevice; + + // L0 doesn't do the reference counting, so we have to do. + // Must be atomic to prevent data race when incrementing/decrementing. + std::atomic RefCount; + + // Create a new command list for executing on this device. + // It's caller's responsibility to remember and destroy the created + // command list when no longer needed. + // + pi_result createCommandList(ze_command_list_handle_t *ze_command_list); + + // Cache of the immutable device properties. + ze_device_properties_t ZeDeviceProperties; + ze_device_compute_properties_t ZeDeviceComputeProperties; +}; + +struct _pi_context { + _pi_context(pi_device Device) + : Device{Device}, RefCount{1}, ZeEventPool{nullptr}, + NumEventsAvailableInEventPool{}, NumEventsLiveInEventPool{} {} + + // L0 does not have notion of contexts. 
+ // Keep the device here (must be exactly one) to return it when PI context + // is queried for devices. + // + pi_device Device; + + // L0 doesn't do the reference counting, so we have to do. + // Must be atomic to prevent data race when incrementing/decrementing. + std::atomic RefCount; + + // Get index of the free slot in the available pool. If there is no available + // pool then create new one. + ze_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, + size_t &); + + // If event is destroyed then decrement number of events living in the pool + // and destroy the pool if there are no alive events. + ze_result_t decrementAliveEventsInPool(ze_event_pool_handle_t pool); + +private: + // Following member variables are used to manage assignment of events + // to event pools. + // TODO: These variables may be moved to pi_device and pi_platform + // if appropriate + // Event pool to which events are being added to + ze_event_pool_handle_t ZeEventPool; + // This map will be used to determine if a pool is full or not + // by storing number of empty slots available in the pool + std::map NumEventsAvailableInEventPool; + // This map will be used to determine number of live events in the pool + // We use separate maps for number of event slots available in the pool + // and number of events live in the pool + // This will help when we try to make the code thread-safe + std::map NumEventsLiveInEventPool; + + // TODO: we'd like to create a thread safe map class instead of mutex + map, + // that must be carefully used together. + + // Mutex to control operations on NumEventsAvailableInEventPool map. + std::mutex NumEventsAvailableInEventPoolMutex; + + // Mutex to control operations on NumEventsLiveInEventPool. + std::mutex NumEventsLiveInEventPoolMutex; +}; + +struct _pi_queue { + _pi_queue(ze_command_queue_handle_t Queue, pi_context Context) + : ZeCommandQueue{Queue}, Context{Context}, RefCount{1} {} + + // L0 command queue handle.
+ ze_command_queue_handle_t ZeCommandQueue; + + // Keeps the PI context to which this queue belongs. + pi_context Context; + + // L0 doesn't do the reference counting, so we have to do. + // Must be atomic to prevent data race when incrementing/decrementing. + std::atomic RefCount; + + // Attach a command list to this queue, close, and execute it. + // Note that this command list cannot be appended to after this. + // The "is_blocking" tells if the wait for completion is requested. + // + pi_result executeCommandList(ze_command_list_handle_t ZeCommandList, + bool is_blocking = false); +}; + +struct _pi_mem { + // Keeps the PI platform of this memory handle. + pi_platform Platform; + + // Keeps the host pointer where the buffer will be mapped to, + // if created with PI_MEM_FLAGS_HOST_PTR_USE (see + // piEnqueueMemBufferMap for details). + // + char *MapHostPtr; + + // L0 doesn't do the reference counting, so we have to do. + // Must be atomic to prevent data race when incrementing/decrementing. + std::atomic RefCount; + + // Supplementary data to keep track of the mappings of this memory + // created with piEnqueueMemBufferMap and piEnqueueMemImageMap. + // + struct Mapping { + // The offset in the buffer giving the start of the mapped region. + size_t Offset; + // The size of the mapped region. + size_t Size; + }; + + virtual ~_pi_mem() = default; + + // Interface of the _pi_mem object. + virtual void *getZeHandle() = 0; + + virtual void *getZeHandlePtr() = 0; + + virtual bool isImage() const = 0; + + // Thread-safe methods to work with memory mappings + pi_result addMapping(void *MappedTo, size_t Size, size_t Offset); + pi_result removeMapping(void *MappedTo, Mapping &MapInfo); + +protected: + _pi_mem(pi_platform Plt, char *HostPtr) + : Platform{Plt}, MapHostPtr{HostPtr}, RefCount{1}, Mappings{} {} + +private: + // The key is the host pointer representing an active mapping. + // The value is the information needed to maintain/undo the mapping. 
+ // + std::map Mappings; + + // TODO: we'd like to create a thread safe map class instead of mutex + map, + // that must be carefully used together. + // The mutex that is used for thread-safe work with Mappings. + std::mutex MappingsMutex; +}; + +struct _pi_buffer final : _pi_mem { + // Buffer/Sub-buffer constructor + _pi_buffer(pi_platform Plt, char *Mem, char *HostPtr, + _pi_mem *Parent = nullptr, size_t Origin = 0, size_t Size = 0) + : _pi_mem(Plt, HostPtr), ZeMem{Mem}, SubBuffer{Parent, Origin, Size} {} + + void *getZeHandle() override { return ZeMem; } + + void *getZeHandlePtr() override { return &ZeMem; } + + bool isImage() const override { return false; } + + bool isSubBuffer() const { return SubBuffer.Parent != nullptr; } + + // L0 memory handle is really just a naked pointer. + // It is just convenient to have it char * to simplify offset arithmetics. + // + char *ZeMem; + + struct { + _pi_mem *Parent; + size_t Origin; // only valid if Parent != nullptr + size_t Size; // only valid if Parent != nullptr + } SubBuffer; +}; + +struct _pi_image final : _pi_mem { + // Image constructor + _pi_image(pi_platform Plt, ze_image_handle_t Image, char *HostPtr) + : _pi_mem(Plt, HostPtr), ZeImage{Image} {} + + void *getZeHandle() override { return ZeImage; } + + void *getZeHandlePtr() override { return &ZeImage; } + + bool isImage() const override { return true; } + +#ifndef NDEBUG + // Keep the descriptor of the image (for debugging purposes) + ze_image_desc_t ZeImageDesc; +#endif // !NDEBUG + + // L0 image handle. + ze_image_handle_t ZeImage; +}; + +struct _pi_event { + _pi_event(ze_event_handle_t ZeEvent, ze_event_pool_handle_t ZeEventPool, + pi_context Context, pi_command_type CommandType) + : ZeEvent{ZeEvent}, ZeEventPool{ZeEventPool}, ZeCommandList{nullptr}, + CommandType{CommandType}, Context{Context}, + CommandData{nullptr}, RefCount{1} {} + + // L0 event handle. + ze_event_handle_t ZeEvent; + // L0 event pool handle. 
+ ze_event_pool_handle_t ZeEventPool; + + // L0 command list where the command signaling this event was appended to. + // This is currently used to remember/destroy the command list after + // all commands in it are completed, i.e. this event signaled. + // + ze_command_list_handle_t ZeCommandList; + + // Keeps the command-queue and command associated with the event. + // These are NULL for the user events. + pi_queue Queue; + pi_command_type CommandType; + // Provide direct access to Context, instead of going via queue + // Not every PI event has a queue, and we need a handle to Context + // to get to event pool related information + pi_context Context; + + // Opaque data to hold any data needed for CommandType. + void *CommandData; + + // L0 doesn't do the reference counting, so we have to do. + // Must be atomic to prevent data race when incrementing/decrementing. + std::atomic RefCount; + + // Methods for translating PI events list into L0 events list + static ze_event_handle_t *createZeEventList(pi_uint32, const pi_event *); + static void deleteZeEventList(ze_event_handle_t *); +}; + +struct _pi_program { + _pi_program(ze_module_handle_t Module, pi_context Context) + : ZeModule{Module}, Context{Context}, RefCount{1} {} + + // L0 module handle. + ze_module_handle_t ZeModule; + + // Keep the context of the program. + pi_context Context; + + // L0 doesn't do the reference counting, so we have to do. + // Must be atomic to prevent data race when incrementing/decrementing. + std::atomic RefCount; +}; + +struct _pi_kernel { + _pi_kernel(ze_kernel_handle_t Kernel, pi_program Program) + : ZeKernel{Kernel}, Program{Program}, RefCount{1} {} + + // L0 function handle. + ze_kernel_handle_t ZeKernel; + + // Keep the program of the kernel. + pi_program Program; + + // L0 doesn't do the reference counting, so we have to do. + // Must be atomic to prevent data race when incrementing/decrementing. 
+ std::atomic RefCount; +}; + +struct _pi_sampler { + _pi_sampler(ze_sampler_handle_t Sampler) : ZeSampler{Sampler}, RefCount{1} {} + + // L0 sampler handle. + // TODO: It is important that L0 handler is the first data member. Workaround + // in SYCL RT (in ExecCGCommand::enqueueImp()) relies on this. This comment + // should be removed when workaround in SYCL runtime will be removed. + ze_sampler_handle_t ZeSampler; + + // L0 doesn't do the reference counting, so we have to do. + // Must be atomic to prevent data race when incrementing/decrementing. + std::atomic RefCount; +}; + diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index b1510004be08f..1559f40e5ad86 100644 --- a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -118,8 +118,10 @@ template <> class SYCLConfig { return BackendPtr; const char *ValStr = BaseT::getRawValue(); - const std::array, 2> SyclBeMap = { - {{"PI_OPENCL", backend::opencl}, {"PI_CUDA", backend::cuda}}}; + const std::array, 3> SyclBeMap = { + {{"PI_OPENCL", backend::opencl}, + {"PI_LEVEL0", backend::level0}, + {"PI_CUDA", backend::cuda}}}; if (ValStr) { auto It = std::find_if( std::begin(SyclBeMap), std::end(SyclBeMap), @@ -128,7 +130,7 @@ template <> class SYCLConfig { }); if (It == SyclBeMap.end()) pi::die("Invalid backend. 
" - "Valid values are PI_OPENCL/PI_CUDA"); + "Valid values are PI_OPENCL/PI_LEVEL0/PI_CUDA"); static backend Backend = It->second; BackendPtr = &Backend; } diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 15cc848e273af..735d947572fa3 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -214,6 +214,8 @@ bool findPlugins(vector_class> &PluginNames) { // PluginNames.push_back(std::make_pair(OPENCL_PLUGIN_NAME, backend::opencl)); + PluginNames.push_back(std::make_pair(LEVEL0_PLUGIN_NAME, + backend::level0)); PluginNames.push_back( std::make_pair(CUDA_PLUGIN_NAME, backend::cuda)); return true; From b0fabecdba9e71ba28e59264fe1b5640e5351e87 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Tue, 19 May 2020 19:27:20 +0300 Subject: [PATCH 02/21] [SYCL] Don't use SYCL device libs when backend is L0 L0 plugin doesn't support piProgramCompile/piProgramLink commands, program is built during piProgramCreate. Signed-off-by: Artur Gainullin --- sycl/source/detail/program_manager/program_manager.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 2abf16754b670..58f282a070ac5 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -764,6 +764,14 @@ ProgramManager::build(ProgramPtr Program, const ContextImplPtr Context, LinkOpts = LinkOptions.c_str(); } + // L0 plugin doesn't support piProgramCompile/piProgramLink commands, program + // is built during piProgramCreate. + // TODO: remove this check as soon as piProgramCompile/piProgramLink will be + // implemented in L0 plugin. 
+ if (Context->getPlugin().getBackend() == backend::level0) { + LinkDeviceLibs = false; + } + std::vector LinkPrograms; if (LinkDeviceLibs) { LinkPrograms = getDeviceLibPrograms(Context, Devices, CachedLibPrograms); From e59ed9a2569f70359bea28451bf7e01fe6bf72b5 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 20 May 2020 00:06:04 +0300 Subject: [PATCH 03/21] Minor fixes * Fix clang-format issue * Fix using statement Signed-off-by: Artur Gainullin --- sycl/include/CL/sycl/detail/pi.h | 2 +- sycl/plugins/Intel_level0/pi_level0.hpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 73bf9cd9b6c11..1c3efb5db1550 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -699,7 +699,7 @@ struct pi_buffer_region_struct { size_t origin; size_t size; }; -using pi_buffer_region_struct *pi_buffer_region; +using pi_buffer_region = pi_buffer_region_struct *; // Offload binaries descriptor version supported by this library. static const uint16_t PI_DEVICE_BINARIES_VERSION = 1; diff --git a/sycl/plugins/Intel_level0/pi_level0.hpp b/sycl/plugins/Intel_level0/pi_level0.hpp index f43c518492781..e8f81c01467b4 100755 --- a/sycl/plugins/Intel_level0/pi_level0.hpp +++ b/sycl/plugins/Intel_level0/pi_level0.hpp @@ -343,4 +343,3 @@ struct _pi_sampler { // Must be atomic to prevent data race when incrementing/decrementing. 
std::atomic RefCount; }; - From a4b5c7cfa0c5483a365ad04c531d6f646e908e52 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 20 May 2020 03:14:38 +0300 Subject: [PATCH 04/21] Fix build on Windows and export only pi* symbols in libpi_level0.so Align with changes made for OpenCL plugin in https://github.com/intel/llvm/pull/1638: * Fix build on Windows by defining __SYCL_BUILD_SYCL_DLL in cmake file * Export only pi* symbols in libpi_level0.so * Add test to check exported symbols Signed-off-by: Artur Gainullin --- sycl/plugins/Intel_level0/CMakeLists.txt | 20 +++++ sycl/test/abi/pi_level0_symbol_check.dump | 101 ++++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 sycl/test/abi/pi_level0_symbol_check.dump diff --git a/sycl/plugins/Intel_level0/CMakeLists.txt b/sycl/plugins/Intel_level0/CMakeLists.txt index 1517bffb9f494..f060d6fa9e9e7 100755 --- a/sycl/plugins/Intel_level0/CMakeLists.txt +++ b/sycl/plugins/Intel_level0/CMakeLists.txt @@ -46,6 +46,26 @@ add_library(pi_level0 SHARED "${CMAKE_CURRENT_SOURCE_DIR}/pi_level0.hpp" ) +if (MSVC) + # by defining __SYCL_BUILD_SYCL_DLL, we can use __declspec(dllexport) + # which are individually tagged for all pi* symbols in pi.h + target_compile_definitions(pi_level0 PRIVATE __SYCL_BUILD_SYCL_DLL) +else() + # we set the visibility of all symbols 'hidden' by default. + # In pi.h file, we set exported symbols with visibility==default individually + target_compile_options(pi_level0 PUBLIC -fvisibility=hidden) + + # This script file is used to allow exporting pi* symbols only. + # All other symbols are regarded as local (hidden) + set(linker_script "${CMAKE_CURRENT_SOURCE_DIR}/../ld-version-script.txt") + + # Filter symbols based on the scope defined in the script file, + # and export pi* function symbols in the library. 
+ target_link_libraries( pi_level0 + PRIVATE "-Wl,--version-script=${linker_script}" + ) +endif() + add_dependencies(pi_level0 l0-loader) add_dependencies(sycl-toolchain pi_level0) diff --git a/sycl/test/abi/pi_level0_symbol_check.dump b/sycl/test/abi/pi_level0_symbol_check.dump new file mode 100644 index 0000000000000..51326e1c41cc4 --- /dev/null +++ b/sycl/test/abi/pi_level0_symbol_check.dump @@ -0,0 +1,101 @@ +# RUN: env LLVM_BIN_PATH=%llvm_build_bin_dir python %sycl_tools_src_dir/abi_check.py --mode check_symbols --reference %s %sycl_libs_dir/libpi_level0.so +# REQUIRES: linux + +piDeviceGetInfo +piextContextGetNativeHandle +piEnqueueMemImageWrite +piEnqueueMemBufferWrite +piextUSMFree +piEnqueueNativeKernel +piProgramRetain +piProgramGetBuildInfo +piextUSMEnqueueMemAdvise +piKernelCreate +piKernelGetSubGroupInfo +piextUSMEnqueueMemset +piSamplerRetain +piEventRelease +piextDeviceCreateWithNativeHandle +piEventSetCallback +piSamplerRelease +piextMemCreateWithNativeHandle +piEventRetain +piProgramLink +piextUSMSharedAlloc +piContextCreate +piSamplerGetInfo +piEnqueueMemImageCopy +piextMemGetNativeHandle +piEnqueueMemBufferMap +piPluginInit +piextQueueCreateWithNativeHandle +piContextRelease +piextProgramCreateWithNativeHandle +piMemBufferCreate +piextUSMGetMemAllocInfo +piDevicesGet +piKernelRetain +piSamplerCreate +piEnqueueMemBufferRead +piPlatformGetInfo +piContextRetain +piextDeviceSelectBinary +piEnqueueMemImageFill +piDeviceRelease +piQueueFinish +piKernelRelease +piMemImageCreate +piProgramCompile +piMemGetInfo +piextProgramSetSpecializationConstant +piextQueueGetNativeHandle +piEnqueueMemImageRead +piextUSMEnqueueMemcpy +piProgramCreate +piextContextSetExtendedDeleter +piProgramBuild +piKernelSetExecInfo +piPlatformsGet +piEnqueueMemBufferFill +piMemRetain +piextUSMEnqueuePrefetch +piextKernelSetArgPointer +piEnqueueEventsWait +piEnqueueMemBufferCopy +piQueueGetInfo +piDevicePartition +piQueueRetain +piextDeviceGetNativeHandle +piEventGetInfo 
+piMemImageGetInfo +piContextGetInfo +piclProgramCreateWithSource +piextProgramGetNativeHandle +piEventGetProfilingInfo +piProgramGetInfo +piextGetDeviceFunctionPointer +piEnqueueMemUnmap +piextKernelSetArgMemObj +piQueueCreate +piEventCreate +piKernelGetInfo +piQueueRelease +piKernelSetArg +piEnqueueMemBufferCopyRect +piEnqueueKernelLaunch +piextContextCreateWithNativeHandle +piclProgramCreateWithBinary +piKernelGetGroupInfo +piextEventCreateWithNativeHandle +piEventsWait +piMemRelease +piProgramRelease +piDeviceRetain +piextUSMDeviceAlloc +piEventSetStatus +piextEventGetNativeHandle +piEnqueueMemBufferReadRect +piMemBufferPartition +piEnqueueMemBufferWriteRect +piextUSMHostAlloc + From 120ee9c404c7c260630bffa9bdd75d671320df17 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 20 May 2020 03:29:31 +0300 Subject: [PATCH 05/21] Fix naming of env variables Signed-off-by: Artur Gainullin --- sycl/plugins/Intel_level0/pi_level0.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/Intel_level0/pi_level0.cpp b/sycl/plugins/Intel_level0/pi_level0.cpp index b84ea5a06d97b..e22cd8d932a3c 100755 --- a/sycl/plugins/Intel_level0/pi_level0.cpp +++ b/sycl/plugins/Intel_level0/pi_level0.cpp @@ -490,11 +490,11 @@ static void piSignalHandler(int SigNum) { pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { - static const char *DebugMode = std::getenv("ZeDebug"); + static const char *DebugMode = std::getenv("ZE_DEBUG"); if (DebugMode) ZeDebug = true; - static const char *SerializeMode = std::getenv("ZeSerialize"); + static const char *SerializeMode = std::getenv("ZE_SERIALIZE"); static const pi_uint32 SerializeModeValue = SerializeMode ? 
std::atoi(SerializeMode) : 0; ZeSerialize = SerializeModeValue; From 5ff2e4d446235e32496d95861ef511451269dbf8 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 20 May 2020 03:55:29 +0300 Subject: [PATCH 06/21] Update documentation * Add link to the Level Zero specification * Revert changes in the section providing links to download low level runtimes. Links are supposed to be provided in buildbot/dependency.conf according to upcoming changes https://github.com/intel/llvm/pull/1699 Signed-off-by: Artur Gainullin --- sycl/doc/GetStartedGuide.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index e7ddd196aed80..5a94bc03f3c0f 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -165,14 +165,14 @@ To run DPC++ applications on OpenCL devices, OpenCL implementation(s) must be present in the system. To run DPC++ applications on Level Zero devices, Level Zero implementation(s) -must be present in the system. +must be present in the system. You can find the link to the Level Zero spec in +the following section [Find More](#find-more). Please, refer to [the Release Notes](../ReleaseNotes.md) for recommended Intel runtime versions. -To run DPC++ application on Intel `GPU` devices the OpenCL `GPU` runtime or the -Level Zero `GPU` runtime is needed. 
They can be downloaded from the following web -pages: +The `GPU` runtime that is needed to run DPC++ application on Intel `GPU` devices +can be downloaded from the following web pages: * Linux: [Intel® Graphics Compute Runtime for OpenCL™](https://github.com/intel/compute-runtime/releases) @@ -547,5 +547,7 @@ class CUDASelector : public cl::sycl::device_selector { [https://spec.oneapi.com/versions/latest/elements/dpcpp/source/index.html](https://spec.oneapi.com/versions/latest/elements/dpcpp/source/index.html) - SYCL\* 1.2.1 specification: [www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf](https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) +- oneAPI Level Zero specification: +[https://spec.oneapi.com/versions/latest/oneL0/index.html](https://spec.oneapi.com/versions/latest/oneL0/index.html) \*Other names and brands may be claimed as the property of others. From 097c5959eb775ee844b3b349b76ebd7c9081e0bf Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 20 May 2020 20:10:39 +0300 Subject: [PATCH 07/21] Update dependency.conf with info about Level Zero Signed-off-by: Artur Gainullin --- buildbot/dependency.conf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/buildbot/dependency.conf b/buildbot/dependency.conf index 6a09b746f2019..0d34958f5a89a 100644 --- a/buildbot/dependency.conf +++ b/buildbot/dependency.conf @@ -3,8 +3,10 @@ ocl_cpu_rt_ver=2020.10.4.0.15 # https://github.com/intel/llvm/releases/download/2020-03/win-oclcpuexp-2020.10.4.0.15_rel.zip ocl_cpu_rt_ver_win=2020.10.4.0.15 +# Same GPU driver supports Level Zero and OpenCL: # https://github.com/intel/compute-runtime/releases/tag/20.12.16259 ocl_gpu_rt_ver=l0-20.12.16259 +# Same GPU driver supports Level Zero and OpenCL: # https://downloadcenter.intel.com/download/29557/Intel-Graphics-Windows-10-DCH-Drivers ocl_gpu_rt_ver_win=ci-neo-015900 intel_sycl_ver=build From 4e56b22dfc1972e71b53aaa6967d67b3609bd828 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 20 May 2020 20:13:50 
+0300 Subject: [PATCH 08/21] Fix directory name Signed-off-by: Artur Gainullin --- sycl/plugins/CMakeLists.txt | 2 +- sycl/plugins/{Intel_level0 => level_zero}/CMakeLists.txt | 0 sycl/plugins/{Intel_level0 => level_zero}/pi_level0.cpp | 0 sycl/plugins/{Intel_level0 => level_zero}/pi_level0.hpp | 0 4 files changed, 1 insertion(+), 1 deletion(-) rename sycl/plugins/{Intel_level0 => level_zero}/CMakeLists.txt (100%) rename sycl/plugins/{Intel_level0 => level_zero}/pi_level0.cpp (100%) rename sycl/plugins/{Intel_level0 => level_zero}/pi_level0.hpp (100%) diff --git a/sycl/plugins/CMakeLists.txt b/sycl/plugins/CMakeLists.txt index b4aafc80eaa13..700e09d1a0c1d 100644 --- a/sycl/plugins/CMakeLists.txt +++ b/sycl/plugins/CMakeLists.txt @@ -5,4 +5,4 @@ if(SYCL_BUILD_PI_CUDA) endif() add_subdirectory(opencl) -add_subdirectory(Intel_level0) +add_subdirectory(level_zero) diff --git a/sycl/plugins/Intel_level0/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt similarity index 100% rename from sycl/plugins/Intel_level0/CMakeLists.txt rename to sycl/plugins/level_zero/CMakeLists.txt diff --git a/sycl/plugins/Intel_level0/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp similarity index 100% rename from sycl/plugins/Intel_level0/pi_level0.cpp rename to sycl/plugins/level_zero/pi_level0.cpp diff --git a/sycl/plugins/Intel_level0/pi_level0.hpp b/sycl/plugins/level_zero/pi_level0.hpp similarity index 100% rename from sycl/plugins/Intel_level0/pi_level0.hpp rename to sycl/plugins/level_zero/pi_level0.hpp From 8c655b8bc53acb3e84c0700b58ffb7ce14c690fb Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 20 May 2020 11:53:20 -0700 Subject: [PATCH 09/21] Address review comments * Use macro in zeParseError function which is easier to maintain * Fix env variable naming * Remove implementation of piEventSetCallback and piEventSetStatus, make them deprecated. These functions should be removed from PI in the separate patch because they are not used in the SYCL RT. 
Signed-off-by: Artur Gainullin --- sycl/plugins/level_zero/pi_level0.cpp | 203 ++++++-------------------- 1 file changed, 46 insertions(+), 157 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index e22cd8d932a3c..c47957cbd7a59 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -94,7 +94,7 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool, // here Setting it to 256 gave best possible performance for several // benchmarks static const char *MaxNumEventsPerPoolEnv = - std::getenv("MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); static const pi_uint32 MaxNumEventsPerPool = (MaxNumEventsPerPoolEnv) ? std::atoi(MaxNumEventsPerPoolEnv) : 256; @@ -222,111 +222,50 @@ static pi_result enqueueMemCopyRectHelper( inline void zeParseError(ze_result_t ZeError, std::string &ErrorString) { switch (ZeError) { - case ZE_RESULT_SUCCESS: - ErrorString = "ZE_RESULT_SUCCESS"; - break; - case ZE_RESULT_NOT_READY: - ErrorString = "ZE_RESULT_NOT_READY"; - break; - case ZE_RESULT_ERROR_DEVICE_LOST: - ErrorString = "ZE_RESULT_ERROR_DEVICE_LOST"; - break; - case ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY: - ErrorString = "ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY"; - break; - case ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY: - ErrorString = "ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY"; - break; - case ZE_RESULT_ERROR_MODULE_BUILD_FAILURE: - ErrorString = "ZE_RESULT_ERROR_MODULE_BUILD_FAILURE"; - break; - case ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS: - ErrorString = "ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS"; - break; - case ZE_RESULT_ERROR_NOT_AVAILABLE: - ErrorString = "ZE_RESULT_ERROR_NOT_AVAILABLE"; - break; - case ZE_RESULT_ERROR_UNINITIALIZED: - ErrorString = "ZE_RESULT_ERROR_UNINITIALIZED"; - break; - case ZE_RESULT_ERROR_UNSUPPORTED_VERSION: - ErrorString = "ZE_RESULT_ERROR_UNSUPPORTED_VERSION"; - break; - case 
ZE_RESULT_ERROR_UNSUPPORTED_FEATURE: - ErrorString = "ZE_RESULT_ERROR_UNSUPPORTED_FEATURE"; - break; - case ZE_RESULT_ERROR_INVALID_ARGUMENT: - ErrorString = "ZE_RESULT_ERROR_INVALID_ARGUMENT"; - break; - case ZE_RESULT_ERROR_INVALID_NULL_HANDLE: - ErrorString = "ZE_RESULT_ERROR_INVALID_NULL_HANDLE"; - break; - case ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE: - ErrorString = "ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE"; - break; - case ZE_RESULT_ERROR_INVALID_NULL_POINTER: - ErrorString = "ZE_RESULT_ERROR_INVALID_NULL_POINTER"; - break; - case ZE_RESULT_ERROR_INVALID_SIZE: - ErrorString = "ZE_RESULT_ERROR_INVALID_SIZE"; - break; - case ZE_RESULT_ERROR_UNSUPPORTED_SIZE: - ErrorString = "ZE_RESULT_ERROR_UNSUPPORTED_SIZE"; - break; - case ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: - ErrorString = "ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT"; - break; - case ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: - ErrorString = "ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT"; - break; - case ZE_RESULT_ERROR_INVALID_ENUMERATION: - ErrorString = "ZE_RESULT_ERROR_INVALID_ENUMERATION"; - break; - case ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION: - ErrorString = "ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; - break; - case ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: - ErrorString = "ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; - break; - case ZE_RESULT_ERROR_INVALID_NATIVE_BINARY: - ErrorString = "ZE_RESULT_ERROR_INVALID_NATIVE_BINARY"; - break; - case ZE_RESULT_ERROR_INVALID_GLOBAL_NAME: - ErrorString = "ZE_RESULT_ERROR_INVALID_GLOBAL_NAME"; - break; - case ZE_RESULT_ERROR_INVALID_KERNEL_NAME: - ErrorString = "ZE_RESULT_ERROR_INVALID_KERNEL_NAME"; - break; - case ZE_RESULT_ERROR_INVALID_FUNCTION_NAME: - ErrorString = "ZE_RESULT_ERROR_INVALID_FUNCTION_NAME"; - break; - case ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION: - ErrorString = "ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION"; - break; - case ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION: - ErrorString = "ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION"; - 
break; - case ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX: - ErrorString = "ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX"; - break; - case ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE: - ErrorString = "ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE"; - break; - case ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE: - ErrorString = "ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE"; - break; - case ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE: - ErrorString = "ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE"; - break; - case ZE_RESULT_ERROR_OVERLAPPING_REGIONS: - ErrorString = "ZE_RESULT_ERROR_OVERLAPPING_REGIONS"; - break; - case ZE_RESULT_ERROR_UNKNOWN: - ErrorString = "ZE_RESULT_ERROR_UNKNOWN"; - break; +#define ZE_ERRCASE(ERR) \ + case ERR: \ + ErrorString = "" #ERR; \ + break; + + ZE_ERRCASE(ZE_RESULT_SUCCESS) + ZE_ERRCASE(ZE_RESULT_NOT_READY) + ZE_ERRCASE(ZE_RESULT_ERROR_DEVICE_LOST) + ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY) + ZE_ERRCASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) + ZE_ERRCASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) + ZE_ERRCASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS) + ZE_ERRCASE(ZE_RESULT_ERROR_NOT_AVAILABLE) + ZE_ERRCASE(ZE_RESULT_ERROR_UNINITIALIZED) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ARGUMENT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE) + ZE_ERRCASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SIZE) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_ENUMERATION) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION) + ZE_ERRCASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME) + 
ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE) + ZE_ERRCASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE) + ZE_ERRCASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS) + ZE_ERRCASE(ZE_RESULT_ERROR_UNKNOWN) + +#undef ZE_ERRCASE default: assert("Unexpected Error code"); - } + } // switch } ze_result_t ZeCall::check(ze_result_t ZeResult, const char *CallStr, @@ -371,7 +310,7 @@ _pi_device::createCommandList(ze_command_list_handle_t *ZeCommandList) { // Create the command list, because in L0 commands are added to // the command lists, and later are then added to the command queue. // - // TODO: Fugire out how to lower the overhead of creating a new list + // TODO: Figure out how to lower the overhead of creating a new list // for each PI command, if that appears to be important. // ze_command_list_desc_t ZeCommandListDesc = {}; @@ -2398,61 +2337,12 @@ pi_result piEventSetCallback(pi_event Event, pi_int32 CommandExecCallbackType, pi_int32 EventCommandStatus, void *UserData), void *UserData) { - - // Increment the pi_event's reference counter to avoid destroying the event - // before all callbacks are executed. - piEventRetain(Event); - - // TODO: Can we support CL_SUBMITTED and CL_RUNNING? - // - if (CommandExecCallbackType != CL_COMPLETE) { - zePrint("piEventSetCallback: unsupported callback type\n"); - return PI_INVALID_VALUE; - } - - // Execute the wait and callback trigger in a side thread to not - // block the main host thread. - // TODO: We should use a single thread to serve all callbacks. 
- // - std::thread WaitThread( - [](pi_event Event, pi_int32 CommandExecCallbackType, - void (*PFnNotify)(pi_event Event, pi_int32 EventCommandStatus, - void *UserData), - void *UserData) { - // Implements the wait for the event to complete. - assert(CommandExecCallbackType == CL_COMPLETE); - assert(Event); - ze_result_t ZeResult; - do { - ZeResult = - ZE_CALL_NOCHECK(zeEventHostSynchronize(Event->ZeEvent, 10000)); - } while (ZeResult == ZE_RESULT_NOT_READY); - - // Call the callback. - PFnNotify(Event, CommandExecCallbackType, UserData); - piEventRelease(Event); - }, - Event, CommandExecCallbackType, PFnNotify, UserData); - - WaitThread.detach(); + die("piEventSetCallback: deprecated, to be removed"); return PI_SUCCESS; } pi_result piEventSetStatus(pi_event Event, pi_int32 ExecutionStatus) { - if (ExecutionStatus != CL_COMPLETE) { - die("piEventSetStatus: not implemented"); - } - - assert(Event); - ze_result_t ZeResult; - ze_event_handle_t ZeEvent = Event->ZeEvent; - - ZeResult = ZE_CALL_NOCHECK(zeEventQueryStatus(ZeEvent)); - // It can be that the status is already what we need it to be. 
- if (ZeResult != ZE_RESULT_SUCCESS) { - ZE_CALL(zeEventHostSignal(ZeEvent)); - ZE_CALL(zeEventQueryStatus(ZeEvent)); // double check - } + die("piEventSetStatus: deprecated, to be removed"); return PI_SUCCESS; } @@ -2893,7 +2783,6 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, if (Res != PI_SUCCESS) return Res; - piEventCreate(Queue->Context, Event); (*Event)->Queue = Queue; (*Event)->CommandType = CommandType; (*Event)->ZeCommandList = ZeCommandList; From 0fd2f79d11ee6fd23329d0509422cc4e9c21c1b1 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 20 May 2020 13:13:05 -0700 Subject: [PATCH 10/21] Handle bad_alloc and other errors which could happen during new() Signed-off-by: Artur Gainullin --- sycl/plugins/level_zero/pi_level0.cpp | 322 +++++++++++++++++--------- 1 file changed, 213 insertions(+), 109 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index c47957cbd7a59..42869a7cb32bf 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -342,12 +342,16 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList, ze_event_handle_t *_pi_event::createZeEventList(pi_uint32 EventListLength, const pi_event *EventList) { - ze_event_handle_t *ZeEventList = new ze_event_handle_t[EventListLength]; + try { + ze_event_handle_t *ZeEventList = new ze_event_handle_t[EventListLength]; - for (pi_uint32 I = 0; I < EventListLength; I++) { - ZeEventList[I] = EventList[I]->ZeEvent; + for (pi_uint32 I = 0; I < EventListLength; I++) { + ZeEventList[I] = EventList[I]->ZeEvent; + } + return ZeEventList; + } catch (...)
{ + return nullptr; } - return ZeEventList; } void _pi_event::deleteZeEventList(ze_event_handle_t *ZeEventList) { @@ -494,30 +498,36 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, assert(ZeDriverCount == 1); ZE_CALL(zeDriverGet(&ZeDriverCount, &ZeDriver)); - // TODO: figure out how/when to release this memory - *Platforms = new _pi_platform(ZeDriver); - - // Cache driver properties - ze_driver_properties_t ZeDriverProperties; - ZE_CALL(zeDriverGetProperties(ZeDriver, &ZeDriverProperties)); - uint32_t ZeDriverVersion = ZeDriverProperties.driverVersion; - // Intel Level-Zero GPU driver stores version as: - // | 31 - 24 | 23 - 16 | 15 - 0 | - // | Major | Minor | Build | - std::string VersionMajor = - std::to_string((ZeDriverVersion & 0xFF000000) >> 24); - std::string VersionMinor = - std::to_string((ZeDriverVersion & 0x00FF0000) >> 16); - std::string VersionBuild = std::to_string(ZeDriverVersion & 0x0000FFFF); - Platforms[0]->ZeDriverVersion = VersionMajor + std::string(".") + - VersionMinor + std::string(".") + - VersionBuild; - - ze_api_version_t ZeApiVersion; - ZE_CALL(zeDriverGetApiVersion(ZeDriver, &ZeApiVersion)); - Platforms[0]->ZeDriverApiVersion = - std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + std::string(".") + - std::to_string(ZE_MINOR_VERSION(ZeApiVersion)); + try { + // TODO: figure out how/when to release this memory + *Platforms = new _pi_platform(ZeDriver); + + // Cache driver properties + ze_driver_properties_t ZeDriverProperties; + ZE_CALL(zeDriverGetProperties(ZeDriver, &ZeDriverProperties)); + uint32_t ZeDriverVersion = ZeDriverProperties.driverVersion; + // Intel Level-Zero GPU driver stores version as: + // | 31 - 24 | 23 - 16 | 15 - 0 | + // | Major | Minor | Build | + std::string VersionMajor = + std::to_string((ZeDriverVersion & 0xFF000000) >> 24); + std::string VersionMinor = + std::to_string((ZeDriverVersion & 0x00FF0000) >> 16); + std::string VersionBuild = std::to_string(ZeDriverVersion & 0x0000FFFF); + 
Platforms[0]->ZeDriverVersion = VersionMajor + std::string(".") + + VersionMinor + std::string(".") + + VersionBuild; + + ze_api_version_t ZeApiVersion; + ZE_CALL(zeDriverGetApiVersion(ZeDriver, &ZeApiVersion)); + Platforms[0]->ZeDriverApiVersion = + std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + std::string(".") + + std::to_string(ZE_MINOR_VERSION(ZeApiVersion)); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } } if (NumPlatforms) @@ -598,19 +608,25 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, if (NumDevices) *NumDevices = ZeDeviceCount; - // TODO: Delete array at teardown - ze_device_handle_t *ZeDevices = new ze_device_handle_t[ZeDeviceCount]; - ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, ZeDevices)); - - for (uint32_t I = 0; I < ZeDeviceCount; ++I) { - // TODO: add check for device type - if (I < NumEntries) { - Devices[I] = new _pi_device(ZeDevices[I], Platform); - pi_result Result = Devices[I]->initialize(); - if (Result != PI_SUCCESS) { - return Result; + try { + // TODO: Delete array at teardown + ze_device_handle_t *ZeDevices = new ze_device_handle_t[ZeDeviceCount]; + ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, ZeDevices)); + + for (uint32_t I = 0; I < ZeDeviceCount; ++I) { + // TODO: add check for device type + if (I < NumEntries) { + Devices[I] = new _pi_device(ZeDevices[I], Platform); + pi_result Result = Devices[I]->initialize(); + if (Result != PI_SUCCESS) { + return Result; + } } } + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; } return PI_SUCCESS; } @@ -652,8 +668,17 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, ZE_CALL(zeDeviceGetMemoryProperties(ZeDevice, &ZeAvailMemCount, nullptr)); // Confirm at least one memory is available in the device assert(ZeAvailMemCount > 0); - ze_device_memory_properties_t *ZeDeviceMemoryProperties = - new ze_device_memory_properties_t[ZeAvailMemCount](); + + ze_device_memory_properties_t *ZeDeviceMemoryProperties; + try { + ZeDeviceMemoryProperties = + new ze_device_memory_properties_t[ZeAvailMemCount](); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + for (uint32_t I = 0; I < ZeAvailMemCount; I++) { ZeDeviceMemoryProperties[I].version = ZE_DEVICE_MEMORY_PROPERTIES_VERSION_CURRENT; @@ -1188,20 +1213,26 @@ pi_result piDevicePartition(pi_device Device, return PI_SUCCESS; } - auto ZeSubdevices = new ze_device_handle_t[Count]; - ZE_CALL(zeDeviceGetSubDevices(Device->ZeDevice, &Count, ZeSubdevices)); + try { + auto ZeSubdevices = new ze_device_handle_t[Count]; + ZE_CALL(zeDeviceGetSubDevices(Device->ZeDevice, &Count, ZeSubdevices)); - // Wrap the L0 sub-devices into PI sub-devices, and write them out. - for (uint32_t I = 0; I < Count; ++I) { - OutDevices[I] = new _pi_device(ZeSubdevices[I], Device->Platform, - true /* isSubDevice */); - pi_result Result = OutDevices[I]->initialize(); - if (Result != PI_SUCCESS) { - delete[] ZeSubdevices; - return Result; + // Wrap the L0 sub-devices into PI sub-devices, and write them out. + for (uint32_t I = 0; I < Count; ++I) { + OutDevices[I] = new _pi_device(ZeSubdevices[I], Device->Platform, + true /* isSubDevice */); + pi_result Result = OutDevices[I]->initialize(); + if (Result != PI_SUCCESS) { + delete[] ZeSubdevices; + return Result; + } } + delete[] ZeSubdevices; + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; } - delete[] ZeSubdevices; return PI_SUCCESS; } @@ -1261,7 +1292,13 @@ pi_result piContextCreate(const pi_context_properties *Properties, assert(Devices); assert(RetContext); - *RetContext = new _pi_context(*Devices); + try { + *RetContext = new _pi_context(*Devices); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } @@ -1352,7 +1389,13 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, &ZeCommandQueue)); assert(Queue); - *Queue = new _pi_queue(ZeCommandQueue, Context); + try { + *Queue = new _pi_queue(ZeCommandQueue, Context); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } @@ -1455,9 +1498,15 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, auto HostPtrOrNull = (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast(HostPtr) : nullptr; - *RetMem = new _pi_buffer(Context->Device->Platform, - pi_cast(Ptr) /* L0 Memory Handle */, - HostPtrOrNull); + try { + *RetMem = new _pi_buffer(Context->Device->Platform, + pi_cast(Ptr) /* L0 Memory Handle */, + HostPtrOrNull); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } @@ -1630,22 +1679,29 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, auto HostPtrOrNull = (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? 
pi_cast(HostPtr) : nullptr; - auto ZePIImage = - new _pi_image(Context->Device->Platform, ZeHImage, HostPtrOrNull); + + try { + auto ZePIImage = + new _pi_image(Context->Device->Platform, ZeHImage, HostPtrOrNull); #ifndef NDEBUG - ZePIImage->ZeImageDesc = ZeImageDesc; + ZePIImage->ZeImageDesc = ZeImageDesc; #endif // !NDEBUG - if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 || - (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { - // Initialize image synchronously with immediate offload - ZE_CALL(zeCommandListAppendImageCopyFromMemory( - Context->Device->ZeCommandListInit, ZeHImage, HostPtr, nullptr, - nullptr)); - } + if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 || + (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { + // Initialize image synchronously with immediate offload + ZE_CALL(zeCommandListAppendImageCopyFromMemory( + Context->Device->ZeCommandListInit, ZeHImage, HostPtr, nullptr, + nullptr)); + } - *RetImage = ZePIImage; + *RetImage = ZePIImage; + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } @@ -1678,8 +1734,14 @@ pi_result piProgramCreate(pi_context Context, const void *IL, size_t Length, ZE_CALL(zeModuleCreate(ZeDevice, &ZeModuleDesc, &ZeModule, 0)); // TODO: handle build log - auto ZePiProgram = new _pi_program(ZeModule, Context); - *Program = pi_cast(ZePiProgram); + try { + auto ZePiProgram = new _pi_program(ZeModule, Context); + *Program = pi_cast(ZePiProgram); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } @@ -1713,8 +1775,14 @@ pi_result piclProgramCreateWithBinary(pi_context Context, pi_uint32 NumDevices, ze_module_handle_t ZeModule; ZE_CALL(zeModuleCreate(ZeDevice, &ZeModuleDesc, &ZeModule, 0)); - auto ZePiProgram = new _pi_program(ZeModule, Context); - *RetProgram = pi_cast(ZePiProgram); + try { + auto ZePiProgram = new _pi_program(ZeModule, Context); + *RetProgram = pi_cast(ZePiProgram); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } if (BinaryStatus) { *BinaryStatus = PI_SUCCESS; @@ -1766,24 +1834,29 @@ pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName, SET_PARAM_VALUE(size_t{NumKernels}); break; } - case PI_PROGRAM_INFO_KERNEL_NAMES: { - // There are extra allocations/copying here dictated by the difference - // in L0 and PI interfaces. - // - uint32_t Count = 0; - ZE_CALL(zeModuleGetKernelNames(Program->ZeModule, &Count, nullptr)); - char **PNames = new char *[Count]; - ZE_CALL(zeModuleGetKernelNames(Program->ZeModule, &Count, - const_cast(PNames))); - std::string PINames{""}; - for (uint32_t I = 0; I < Count; ++I) { - PINames += (I > 0 ? ";" : ""); - PINames += PNames[I]; + case PI_PROGRAM_INFO_KERNEL_NAMES: + try { + // There are extra allocations/copying here dictated by the difference + // in L0 and PI interfaces. + // + uint32_t Count = 0; + ZE_CALL(zeModuleGetKernelNames(Program->ZeModule, &Count, nullptr)); + char **PNames = new char *[Count]; + ZE_CALL(zeModuleGetKernelNames(Program->ZeModule, &Count, + const_cast(PNames))); + std::string PINames{""}; + for (uint32_t I = 0; I < Count; ++I) { + PINames += (I > 0 ? ";" : ""); + PINames += PNames[I]; + } + delete[] PNames; + SET_PARAM_VALUE_STR(PINames.c_str()); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; } - delete[] PNames; - SET_PARAM_VALUE_STR(PINames.c_str()); break; - } default: die("piProgramGetInfo: not implemented"); } @@ -1910,8 +1983,14 @@ pi_result piKernelCreate(pi_program Program, const char *KernelName, ZE_CALL(zeKernelCreate(pi_cast(Program->ZeModule), &ZeKernelDesc, &ZeKernel)); - auto ZePiKernel = new _pi_kernel(ZeKernel, Program); - *RetKernel = pi_cast(ZePiKernel); + try { + auto ZePiKernel = new _pi_kernel(ZeKernel, Program); + *RetKernel = pi_cast(ZePiKernel); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } @@ -1982,17 +2061,23 @@ pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName, case PI_KERNEL_INFO_REFERENCE_COUNT: SET_PARAM_VALUE(pi_uint32{Kernel->RefCount}); break; - case PI_KERNEL_INFO_ATTRIBUTES: { - uint32_t Size; - ZE_CALL(zeKernelGetAttribute( - Kernel->ZeKernel, ZE_KERNEL_ATTR_SOURCE_ATTRIBUTE, &Size, nullptr)); - char *attributes = new char[Size]; - ZE_CALL(zeKernelGetAttribute( - Kernel->ZeKernel, ZE_KERNEL_ATTR_SOURCE_ATTRIBUTE, &Size, attributes)); - SET_PARAM_VALUE_STR(attributes); - delete[] attributes; + case PI_KERNEL_INFO_ATTRIBUTES: + try { + uint32_t Size; + ZE_CALL(zeKernelGetAttribute( + Kernel->ZeKernel, ZE_KERNEL_ATTR_SOURCE_ATTRIBUTE, &Size, nullptr)); + char *attributes = new char[Size]; + ZE_CALL(zeKernelGetAttribute(Kernel->ZeKernel, + ZE_KERNEL_ATTR_SOURCE_ATTRIBUTE, &Size, + attributes)); + SET_PARAM_VALUE_STR(attributes); + delete[] attributes; + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } break; - } default: zePrint("Unsupported ParamName in piKernelGetInfo: ParamName=%d(0x%x)\n", ParamName, ParamName); @@ -2221,8 +2306,14 @@ pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { ZE_CALL(zeEventCreate(ZeEventPool, &ZeEventDesc, &ZeEvent)); - *RetEvent = - new _pi_event(ZeEvent, ZeEventPool, Context, PI_COMMAND_TYPE_USER); + try { + *RetEvent = + new _pi_event(ZeEvent, ZeEventPool, Context, PI_COMMAND_TYPE_USER); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } @@ -2493,7 +2584,13 @@ pi_result piSamplerCreate(pi_context Context, &ZeSamplerDesc, // TODO: translate properties &ZeSampler)); - *RetSampler = new _pi_sampler(ZeSampler); + try { + *RetSampler = new _pi_sampler(ZeSampler); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) { + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } @@ -3202,12 +3299,19 @@ pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, auto Region = (pi_buffer_region)BufferCreateInfo; assert(Region->size != 0u && "Invalid size"); assert(Region->origin <= (Region->origin + Region->size) && "Overflow"); - *RetMem = new _pi_buffer( - Buffer->Platform, - pi_cast(Buffer->getZeHandle()) + - Region->origin /* L0 memory handle */, - nullptr /* Host pointer */, Buffer /* Parent buffer */, - Region->origin /* Sub-buffer origin */, Region->size /*Sub-buffer size*/); + try { + *RetMem = + new _pi_buffer(Buffer->Platform, + pi_cast(Buffer->getZeHandle()) + + Region->origin /* L0 memory handle */, + nullptr /* Host pointer */, Buffer /* Parent buffer */, + Region->origin /* Sub-buffer origin */, + Region->size /*Sub-buffer size*/); + } catch (const std::bad_alloc &) { + return PI_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } return PI_SUCCESS; } From e8722f54b97d439b567f81f6e1f37c962a2a0d4a Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 20 May 2020 14:03:53 -0700 Subject: [PATCH 11/21] Address review comments * Added license header to pi_level0.cpp/pi_level0.hpp * use unordered_map instead of map where possible * Refactor _pi_mem::addMapping according to suggestion * Add sanity checks to piProgramCompile and piProgramBuild * Narrowed extern "C" to exported functions. * Other minor fixes Signed-off-by: Artur Gainullin --- sycl/plugins/level_zero/pi_level0.cpp | 79 ++++++++++++++++----------- sycl/plugins/level_zero/pi_level0.hpp | 32 +++++++++-- 2 files changed, 76 insertions(+), 35 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index 42869a7cb32bf..4c3f44d5ef71b 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -1,6 +1,21 @@ +//==---------- pi_level0.cpp - Level Zero Plugin +//-----------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/// \file pi_level0.cpp +/// Implementation of Level Zero Plugin. +/// +/// \ingroup sycl_pi_level0 + #include "pi_level0.hpp" #include #include +#include #include #include #include @@ -8,7 +23,7 @@ #include -// Controls L0 calls serialization to w/a of L0 driver being not MT ready. +// Controls L0 calls serialization to w/a L0 driver being not MT ready.
// Recognized values (can be used as a bit mask): enum { ZeSerializeNone = @@ -17,7 +32,7 @@ enum { ZeSerializeBlock = 2, // blocking ZE calls, where supported (usually in enqueue commands) }; -pi_uint32 ZeSerialize = 0; +static pi_uint32 ZeSerialize = 0; // This class encapsulates actions taken along with a call to L0 API. class ZeCall { @@ -49,7 +64,7 @@ class ZeCall { std::mutex ZeCall::GlobalLock; // Controls L0 calls tracing in zePrint. -bool ZeDebug = false; +static bool ZeDebug = false; static void zePrint(const char *Format, ...) { if (ZeDebug) { @@ -65,12 +80,10 @@ static void zePrint(const char *Format, ...) { pi_result _pi_mem::addMapping(void *MappedTo, size_t Offset, size_t Size) { std::lock_guard Lock(MappingsMutex); - auto It = Mappings.find(MappedTo); - if (It != Mappings.end()) { + auto Res = Mappings.insert({MappedTo, {Offset, Size}}); + if (Res.second) { zePrint("piEnqueueMemBufferMap: duplicate mapping detected\n"); - return PI_INVALID_OPERATION; - } else { - Mappings.insert({MappedTo, {Offset, Size}}); + return PI_INVALID_VALUE; } return PI_SUCCESS; } @@ -91,8 +104,8 @@ ze_result_t _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool, size_t &Index) { // Maximum number of events that can be present in an event ZePool is captured - // here Setting it to 256 gave best possible performance for several - // benchmarks + // here. Setting it to 256 gave best possible performance for several + // benchmarks. static const char *MaxNumEventsPerPoolEnv = std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); static const pi_uint32 MaxNumEventsPerPool = @@ -146,14 +159,6 @@ _pi_context::decrementAliveEventsInPool(ze_event_pool_handle_t ZePool) { return ZE_RESULT_SUCCESS; } -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -#include -#include -#include - // Some opencl extensions we know are supported by all Level0 devices. 
#define ZE_SUPPORTED_EXTENSIONS \ "cl_khr_il_program cl_khr_subgroups cl_intel_subgroups " \ @@ -358,9 +363,6 @@ void _pi_event::deleteZeEventList(ze_event_handle_t *ZeEventList) { delete[] ZeEventList; } -// Forward declararitons -decltype(piEventCreate) piEventCreate; - // No generic lambdas in C++11, so use this convinence macro. // NOTE: to be used in API returning "ParamValue". // NOTE: memset is used to clear all bytes in the memory allocated by SYCL RT @@ -430,6 +432,11 @@ static void piSignalHandler(int SigNum) { #define __FINALLY() } #endif // _WIN32 +extern "C" { + +// Forward declararitons +decltype(piEventCreate) piEventCreate; + pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { @@ -1897,6 +1904,8 @@ pi_result piProgramCompile( // and so L0 module creation would be deferred until // piProgramCompile/piProgramLink/piProgramBuild. // + // It is expected that program was successfully built during piProgramCreate + assert(Program && Program->ZeModule); return PI_SUCCESS; } @@ -1912,6 +1921,8 @@ pi_result piProgramBuild(pi_program Program, pi_uint32 NumDevices, // and so L0 module creation would be deferred until // piProgramCompile/piProgramLink/piProgramBuild. 
// + // It is expected that program was successfully built during piProgramCreate + assert(Program && Program->ZeModule); return PI_SUCCESS; } @@ -2191,19 +2202,15 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const pi_event *EventWaitList, pi_event *Event) { assert(Kernel); assert(Queue); - assert(WorkDim > 0); - assert(WorkDim < 4); + assert((WorkDim > 0) && (WorkDim < 4)); ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; uint32_t WG[3]; // global_work_size of unused dimensions must be set to 1 - if (WorkDim < 3) { - assert(GlobalWorkSize[2] == 1); - } - if (WorkDim < 2) { - assert(GlobalWorkSize[1] == 1); - } + assert(WorkDim == 3 || GlobalWorkSize[2] == 1); + assert(WorkDim >= 2 || GlobalWorkSize[1] == 1); + if (LocalWorkSize) { WG[0] = pi_cast(LocalWorkSize[0]); WG[1] = pi_cast(LocalWorkSize[1]); @@ -2655,6 +2662,8 @@ pi_result piEnqueueMemBufferReadRect( BlockingRead, NumEventsInWaitList, EventWaitList, Event); } +} // extern "C" + // Shared by all memory read/write/copy PI interfaces. 
static pi_result enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, @@ -2797,6 +2806,8 @@ static pi_result enqueueMemCopyRectHelper( return PI_SUCCESS; } +extern "C" { + pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, size_t Offset, size_t Size, const void *Ptr, @@ -2864,6 +2875,8 @@ piEnqueueMemBufferCopyRect(pi_queue Queue, pi_mem SrcBuffer, pi_mem DstBuffer, NumEventsInWaitList, EventWaitList, Event); } +} // extern "C" + static pi_result enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, const void *Pattern, size_t PatternSize, size_t Size, @@ -2917,6 +2930,8 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, return PI_SUCCESS; } +extern "C" { + pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, const void *Pattern, size_t PatternSize, size_t Offset, size_t Size, @@ -3072,6 +3087,8 @@ pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, return {}; } +} // extern "C" + static ze_image_region_t getImageRegionHelper(pi_mem Mem, const size_t *Origin, const size_t *Region) { @@ -3220,6 +3237,8 @@ enqueueMemImageCommandHelper(pi_command_type CommandType, pi_queue Queue, return PI_SUCCESS; } +extern "C" { + pi_result piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, pi_bool BlockingRead, const size_t *Origin, const size_t *Region, size_t RowPitch, @@ -3711,6 +3730,4 @@ pi_result piPluginInit(pi_plugin *PluginInit) { return PI_SUCCESS; } -#ifdef __cplusplus } // extern "C" -#endif // __cplusplus diff --git a/sycl/plugins/level_zero/pi_level0.hpp b/sycl/plugins/level_zero/pi_level0.hpp index e8f81c01467b4..5d469fb373dcb 100755 --- a/sycl/plugins/level_zero/pi_level0.hpp +++ b/sycl/plugins/level_zero/pi_level0.hpp @@ -1,9 +1,29 @@ +//===-- pi_level0.hpp - Level Zero Plugin -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/// \defgroup sycl_pi_level0 Level Zero Plugin +/// \ingroup sycl_pi + +/// \file pi_level0.hpp +/// Declarations for Level Zero Plugin. It is the interface between the +/// device-agnostic SYCL runtime layer and underlying Level Zero runtime. +/// +/// \ingroup sycl_pi_level0 + +#ifndef PI_LEVEL0_HPP +#define PI_LEVEL0_HPP + #include #include #include #include -#include #include +#include #include @@ -123,12 +143,14 @@ struct _pi_context { ze_event_pool_handle_t ZeEventPool; // This map will be used to determine if a pool is full or not // by storing number of empty slots available in the pool - std::map NumEventsAvailableInEventPool; + std::unordered_map + NumEventsAvailableInEventPool; // This map will be used to determine number of live events in the pool // We use separate maps for number of event slots available in the pool // number of events live in the pool live // This will help when we try to make the code thread-safe - std::map NumEventsLiveInEventPool; + std::unordered_map + NumEventsLiveInEventPool; // TODO: we'd like to create a thread safe map class instead of mutex + map, // that must be carefully used together. @@ -207,7 +229,7 @@ struct _pi_mem { // The key is the host pointer representing an active mapping. // The value is the information needed to maintain/undo the mapping. // - std::map Mappings; + std::unordered_map Mappings; // TODO: we'd like to create a thread safe map class instead of mutex + map, // that must be carefully used together. @@ -343,3 +365,5 @@ struct _pi_sampler { // Must be atomic to prevent data race when incrementing/decrementing. 
std::atomic RefCount; }; + +#endif //PI_LEVEL0_HPP From 0a0aabf1c5b157e4520d2a12c140a4605cba3145 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 20 May 2020 15:49:07 -0700 Subject: [PATCH 12/21] Fix formatting problem Signed-off-by: Artur Gainullin --- sycl/plugins/level_zero/pi_level0.cpp | 3 +-- sycl/plugins/level_zero/pi_level0.hpp | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index 4c3f44d5ef71b..ac16f5d079c19 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -1,5 +1,4 @@ -//==---------- pi_level0.cpp - Level Zero Plugin -//-----------------------------------==// +//===----------- pi_level0.cpp - Level Zero Plugin--------------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/sycl/plugins/level_zero/pi_level0.hpp b/sycl/plugins/level_zero/pi_level0.hpp index 5d469fb373dcb..04e23359b9f19 100755 --- a/sycl/plugins/level_zero/pi_level0.hpp +++ b/sycl/plugins/level_zero/pi_level0.hpp @@ -1,4 +1,4 @@ -//===-- pi_level0.hpp - Level Zero Plugin -----------------------------------------===// +//===---------- pi_level0.hpp - Level Zero Plugin -------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -366,4 +366,4 @@ struct _pi_sampler { std::atomic RefCount; }; -#endif //PI_LEVEL0_HPP +#endif // PI_LEVEL0_HPP From 9db2a3b29fb460558286ff915034beb98714f9f3 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 21 May 2020 00:05:40 -0700 Subject: [PATCH 13/21] Address comments * Remove unnecessary empty commented lines. * Create _pi_object base class for pi classes to store common data like reference counter. 
* Replace ZE_SUPPORTED_EXTENSIONS macro with constexpr * Rename "checkThis" method to more informative "doCall" in ZeCall class and remove unnecessary static method from this class. * Check that ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL is not zero * Use std::call_once to call zeInit only once and to guarantee that current SIGSEGV signal handling using signal() will be correct (which works only for one time signal emit). * Add comments to _pi_mem interface * Add missing dots in the comments with multiple sentences. * Other minor fixes according to suggestion. Signed-off-by: Artur Gainullin --- sycl/include/CL/sycl/backend_types.hpp | 2 +- sycl/plugins/level_zero/pi_level0.cpp | 116 ++++++++---------------- sycl/plugins/level_zero/pi_level0.hpp | 117 ++++++++++--------------- sycl/source/detail/pi.cpp | 10 +-- 4 files changed, 88 insertions(+), 157 deletions(-) diff --git a/sycl/include/CL/sycl/backend_types.hpp b/sycl/include/CL/sycl/backend_types.hpp index 9411cc982393c..5de7b87979043 100644 --- a/sycl/include/CL/sycl/backend_types.hpp +++ b/sycl/include/CL/sycl/backend_types.hpp @@ -13,7 +13,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -enum class backend { host, opencl, level0, cuda }; +enum class backend : char { host, opencl, level0, cuda }; template struct interop; diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index ac16f5d079c19..7f7997885aa3c 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -51,14 +51,9 @@ class ZeCall { } } - static ze_result_t check(ze_result_t ZeResult, const char *CallStr, - bool TraceError = true); - // The non-static version just calls static one.
- ze_result_t checkThis(ze_result_t ZeResult, const char *CallStr, - bool TraceError = true) { - return ZeCall::check(ZeResult, CallStr, TraceError); - } + ze_result_t doCall(ze_result_t ZeResult, const char *CallStr, + bool TraceError = true); }; std::mutex ZeCall::GlobalLock; @@ -108,7 +103,13 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool, static const char *MaxNumEventsPerPoolEnv = std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); static const pi_uint32 MaxNumEventsPerPool = - (MaxNumEventsPerPoolEnv) ? std::atoi(MaxNumEventsPerPoolEnv) : 256; + MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; + + if (MaxNumEventsPerPool == 0) { + zePrint("Zero size can't be specified in the " + "ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL\n"); + return ZE_RESULT_ERROR_INVALID_SIZE; + } Index = 0; // Create one event ZePool per MaxNumEventsPerPool events @@ -159,9 +160,9 @@ _pi_context::decrementAliveEventsInPool(ze_event_pool_handle_t ZePool) { } // Some opencl extensions we know are supported by all Level0 devices. 
-#define ZE_SUPPORTED_EXTENSIONS \ - "cl_khr_il_program cl_khr_subgroups cl_intel_subgroups " \ - "cl_intel_subgroups_short cl_intel_required_subgroup_size " +constexpr char ZE_SUPPORTED_EXTENSIONS[] = + "cl_khr_il_program cl_khr_subgroups cl_intel_subgroups " + "cl_intel_subgroups_short cl_intel_required_subgroup_size "; // Map L0 runtime error code to PI error code static pi_result mapError(ze_result_t ZeResult) { @@ -272,8 +273,8 @@ inline void zeParseError(ze_result_t ZeError, std::string &ErrorString) { } // switch } -ze_result_t ZeCall::check(ze_result_t ZeResult, const char *CallStr, - bool TraceError) { +ze_result_t ZeCall::doCall(ze_result_t ZeResult, const char *CallStr, + bool TraceError) { zePrint("ZE ---> %s\n", CallStr); if (ZeResult && TraceError) { @@ -285,9 +286,9 @@ ze_result_t ZeCall::check(ze_result_t ZeResult, const char *CallStr, } #define ZE_CALL(Call) \ - if (auto Result = ZeCall().checkThis(Call, #Call, true)) \ + if (auto Result = ZeCall().doCall(Call, #Call, true)) \ return mapError(Result); -#define ZE_CALL_NOCHECK(Call) ZeCall().checkThis(Call, #Call, false) +#define ZE_CALL_NOCHECK(Call) ZeCall().doCall(Call, #Call, false) pi_result _pi_device::initialize() { // Create the immediate command list to be used for initializations @@ -316,13 +317,11 @@ _pi_device::createCommandList(ze_command_list_handle_t *ZeCommandList) { // // TODO: Figure out how to lower the overhead of creating a new list // for each PI command, if that appears to be important. - // ze_command_list_desc_t ZeCommandListDesc = {}; ZeCommandListDesc.version = ZE_COMMAND_LIST_DESC_VERSION_CURRENT; // TODO: can we just reset the command-list created when an earlier // command was submitted to the queue? - // ZE_CALL(zeCommandListCreate(ZeDevice, &ZeCommandListDesc, ZeCommandList)); return PI_SUCCESS; @@ -401,7 +400,6 @@ void _pi_event::deleteZeEventList(ze_event_handle_t *ZeEventList) { // Recover from Linux SIGSEGV signal. 
// We can't reliably catch C++ exceptions thrown from signal // handler so use setjmp/longjmp. -// #include #include jmp_buf ReturnHere; @@ -436,6 +434,8 @@ extern "C" { // Forward declararitons decltype(piEventCreate) piEventCreate; +std::once_flag OnceFlag; + pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { @@ -458,20 +458,20 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, ze_result_t ZeResult; // This is a good time to initialize L0. // We can still safely recover if something goes wrong during the init. - // - // NOTE: for some reason only first segfault is reliably handled, - // so remember it, and avoid calling zeInit again. - // - // TODO: we should not call zeInit multiples times ever, so - // this code should be changed. - // - static bool SegFault = false; __TRY() { - ZeResult = SegFault ? ZE_RESULT_ERROR_UNINITIALIZED - : ZE_CALL_NOCHECK(zeInit(ZE_INIT_FLAG_NONE)); + // We should not call zeInit multiples times ever. + try { + std::call_once(OnceFlag, [&ZeResult]() { + ZeResult = ZE_CALL_NOCHECK(zeInit(ZE_INIT_FLAG_NONE)); + }); + } catch (std::system_error &err) { + // if any condition prevents calls to call_once from executing as + // specified + ZeResult = ZE_RESULT_ERROR_UNINITIALIZED; + } } __CATCH() { - SegFault = true; + // SegFault = true; zePrint("L0 raised segfault: assume no Platforms\n"); ZeResult = ZE_RESULT_ERROR_UNINITIALIZED; } @@ -484,13 +484,13 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, return PI_SUCCESS; } - if (auto Res = ZeCall::check(ZeResult, "zeInit")) { - return mapError(Res); + if (ZeResult != ZE_RESULT_SUCCESS) { + zePrint("zeInit: Level Zero initialization failure\n"); + return mapError(ZeResult); } // L0 does not have concept of Platforms, but L0 driver is the // closest match. 
- // if (Platforms && NumEntries > 0) { uint32_t ZeDriverCount = 0; ZE_CALL(zeDriverGet(&ZeDriverCount, nullptr)); @@ -583,7 +583,6 @@ pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, // From OpenCL 2.1: "This version string has the following format: // OpenCL. Follow the same notation here. - // SET_PARAM_VALUE_STR(Platform->ZeDriverApiVersion.c_str()); break; default: @@ -620,7 +619,6 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, ZeDevices)); for (uint32_t I = 0; I < ZeDeviceCount; ++I) { - // TODO: add check for device type if (I < NumEntries) { Devices[I] = new _pi_device(ZeDevices[I], Platform); pi_result Result = Devices[I]->initialize(); @@ -652,7 +650,6 @@ pi_result piDeviceRelease(pi_device Device) { // TODO: OpenCL says root-device ref-count remains unchanged (1), // but when would we free the device's data? - // if (--(Device->RefCount) == 0) { // Destroy the command list used for initializations ZE_CALL(zeCommandListDestroy(Device->ZeCommandListInit)); @@ -691,7 +688,6 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, } // TODO: cache various device properties in the PI device object, // and initialize them only upon they are first requested. - // ZE_CALL(zeDeviceGetMemoryProperties(ZeDevice, &ZeAvailMemCount, ZeDeviceMemoryProperties)); @@ -736,17 +732,14 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, // TODO: Use proper mechanism to get this information from Level0 after // it is added to Level0. // Hardcoding the few we know are supported by the current hardware. - // - // std::string SupportedExtensions; // cl_khr_il_program - OpenCL 2.0 KHR extension for SPIRV support. Core // feature in >OpenCL 2.1 // cl_khr_subgroups - Extension adds support for implementation-controlled // subgroups. - // cl_intel_subgroups - Extension adds subgroup features, defined by - // Intel. 
cl_intel_subgroups_short - Extension adds subgroup functions - // described in + // cl_intel_subgroups - Extension adds subgroup features, defined by Intel. + // cl_intel_subgroups_short - Extension adds subgroup functions described in // the cl_intel_subgroups extension to support 16-bit integer data types // for performance. // cl_intel_required_subgroup_size - Extension to allow programmers to @@ -869,13 +862,11 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, SET_PARAM_VALUE(pi_uint32{Device->RefCount}); break; case PI_DEVICE_INFO_PARTITION_PROPERTIES: { - // // It is debatable if SYCL sub-device and partitioning APIs sufficient to // expose Level0 sub-devices? We start with support of // "partition_by_affinity_domain" and "numa" but if that doesn't seem to // be a good fit we could look at adding a more descriptive partitioning // type. - // struct { pi_device_partition_property Arr[2]; } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, 0}}; @@ -962,7 +953,6 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, // be for "alignment requirement (in bits) for sub-buffer offsets." // An OpenCL implementation returns 8*128, but L0 can do just 8, // meaning unaligned access for values of types larger than 8 bits. - // SET_PARAM_VALUE(pi_uint32{8}); break; case PI_DEVICE_INFO_MAX_SAMPLERS: @@ -1052,31 +1042,26 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: // Until L0 provides needed info, hardcode default minimum values required // by the SYCL specification. - // SET_PARAM_VALUE(size_t{8192}); break; case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: // Until L0 provides needed info, hardcode default minimum values required // by the SYCL specification. - // SET_PARAM_VALUE(size_t{8192}); break; case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: // Until L0 provides needed info, hardcode default minimum values required // by the SYCL specification. 
- // SET_PARAM_VALUE(size_t{2048}); break; case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: // Until L0 provides needed info, hardcode default minimum values required // by the SYCL specification. - // SET_PARAM_VALUE(size_t{2048}); break; case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: // Until L0 provides needed info, hardcode default minimum values required // by the SYCL specification. - // SET_PARAM_VALUE(size_t{2048}); break; case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: @@ -1085,10 +1070,8 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: SET_PARAM_VALUE(size_t{ZeDeviceImageProperties.maxImageArraySlices}); break; - // // Handle SIMD widths. // TODO: can we do better than this? - // case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 1); @@ -1118,9 +1101,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 2); break; case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Max_num_sub_Groups = - // maxTotalGroupSize/min(set - // of subGroupSizes); + // Max_num_sub_Groups = maxTotalGroupSize/min(set of subGroupSizes); uint32_t MinSubGroupSize = Device->ZeDeviceComputeProperties.subGroupSizes[0]; for (uint32_t I = 1; I < Device->ZeDeviceComputeProperties.numSubGroupSizes; @@ -1169,8 +1150,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { pi_uint64 Supported = 0; if (Device->ZeDeviceProperties.unifiedMemorySupported) { - // TODO: Use - // ze_memory_access_capabilities_t + // TODO: Use ze_memory_access_capabilities_t Supported = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; } @@ -1248,8 +1228,8 @@ piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context? 
pi_uint32 *SelectedBinaryInd) { // TODO dummy implementation. - // Real implementaion will use the same mechanism OpenCL ICD dispatcher - // uses. Somthing like: + // Real implementation will use the same mechanism OpenCL ICD dispatcher + // uses. Something like: // PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_INVALID_CONTEXT); // return context->dispatch->piextDeviceSelectIR( // ctx, images, num_images, selected_image); @@ -1289,7 +1269,6 @@ pi_result piContextCreate(const pi_context_properties *Properties, // L0 does not have notion of contexts. // Return the device handle (only single device is allowed) as a context // handle. - // if (NumDevices != 1) { zePrint("piCreateContext: context should have exactly one Device\n"); return PI_INVALID_VALUE; @@ -1844,7 +1823,6 @@ pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName, try { // There are extra allocations/copying here dictated by the difference // in L0 and PI interfaces. - // uint32_t Count = 0; ZE_CALL(zeModuleGetKernelNames(Program->ZeModule, &Count, nullptr)); char **PNames = new char *[Count]; @@ -1883,7 +1861,6 @@ pi_result piProgramLink(pi_context Context, pi_uint32 NumDevices, // is that this would mean moving zeModuleCreate here entirely, // and so L0 module creation would be deferred until // piProgramCompile/piProgramLink/piProgramBuild. - // assert(NumInputPrograms == 1 && InputPrograms); assert(RetProgram); *RetProgram = InputPrograms[0]; @@ -1934,7 +1911,6 @@ pi_result piProgramGetBuildInfo(pi_program Program, pi_device Device, // TODO: is this the only supported binary type in L0? // We should probably return CL_PROGRAM_BINARY_TYPE_NONE if asked // before the program was compiled. - // SET_PARAM_VALUE(cl_program_binary_type{CL_PROGRAM_BINARY_TYPE_EXECUTABLE}); } else if (ParamName == CL_PROGRAM_BUILD_OPTIONS) { // TODO: how to get module build options out of L0? 
@@ -1942,7 +1918,6 @@ pi_result piProgramGetBuildInfo(pi_program Program, pi_device Device, // passed with piProgramCompile/piProgramBuild, but what can we // return for programs that were built outside and registered // with piProgramRegister? - // SET_PARAM_VALUE_STR(""); } else { zePrint("piProgramGetBuildInfo: unsupported ParamName\n"); @@ -2014,7 +1989,6 @@ pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize, // We don't know the type of the argument but it seems that the only time // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL. - // if (ArgSize == sizeof(void *) && ArgValue && *(void **)(const_cast(ArgValue)) == nullptr) { ArgValue = nullptr; @@ -2036,7 +2010,6 @@ pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, // extracting native PI object from PI handle, and have SYCL // RT pass that directly to the regular piKernelSetArg (and // then remove this piextKernelSetArgMemObj). - // assert(Kernel); ZE_CALL( @@ -2408,7 +2381,6 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { // TODO: Using UINT32_MAX for timeout should have the desired // effect of waiting until the event is trigerred, but it seems that // it is causing an OS crash, so use an interruptable loop for now. - // do { ZeResult = ZE_CALL_NOCHECK(zeEventHostSynchronize(ZeEvent, 100000)); } while (ZeResult == ZE_RESULT_NOT_READY); @@ -2418,7 +2390,6 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { // NOTE: we are destroying associated command lists here to free // resources sooner in case RT is not calling piEventRelease soon enough. - // if (EventList[I]->ZeCommandList) { // Event has been signaled: Destroy the command list associated with the // call that generated the event. 
@@ -2454,7 +2425,6 @@ pi_result piEventRelease(pi_event Event) { if (Event->ZeCommandList) { // Destroy the command list associated with the call that generated // the event. - // ZE_CALL(zeCommandListDestroy(Event->ZeCommandList)); Event->ZeCommandList = nullptr; } @@ -2983,7 +2953,6 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap, // TODO: check if the input buffer is already allocated in shared // memory and thus is accessible from the host as is. Can we get SYCL RT // to predict/allocate in shared memory from the beginning? - // if (Buffer->MapHostPtr) { // NOTE: borrowing below semantics from OpenCL as SYCL RT relies on it. // It is also better for performance. @@ -2995,7 +2964,6 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap, // command has completed. // - The pointer value returned by clEnqueueMapBuffer will be derived from // the host_ptr specified when the buffer object is created." - // *RetMap = Buffer->MapHostPtr + Offset; } else { ze_host_mem_alloc_desc_t ZeDesc = {}; @@ -3030,7 +2998,6 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr, // TODO: handle the case when user does not care to follow the event // of unmap completion. - // assert(Event); auto Res = piEventCreate(Queue->Context, Event); @@ -3052,7 +3019,6 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr, // // NOTE: Keep this in sync with the implementation of // piEnqueueMemBufferMap/piEnqueueMemImageMap. - // _pi_mem::Mapping MapInfo = {}; if (pi_result Res = MemObj->removeMapping(MappedPtr, MapInfo)) return Res; @@ -3165,7 +3131,6 @@ enqueueMemImageCommandHelper(pi_command_type CommandType, pi_queue Queue, // TODO: L0 does not support row_pitch/slice_pitch for images yet. // Check that SYCL RT did not want pitch larger than default. 
- // #ifndef NDEBUG assert(SrcMem->isImage()); auto SrcImage = static_cast<_pi_image *>(SrcMem); @@ -3191,7 +3156,6 @@ enqueueMemImageCommandHelper(pi_command_type CommandType, pi_queue Queue, // TODO: L0 does not support row_pitch/slice_pitch for images yet. // Check that SYCL RT did not want pitch larger than default. - // #ifndef NDEBUG assert(DstMem->isImage()); auto DstImage = static_cast<_pi_image *>(DstMem); @@ -3513,7 +3477,6 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, // TODO: L0 does not have a completion "event" with the prefetch API, // so manually add command to signal our event. - // ZE_CALL(zeCommandListAppendSignalEvent(ZeCommandList, (*Event)->ZeEvent)); if (auto Res = Queue->executeCommandList(ZeCommandList, false)) @@ -3592,7 +3555,6 @@ pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, // TODO: L0 does not have a completion "event" with the advise API, // so manually add command to signal our event. - // ZE_CALL(zeCommandListAppendSignalEvent(ZeCommandList, (*Event)->ZeEvent)); Queue->executeCommandList(ZeCommandList, false); @@ -3653,7 +3615,6 @@ pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, case PI_MEM_ALLOC_DEVICE: { // TODO: this wants pi_device, but we didn't remember it, and cannot // deduct from the L0 device. - // die("piextUSMGetMemAllocInfo: PI_MEM_ALLOC_DEVICE not implemented"); break; } @@ -3687,7 +3648,6 @@ pi_result piKernelSetExecInfo(pi_kernel Kernel, pi_kernel_exec_info ParamName, // The whole point for users really was to not need to know anything // about the types of allocations kernel uses. So in DPC++ we always // just set all 3 modes for each kernel. 
- // bool ZeIndirectValue = true; ZE_CALL(zeKernelSetAttribute(Kernel->ZeKernel, ZE_KERNEL_ATTR_INDIRECT_SHARED_ACCESS, diff --git a/sycl/plugins/level_zero/pi_level0.hpp b/sycl/plugins/level_zero/pi_level0.hpp index 04e23359b9f19..ad23384b56724 100755 --- a/sycl/plugins/level_zero/pi_level0.hpp +++ b/sycl/plugins/level_zero/pi_level0.hpp @@ -47,14 +47,22 @@ template <> uint32_t pi_cast(uint64_t Value) { std::terminate(); } +// Base class to store common data +struct _pi_object { + _pi_object() : RefCount{1} {} + + // L0 doesn't do the reference counting, so we have to do. + // Must be atomic to prevent data race when incrementing/decrementing. + std::atomic RefCount; +}; + // Define the types that are opaque in pi.h in a manner suitabale for L0 plugin struct _pi_platform { _pi_platform(ze_driver_handle_t Driver) : ZeDriver{Driver} {} - // L0 lacks the notion of a platform, but thert is a driver, which is a + // L0 lacks the notion of a platform, but there is a driver, which is a // pretty good fit to keep here. - // ze_driver_handle_t ZeDriver; // Cache versions info from zeDriverGetProperties. @@ -62,11 +70,11 @@ struct _pi_platform { std::string ZeDriverApiVersion; }; -struct _pi_device { +struct _pi_device : _pi_object { _pi_device(ze_device_handle_t Device, pi_platform Plt, bool isSubDevice = false) : ZeDevice{Device}, Platform{Plt}, ZeCommandListInit{nullptr}, - IsSubDevice{isSubDevice}, RefCount{1}, ZeDeviceProperties{}, + IsSubDevice{isSubDevice}, ZeDeviceProperties{}, ZeDeviceComputeProperties{} { // NOTE: one must additionally call initialize() to complete // PI device creation. @@ -92,17 +100,11 @@ struct _pi_device { // Indicates if this is a root-device or a sub-device. // Technically this information can be queried from a device handle, but it // seems better to just keep it here. - // bool IsSubDevice; - // L0 doesn't do the reference counting, so we have to do. - // Must be atomic to prevent data race when incrementing/decrementing. 
- std::atomic RefCount; - // Create a new command list for executing on this device. // It's caller's responsibility to remember and destroy the created // command list when no longer needed. - // pi_result createCommandList(ze_command_list_handle_t *ze_command_list); // Cache of the immutable device properties. @@ -110,21 +112,16 @@ struct _pi_device { ze_device_compute_properties_t ZeDeviceComputeProperties; }; -struct _pi_context { +struct _pi_context : _pi_object { _pi_context(pi_device Device) - : Device{Device}, RefCount{1}, ZeEventPool{nullptr}, - NumEventsAvailableInEventPool{}, NumEventsLiveInEventPool{} {} + : Device{Device}, ZeEventPool{nullptr}, NumEventsAvailableInEventPool{}, + NumEventsLiveInEventPool{} {} // L0 does not have notion of contexts. // Keep the device here (must be exactly one) to return it when PI context // is queried for devices. - // pi_device Device; - // L0 doesn't do the reference counting, so we have to do. - // Must be atomic to prevent data race when incrementing/decrementing. - std::atomic RefCount; - // Get index of the free slot in the available pool. If there is no avialble // pool then create new one. ze_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, @@ -138,21 +135,22 @@ struct _pi_context { // Following member variables are used to manage assignment of events // to event pools. // TODO: These variables may be moved to pi_device and pi_platform - // if appropriate - // Event pool to which events are being added to + // if appropriate. + + // Event pool to which events are being added to. ze_event_pool_handle_t ZeEventPool; // This map will be used to determine if a pool is full or not - // by storing number of empty slots available in the pool + // by storing number of empty slots available in the pool. 
std::unordered_map NumEventsAvailableInEventPool; - // This map will be used to determine number of live events in the pool - // We use separate maps for number of event slots available in the pool - // number of events live in the pool live - // This will help when we try to make the code thread-safe + // This map will be used to determine number of live events in the pool. + // We use separate maps for number of event slots available in the pool. + // number of events live in the pool live. + // This will help when we try to make the code thread-safe. std::unordered_map NumEventsLiveInEventPool; - // TODO: we'd like to create a thread safe map class instead of mutex + map, + // TODO: we'd like to create a thread safe map class instead of mutex + map, // that must be carefully used together. // Mutex to control operations on NumEventsAvailableInEventPool map. @@ -162,9 +160,9 @@ struct _pi_context { std::mutex NumEventsLiveInEventPoolMutex; }; -struct _pi_queue { +struct _pi_queue : _pi_object { _pi_queue(ze_command_queue_handle_t Queue, pi_context Context) - : ZeCommandQueue{Queue}, Context{Context}, RefCount{1} {} + : ZeCommandQueue{Queue}, Context{Context} {} // L0 command queue handle. ze_command_queue_handle_t ZeCommandQueue; @@ -172,35 +170,24 @@ struct _pi_queue { // Keeps the PI context to which this queue belongs. pi_context Context; - // L0 doesn't do the reference counting, so we have to do. - // Must be atomic to prevent data race when incrementing/decrementing. - std::atomic RefCount; - // Attach a command list to this queue, close, and execute it. // Note that this command list cannot be appended to after this. // The "is_blocking" tells if the wait for completion is requested. - // pi_result executeCommandList(ze_command_list_handle_t ZeCommandList, bool is_blocking = false); }; -struct _pi_mem { +struct _pi_mem : _pi_object { // Keeps the PI platform of this memory handle. 
pi_platform Platform; // Keeps the host pointer where the buffer will be mapped to, // if created with PI_MEM_FLAGS_HOST_PTR_USE (see // piEnqueueMemBufferMap for details). - // char *MapHostPtr; - // L0 doesn't do the reference counting, so we have to do. - // Must be atomic to prevent data race when incrementing/decrementing. - std::atomic RefCount; - // Supplementary data to keep track of the mappings of this memory // created with piEnqueueMemBufferMap and piEnqueueMemImageMap. - // struct Mapping { // The offset in the buffer giving the start of the mapped region. size_t Offset; @@ -208,27 +195,30 @@ struct _pi_mem { size_t Size; }; - virtual ~_pi_mem() = default; + // Interface of the _pi_mem object - // Interface of the _pi_mem object. + // Get the L0 handle of the current memory object virtual void *getZeHandle() = 0; + // Get a pointer to the L0 handle of the current memory object virtual void *getZeHandlePtr() = 0; + // Method to get type of the derived object (image or buffer) virtual bool isImage() const = 0; + virtual ~_pi_mem() = default; + // Thread-safe methods to work with memory mappings pi_result addMapping(void *MappedTo, size_t Size, size_t Offset); pi_result removeMapping(void *MappedTo, Mapping &MapInfo); protected: _pi_mem(pi_platform Plt, char *HostPtr) - : Platform{Plt}, MapHostPtr{HostPtr}, RefCount{1}, Mappings{} {} + : Platform{Plt}, MapHostPtr{HostPtr}, Mappings{} {} private: // The key is the host pointer representing an active mapping. // The value is the information needed to maintain/undo the mapping. - // std::unordered_map Mappings; // TODO: we'd like to create a thread safe map class instead of mutex + map, @@ -253,7 +243,6 @@ struct _pi_buffer final : _pi_mem { // L0 memory handle is really just a naked pointer. // It is just convenient to have it char * to simplify offset arithmetics. 
- // char *ZeMem; struct { @@ -283,12 +272,11 @@ struct _pi_image final : _pi_mem { ze_image_handle_t ZeImage; }; -struct _pi_event { +struct _pi_event : _pi_object { _pi_event(ze_event_handle_t ZeEvent, ze_event_pool_handle_t ZeEventPool, pi_context Context, pi_command_type CommandType) : ZeEvent{ZeEvent}, ZeEventPool{ZeEventPool}, ZeCommandList{nullptr}, - CommandType{CommandType}, Context{Context}, - CommandData{nullptr}, RefCount{1} {} + CommandType{CommandType}, Context{Context}, CommandData{nullptr} {} // L0 event handle. ze_event_handle_t ZeEvent; @@ -298,72 +286,55 @@ struct _pi_event { // L0 command list where the command signaling this event was appended to. // This is currently used to remember/destroy the command list after // all commands in it are completed, i.e. this event signaled. - // ze_command_list_handle_t ZeCommandList; // Keeps the command-queue and command associated with the event. // These are NULL for the user events. pi_queue Queue; pi_command_type CommandType; - // Provide direct access to Context, instead of going via queue + // Provide direct access to Context, instead of going via queue. // Not every PI event has a queue, and we need a handle to Context - // to get to event pool related information + // to get to event pool related information. pi_context Context; // Opaque data to hold any data needed for CommandType. void *CommandData; - // L0 doesn't do the reference counting, so we have to do. - // Must be atomic to prevent data race when incrementing/decrementing. - std::atomic RefCount; - // Methods for translating PI events list into L0 events list static ze_event_handle_t *createZeEventList(pi_uint32, const pi_event *); static void deleteZeEventList(ze_event_handle_t *); }; -struct _pi_program { +struct _pi_program : _pi_object { _pi_program(ze_module_handle_t Module, pi_context Context) - : ZeModule{Module}, Context{Context}, RefCount{1} {} + : ZeModule{Module}, Context{Context} {} // L0 module handle. 
ze_module_handle_t ZeModule; // Keep the context of the program. pi_context Context; - - // L0 doesn't do the reference counting, so we have to do. - // Must be atomic to prevent data race when incrementing/decrementing. - std::atomic RefCount; }; -struct _pi_kernel { +struct _pi_kernel : _pi_object { _pi_kernel(ze_kernel_handle_t Kernel, pi_program Program) - : ZeKernel{Kernel}, Program{Program}, RefCount{1} {} + : ZeKernel{Kernel}, Program{Program} {} // L0 function handle. ze_kernel_handle_t ZeKernel; // Keep the program of the kernel. pi_program Program; - - // L0 doesn't do the reference counting, so we have to do. - // Must be atomic to prevent data race when incrementing/decrementing. - std::atomic RefCount; }; -struct _pi_sampler { - _pi_sampler(ze_sampler_handle_t Sampler) : ZeSampler{Sampler}, RefCount{1} {} +struct _pi_sampler : _pi_object { + _pi_sampler(ze_sampler_handle_t Sampler) : ZeSampler{Sampler} {} // L0 sampler handle. // TODO: It is important that L0 handler is the first data member. Workaround // in SYCL RT (in ExecCGCommand::enqueueImp()) relies on this. This comment // should be removed when workaround in SYCL runtime will be removed. ze_sampler_handle_t ZeSampler; - - // L0 doesn't do the reference counting, so we have to do. - // Must be atomic to prevent data race when incrementing/decrementing. - std::atomic RefCount; }; #endif // PI_LEVEL0_HPP diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 735d947572fa3..80b2ea06a399b 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -212,11 +212,11 @@ bool findPlugins(vector_class> &PluginNames) { // search is done for libpi_opencl.so/pi_opencl.dll file in LD_LIBRARY_PATH // env only. 
// - PluginNames.push_back(std::make_pair(OPENCL_PLUGIN_NAME, - backend::opencl)); - PluginNames.push_back(std::make_pair(LEVEL0_PLUGIN_NAME, - backend::level0)); - PluginNames.push_back( + PluginNames.emplace_back(std::make_pair( + OPENCL_PLUGIN_NAME, backend::opencl)); + PluginNames.emplace_back(std::make_pair( + LEVEL0_PLUGIN_NAME, backend::level0)); + PluginNames.emplace_back( std::make_pair(CUDA_PLUGIN_NAME, backend::cuda)); return true; } From d76ea58255416af78eedd7fa6a3fea0deb67248b Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 21 May 2020 17:53:21 -0700 Subject: [PATCH 14/21] Address review comments * Get rid of SET_PARAM_VALUE* macro * Fix emplace_back usage * If L0 loader is downloaded github then add it to deploy-sycl-toolchain * Added ability to provide local version of the L0 headers and loader Signed-off-by: Artur Gainullin --- buildbot/configure.py | 10 + sycl/plugins/level_zero/CMakeLists.txt | 85 ++-- sycl/plugins/level_zero/pi_level0.cpp | 520 +++++++++++-------------- sycl/source/detail/pi.cpp | 9 +- 4 files changed, 298 insertions(+), 326 deletions(-) diff --git a/buildbot/configure.py b/buildbot/configure.py index dbfcf989f4d1f..d002d14504791 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -82,6 +82,14 @@ def do_configure(args): "-DOpenCL_INCLUDE_DIR={}".format(ocl_header_dir), "-DOpenCL_LIBRARY={}".format(icd_loader_lib)]) + if args.l0_headers and args.l0_loader: + cmake_cmd.extend([ + "-DL0_INCLUDE_DIR={}".format(args.l0_headers), + "-DL0_LIBRARY={}".format(args.l0_loader)]) + elif args.l0_headers or args.l0_loader: + sys.exit("Please specify both Level Zero headers and loader or don't specify " + "none of them to let download from github.com") + # Add additional CMake options if provided if args.cmake_opt: cmake_cmd += args.cmake_opt @@ -115,6 +123,8 @@ def main(): # User options parser.add_argument("-s", "--src-dir", metavar="SRC_DIR", help="source directory (autodetected by default)") parser.add_argument("-o", 
"--obj-dir", metavar="OBJ_DIR", help="build directory. (/build by default)") + parser.add_argument("--l0-headers", metavar="L0_HEADER_DIR", help="directory with Level Zero headers") + parser.add_argument("--l0-loader", metavar="L0_LOADER", help="path to the Level Zero loader") parser.add_argument("-t", "--build-type", metavar="BUILD_TYPE", default="Release", help="build type: Debug, Release") parser.add_argument("--cuda", action='store_true', help="switch from OpenCL to CUDA") diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt index f060d6fa9e9e7..62a6284571251 100755 --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -1,41 +1,52 @@ # PI Level0 plugin library -message(STATUS "Download Level Zero loader and headers from github.com") -if(MSVC) - set(L0_LIBRARY - "${LLVM_LIBRARY_OUTPUT_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}ze_loader${CMAKE_STATIC_LIBRARY_SUFFIX}") +if (NOT DEFINED L0_LIBRARY OR NOT DEFINED L0_INCLUDE_DIR) + message(STATUS "Download Level Zero loader and headers from github.com") + if(MSVC) + set(L0_LIBRARY + "${LLVM_LIBRARY_OUTPUT_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}ze_loader${CMAKE_STATIC_LIBRARY_SUFFIX}") + else() + set(L0_LIBRARY + "${LLVM_LIBRARY_OUTPUT_INTDIR}/${CMAKE_SHARED_LIBRARY_PREFIX}ze_loader${CMAKE_SHARED_LIBRARY_SUFFIX}") + endif() + if (CMAKE_C_COMPILER) + list(APPEND AUX_CMAKE_FLAGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}) + endif() + if (CMAKE_CXX_COMPILER) + list(APPEND AUX_CMAKE_FLAGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}) + endif() + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/l0_loader_build) + ExternalProject_Add(l0-loader + GIT_REPOSITORY https://github.com/oneapi-src/level-zero.git + GIT_TAG origin/master + SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/Level0/l0_loader" + BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/l0_loader_build" + INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/l0_loader_install" + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + 
-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} + -DOpenCL_INCLUDE_DIR=${OpenCL_INCLUDE_DIRS} + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_INSTALL_LIBDIR:PATH=lib${LLVM_LIBDIR_SUFFIX} + ${AUX_CMAKE_FLAGS} + STEP_TARGETS configure,build,install + DEPENDS ocl-headers + BUILD_BYPRODUCTS ${L0_LIBRARY} + ) + ExternalProject_Add_Step(l0-loader llvminstall + COMMAND ${CMAKE_COMMAND} -E copy_directory / ${LLVM_BINARY_DIR} + COMMENT "Installing l0-loader into the LLVM binary directory" + DEPENDEES install + ) + + install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/l0_loader_install/" + DESTINATION "." + COMPONENT l0-loader + ) + + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS l0-loader) else() - set(L0_LIBRARY - "${LLVM_LIBRARY_OUTPUT_INTDIR}/${CMAKE_SHARED_LIBRARY_PREFIX}ze_loader${CMAKE_SHARED_LIBRARY_SUFFIX}") + include_directories("${L0_INCLUDE_DIR}") endif() -if (CMAKE_C_COMPILER) - list(APPEND AUX_CMAKE_FLAGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}) -endif() -if (CMAKE_CXX_COMPILER) - list(APPEND AUX_CMAKE_FLAGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}) -endif() -file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/l0_loader_build) -ExternalProject_Add(l0-loader - GIT_REPOSITORY https://github.com/oneapi-src/level-zero.git - GIT_TAG origin/master - SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/Level0/l0_loader" - BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/l0_loader_build" - INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/l0_loader_install" - CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} - -DOpenCL_INCLUDE_DIR=${OpenCL_INCLUDE_DIRS} - -DCMAKE_INSTALL_PREFIX= - -DCMAKE_INSTALL_LIBDIR:PATH=lib${LLVM_LIBDIR_SUFFIX} - ${AUX_CMAKE_FLAGS} - STEP_TARGETS configure,build,install - DEPENDS ocl-headers - BUILD_BYPRODUCTS ${L0_LIBRARY} -) -ExternalProject_Add_Step(l0-loader llvminstall - COMMAND ${CMAKE_COMMAND} -E copy_directory / ${LLVM_BINARY_DIR} - COMMENT "Installing l0-loader into the LLVM binary directory" - DEPENDEES install -) 
include_directories("${sycl_inc_dir}") include_directories(${OPENCL_INCLUDE}) @@ -66,8 +77,10 @@ else() ) endif() -add_dependencies(pi_level0 l0-loader) -add_dependencies(sycl-toolchain pi_level0) +if (l0-loader) + add_dependencies(pi_level0 l0-loader) +endif() + add_dependencies(sycl-toolchain pi_level0) target_link_libraries(pi_level0 PRIVATE "${L0_LIBRARY}") if (UNIX) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index 7f7997885aa3c..c20a0d5d2a3b2 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -19,9 +19,12 @@ #include #include #include +#include #include +namespace { + // Controls L0 calls serialization to w/a L0 driver being not MT ready. // Recognized values (can be used as a bit mask): enum { @@ -69,6 +72,74 @@ static void zePrint(const char *Format, ...) { } } +template +pi_result getInfoImpl(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value, size_t value_size, + Assign &&assign_func) { + + if (param_value != nullptr) { + + if (param_value_size < value_size) { + return PI_INVALID_VALUE; + } + + assign_func(param_value, value, value_size); + } + + if (param_value_size_ret != nullptr) { + *param_value_size_ret = value_size; + } + + return PI_SUCCESS; +} + +template +pi_result getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value) { + + auto assignment = [](void *param_value, T value, size_t value_size) { + *static_cast(param_value) = value; + }; + + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, + sizeof(T), assignment); +} + +template +pi_result getInfoArray(size_t array_length, size_t param_value_size, + void *param_value, size_t *param_value_size_ret, + T *value) { + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, + array_length * sizeof(T), memcpy); +} + +template <> +pi_result getInfo(size_t param_value_size, void *param_value, 
+ size_t *param_value_size_ret, + const char *value) { + return getInfoArray(strlen(value) + 1, param_value_size, param_value, + param_value_size_ret, value); +} + +class ReturnHelper { +public: + ReturnHelper(size_t param_value_size, void *param_value, + size_t *param_value_size_ret) + : param_value_size(param_value_size), param_value(param_value), + param_value_size_ret(param_value_size_ret) {} + + template pi_result operator()(const T &t) { + return getInfo(param_value_size, param_value, param_value_size_ret, t); + } + +private: + size_t param_value_size; + void *param_value; + size_t *param_value_size_ret; +}; + +} // anonymous namespace + // TODO:: In the following 4 methods we may want to distinguish read access vs. // write (as it is OK for multiple threads to read the map without locking it). @@ -119,7 +190,6 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool, // and initialization of the record in NumEventsLiveInEventPool must be done // atomically. Otherwise it is possible that decrementAliveEventsInPool will // be called for the record in NumEventsLiveInEventPool before its - // initialization. std::lock(NumEventsAvailableInEventPoolMutex, NumEventsLiveInEventPoolMutex); std::lock_guard NumEventsAvailableInEventPoolGuard( @@ -361,41 +431,6 @@ void _pi_event::deleteZeEventList(ze_event_handle_t *ZeEventList) { delete[] ZeEventList; } -// No generic lambdas in C++11, so use this convinence macro. -// NOTE: to be used in API returning "ParamValue". -// NOTE: memset is used to clear all bytes in the memory allocated by SYCL RT -// for value. This is a workaround for the problem when return type of the -// parameter is incorrect in L0 plugin which can result in bad value. This -// memset can be removed if it is necessary. 
-#define SET_PARAM_VALUE(Value) \ - { \ - typedef decltype(Value) T; \ - if (ParamValue) { \ - memset(ParamValue, 0, ParamValueSize); \ - *(T *)ParamValue = Value; \ - } \ - if (ParamValueSizeRet) \ - *ParamValueSizeRet = sizeof(T); \ - } -#define SET_PARAM_VALUE_STR(Value) \ - { \ - if (ParamValue) \ - memcpy(ParamValue, Value, ParamValueSize); \ - if (ParamValueSizeRet) \ - *ParamValueSizeRet = strlen(Value) + 1; \ - } - -#define SET_PARAM_VALUE_VLA(Value, NumValues, RetType) \ - { \ - if (ParamValue) { \ - memset(ParamValue, 0, ParamValueSize); \ - for (uint32_t I = 0; I < NumValues; I++) \ - ((RetType *)ParamValue)[I] = (RetType)Value[I]; \ - } \ - if (ParamValueSizeRet) \ - *ParamValueSizeRet = NumValues * sizeof(RetType); \ - } - #ifndef _WIN32 // Recover from Linux SIGSEGV signal. // We can't reliably catch C++ exceptions thrown from signal @@ -459,7 +494,7 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, // This is a good time to initialize L0. // We can still safely recover if something goes wrong during the init. __TRY() { - // We should not call zeInit multiples times ever. + // We should not call zeInit multiple times ever. try { std::call_once(OnceFlag, [&ZeResult]() { ZeResult = ZE_CALL_NOCHECK(zeInit(ZE_INIT_FLAG_NONE)); @@ -551,15 +586,15 @@ pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, zePrint("SYCL over Level-Zero %s\n", Platform->ZeDriverVersion.c_str()); zePrint("==========================\n"); + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + switch (ParamName) { case PI_PLATFORM_INFO_NAME: // TODO: Query L0 driver when relevant info is added there. - SET_PARAM_VALUE_STR("Intel(R) Level-Zero"); - break; + return ReturnValue("Intel(R) Level-Zero"); case PI_PLATFORM_INFO_VENDOR: // TODO: Query L0 driver when relevant info is added there. 
- SET_PARAM_VALUE_STR("Intel(R) Corporation"); - break; + return ReturnValue("Intel(R) Corporation"); case PI_PLATFORM_INFO_EXTENSIONS: // Convention adopted from OpenCL: // "Returns a space-separated list of extension names (the extension @@ -570,12 +605,10 @@ pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, // TODO: Check the common extensions supported by all connected devices and // return them. For now, hardcoding some extensions we know are supported by // all Level0 devices. - SET_PARAM_VALUE_STR(ZE_SUPPORTED_EXTENSIONS); - break; + return ReturnValue(ZE_SUPPORTED_EXTENSIONS); case PI_PLATFORM_INFO_PROFILE: // TODO: figure out what this means and how is this used - SET_PARAM_VALUE_STR("FULL_PROFILE"); - break; + return ReturnValue("FULL_PROFILE"); case PI_PLATFORM_INFO_VERSION: // TODO: this should query to zeDriverGetDriverVersion // but we don't yet have the driver handle here. @@ -583,8 +616,8 @@ pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, // From OpenCL 2.1: "This version string has the following format: // OpenCL. Follow the same notation here. 
- SET_PARAM_VALUE_STR(Platform->ZeDriverApiVersion.c_str()); - break; + // + return ReturnValue(Platform->ZeDriverApiVersion.c_str()); default: // TODO: implement other parameters die("Unsupported ParamName in piPlatformGetInfo"); @@ -704,26 +737,23 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, ZeDeviceCacheProperties.version = ZE_DEVICE_CACHE_PROPERTIES_VERSION_CURRENT; ZE_CALL(zeDeviceGetCacheProperties(ZeDevice, &ZeDeviceCacheProperties)); + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + switch (ParamName) { case PI_DEVICE_INFO_TYPE: { - if (Device->ZeDeviceProperties.type == ZE_DEVICE_TYPE_GPU) { - SET_PARAM_VALUE(PI_DEVICE_TYPE_GPU); - } else { // ZE_DEVICE_TYPE_FPGA - zePrint("FPGA not supported\n"); + if (Device->ZeDeviceProperties.type != ZE_DEVICE_TYPE_GPU) { + zePrint("This device type is not supported\n"); return PI_INVALID_VALUE; } - break; + return ReturnValue(PI_DEVICE_TYPE_GPU); } case PI_DEVICE_INFO_PARENT_DEVICE: // TODO: all L0 devices are parent ? - SET_PARAM_VALUE(pi_device{0}); - break; + return ReturnValue(pi_device{0}); case PI_DEVICE_INFO_PLATFORM: - SET_PARAM_VALUE(Device->Platform); - break; + return ReturnValue(Device->Platform); case PI_DEVICE_INFO_VENDOR_ID: - SET_PARAM_VALUE(pi_uint32{Device->ZeDeviceProperties.vendorId}); - break; + return ReturnValue(pi_uint32{Device->ZeDeviceProperties.vendorId}); case PI_DEVICE_INFO_EXTENSIONS: { // Convention adopted from OpenCL: // "Returns a space separated list of extension names (the extension @@ -732,6 +762,8 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, // TODO: Use proper mechanism to get this information from Level0 after // it is added to Level0. // Hardcoding the few we know are supported by the current hardware. + // + // std::string SupportedExtensions; // cl_khr_il_program - OpenCL 2.0 KHR extension for SPIRV support. 
Core @@ -766,50 +798,40 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, // Supports reading and writing of images. SupportedExtensions += ("cl_khr_3d_image_writes "); - SET_PARAM_VALUE_STR(SupportedExtensions.c_str()); - break; + return ReturnValue(SupportedExtensions.c_str()); } case PI_DEVICE_INFO_NAME: - SET_PARAM_VALUE_STR(Device->ZeDeviceProperties.name); - break; + return ReturnValue(Device->ZeDeviceProperties.name); case PI_DEVICE_INFO_COMPILER_AVAILABLE: - SET_PARAM_VALUE(pi_bool{1}); - break; + return ReturnValue(pi_bool{1}); case PI_DEVICE_INFO_LINKER_AVAILABLE: - SET_PARAM_VALUE(pi_bool{1}); - break; + return ReturnValue(pi_bool{1}); case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: { pi_uint32 MaxComputeUnits = Device->ZeDeviceProperties.numEUsPerSubslice * Device->ZeDeviceProperties.numSubslicesPerSlice * Device->ZeDeviceProperties.numSlices; - SET_PARAM_VALUE(pi_uint32{MaxComputeUnits}); - break; + return ReturnValue(pi_uint32{MaxComputeUnits}); } case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: // L0 spec defines only three dimensions - SET_PARAM_VALUE(pi_uint32{3}); - break; + return ReturnValue(pi_uint32{3}); case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE: - SET_PARAM_VALUE( + return ReturnValue( pi_uint64{Device->ZeDeviceComputeProperties.maxTotalGroupSize}); - break; case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { struct { size_t Arr[3]; } MaxGroupSize = {{Device->ZeDeviceComputeProperties.maxGroupSizeX, Device->ZeDeviceComputeProperties.maxGroupSizeY, Device->ZeDeviceComputeProperties.maxGroupSizeZ}}; - SET_PARAM_VALUE(MaxGroupSize); - break; + return ReturnValue(MaxGroupSize); } case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: - SET_PARAM_VALUE(pi_uint32{Device->ZeDeviceProperties.coreClockRate}); - break; + return ReturnValue(pi_uint32{Device->ZeDeviceProperties.coreClockRate}); case PI_DEVICE_INFO_ADDRESS_BITS: { // TODO: To confirm with spec. 
- SET_PARAM_VALUE(pi_uint32{64}); - break; + return ReturnValue(pi_uint32{64}); } case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { // TODO: To confirm with spec. @@ -817,50 +839,40 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, for (uint32_t I = 0; I < ZeAvailMemCount; I++) { MaxMemAllocSize += ZeDeviceMemoryProperties[I].totalSize; } - SET_PARAM_VALUE(pi_uint64{MaxMemAllocSize}); - break; + return ReturnValue(pi_uint64{MaxMemAllocSize}); } case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: { uint32_t GlobalMemSize = 0; for (uint32_t I = 0; I < ZeAvailMemCount; I++) { GlobalMemSize += ZeDeviceMemoryProperties[I].totalSize; } - SET_PARAM_VALUE(pi_uint64{GlobalMemSize}); - break; + return ReturnValue(pi_uint64{GlobalMemSize}); } case PI_DEVICE_INFO_LOCAL_MEM_SIZE: - SET_PARAM_VALUE( + return ReturnValue( pi_uint64{Device->ZeDeviceComputeProperties.maxSharedLocalMemory}); - break; case PI_DEVICE_INFO_IMAGE_SUPPORT: - SET_PARAM_VALUE(pi_bool{ZeDeviceImageProperties.supported}); - break; + return ReturnValue(pi_bool{ZeDeviceImageProperties.supported}); case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: - SET_PARAM_VALUE(pi_bool{Device->ZeDeviceProperties.unifiedMemorySupported}); - break; + return ReturnValue( + pi_bool{Device->ZeDeviceProperties.unifiedMemorySupported}); case PI_DEVICE_INFO_AVAILABLE: - SET_PARAM_VALUE(pi_bool{ZeDevice ? true : false}); - break; + return ReturnValue(pi_bool{ZeDevice ? true : false}); case PI_DEVICE_INFO_VENDOR: // TODO: Level-Zero does not return vendor's name at the moment // only the ID. 
- SET_PARAM_VALUE_STR("Intel(R) Corporation"); - break; + return ReturnValue("Intel(R) Corporation"); case PI_DEVICE_INFO_DRIVER_VERSION: - SET_PARAM_VALUE_STR(Device->Platform->ZeDriverVersion.c_str()); - break; + return ReturnValue(Device->Platform->ZeDriverVersion.c_str()); case PI_DEVICE_INFO_VERSION: - SET_PARAM_VALUE_STR(Device->Platform->ZeDriverApiVersion.c_str()); - break; + return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str()); case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { uint32_t ZeSubDeviceCount = 0; ZE_CALL(zeDeviceGetSubDevices(ZeDevice, &ZeSubDeviceCount, nullptr)); - SET_PARAM_VALUE(pi_uint32{ZeSubDeviceCount}); - break; + return ReturnValue(pi_uint32{ZeSubDeviceCount}); } case PI_DEVICE_INFO_REFERENCE_COUNT: - SET_PARAM_VALUE(pi_uint32{Device->RefCount}); - break; + return ReturnValue(pi_uint32{Device->RefCount}); case PI_DEVICE_INFO_PARTITION_PROPERTIES: { // It is debatable if SYCL sub-device and partitioning APIs sufficient to // expose Level0 sub-devices? We start with support of @@ -870,13 +882,11 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, struct { pi_device_partition_property Arr[2]; } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, 0}}; - SET_PARAM_VALUE(PartitionProperties); - break; + return ReturnValue(PartitionProperties); } case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: - SET_PARAM_VALUE(pi_device_affinity_domain{ + return ReturnValue(pi_device_affinity_domain{ PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE}); - break; case PI_DEVICE_INFO_PARTITION_TYPE: { if (Device->IsSubDevice) { struct { @@ -884,86 +894,64 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, 0}}; - SET_PARAM_VALUE(PartitionProperties); - } else { - // For root-device there is no partitioning to report. 
- SET_PARAM_VALUE(pi_device_partition_property{0}); + return ReturnValue(PartitionProperties); } - break; + // For root-device there is no partitioning to report. + return ReturnValue(pi_device_partition_property{0}); } // Everything under here is not supported yet case PI_DEVICE_INFO_OPENCL_C_VERSION: - SET_PARAM_VALUE_STR(""); - break; + return ReturnValue(""); case PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: - SET_PARAM_VALUE(pi_bool{true}); - break; + return ReturnValue(pi_bool{true}); case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: - SET_PARAM_VALUE(size_t{ZeDeviceKernelProperties.printfBufferSize}); - break; + return ReturnValue(size_t{ZeDeviceKernelProperties.printfBufferSize}); case PI_DEVICE_INFO_PROFILE: - SET_PARAM_VALUE_STR("FULL_PROFILE"); - break; + return ReturnValue("FULL_PROFILE"); case PI_DEVICE_INFO_BUILT_IN_KERNELS: // TODO: To find out correct value - SET_PARAM_VALUE_STR(""); - break; + return ReturnValue(""); case PI_DEVICE_INFO_QUEUE_PROPERTIES: - SET_PARAM_VALUE(pi_queue_properties{PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - PI_QUEUE_PROFILING_ENABLE}); - break; + return ReturnValue(pi_queue_properties{ + PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | PI_QUEUE_PROFILING_ENABLE}); case PI_DEVICE_INFO_EXECUTION_CAPABILITIES: - SET_PARAM_VALUE( + return ReturnValue( pi_device_exec_capabilities{PI_DEVICE_EXEC_CAPABILITIES_NATIVE_KERNEL}); - break; case PI_DEVICE_INFO_ENDIAN_LITTLE: - SET_PARAM_VALUE(pi_bool{true}); - break; + return ReturnValue(pi_bool{true}); case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: - SET_PARAM_VALUE(pi_bool{Device->ZeDeviceProperties.eccMemorySupported}); - break; + return ReturnValue(pi_bool{Device->ZeDeviceProperties.eccMemorySupported}); case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: - SET_PARAM_VALUE(size_t{Device->ZeDeviceProperties.timerResolution}); - break; + return ReturnValue(size_t{Device->ZeDeviceProperties.timerResolution}); case PI_DEVICE_INFO_LOCAL_MEM_TYPE: - SET_PARAM_VALUE(PI_DEVICE_LOCAL_MEM_TYPE_LOCAL); - break; + 
return ReturnValue(PI_DEVICE_LOCAL_MEM_TYPE_LOCAL); case PI_DEVICE_INFO_MAX_CONSTANT_ARGS: - SET_PARAM_VALUE(pi_uint32{64}); - break; + return ReturnValue(pi_uint32{64}); case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: - SET_PARAM_VALUE(pi_uint64{ZeDeviceImageProperties.maxImageBufferSize}); - break; + return ReturnValue(pi_uint64{ZeDeviceImageProperties.maxImageBufferSize}); case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: - SET_PARAM_VALUE(PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); - break; + return ReturnValue(PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: - SET_PARAM_VALUE(pi_uint32{ZeDeviceCacheProperties.lastLevelCachelineSize}); - break; + return ReturnValue( + pi_uint32{ZeDeviceCacheProperties.lastLevelCachelineSize}); case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: - SET_PARAM_VALUE(pi_uint64{ZeDeviceCacheProperties.lastLevelCacheSize}); - break; + return ReturnValue(pi_uint64{ZeDeviceCacheProperties.lastLevelCacheSize}); case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: - SET_PARAM_VALUE(size_t{ZeDeviceKernelProperties.maxArgumentsSize}); - break; + return ReturnValue(size_t{ZeDeviceKernelProperties.maxArgumentsSize}); case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: // SYCL/OpenCL spec is vague on what this means exactly, but seems to // be for "alignment requirement (in bits) for sub-buffer offsets." // An OpenCL implementation returns 8*128, but L0 can do just 8, // meaning unaligned access for values of types larger than 8 bits. 
- SET_PARAM_VALUE(pi_uint32{8}); - break; + return ReturnValue(pi_uint32{8}); case PI_DEVICE_INFO_MAX_SAMPLERS: - SET_PARAM_VALUE(pi_uint32{ZeDeviceImageProperties.maxSamplers}); - break; + return ReturnValue(pi_uint32{ZeDeviceImageProperties.maxSamplers}); case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS: - SET_PARAM_VALUE(pi_uint32{ZeDeviceImageProperties.maxReadImageArgs}); - break; + return ReturnValue(pi_uint32{ZeDeviceImageProperties.maxReadImageArgs}); case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: - SET_PARAM_VALUE(pi_uint32{ZeDeviceImageProperties.maxWriteImageArgs}); - break; + return ReturnValue(pi_uint32{ZeDeviceImageProperties.maxWriteImageArgs}); case PI_DEVICE_INFO_SINGLE_FP_CONFIG: { uint64_t SingleFPValue = 0; ze_fp_capabilities_t ZeSingleFPCapabilities = @@ -986,8 +974,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, if (ZE_FP_CAPS_FMA & ZeSingleFPCapabilities) { SingleFPValue |= PI_FP_FMA; } - SET_PARAM_VALUE(pi_uint64{SingleFPValue}); - break; + return ReturnValue(pi_uint64{SingleFPValue}); } case PI_DEVICE_INFO_HALF_FP_CONFIG: { uint64_t HalfFPValue = 0; @@ -1011,8 +998,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, if (ZE_FP_CAPS_FMA & ZeHalfFPCapabilities) { HalfFPValue |= PI_FP_FMA; } - SET_PARAM_VALUE(pi_uint64{HalfFPValue}); - break; + return ReturnValue(pi_uint64{HalfFPValue}); } case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: { uint64_t DoubleFPValue = 0; @@ -1036,70 +1022,55 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, if (ZE_FP_CAPS_FMA & ZeDoubleFPCapabilities) { DoubleFPValue |= PI_FP_FMA; } - SET_PARAM_VALUE(pi_uint64{DoubleFPValue}); - break; + return ReturnValue(pi_uint64{DoubleFPValue}); } case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: // Until L0 provides needed info, hardcode default minimum values required // by the SYCL specification. 
- SET_PARAM_VALUE(size_t{8192}); - break; + return ReturnValue(size_t{8192}); case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: // Until L0 provides needed info, hardcode default minimum values required // by the SYCL specification. - SET_PARAM_VALUE(size_t{8192}); - break; + return ReturnValue(size_t{8192}); case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: // Until L0 provides needed info, hardcode default minimum values required // by the SYCL specification. - SET_PARAM_VALUE(size_t{2048}); - break; + return ReturnValue(size_t{2048}); case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: // Until L0 provides needed info, hardcode default minimum values required // by the SYCL specification. - SET_PARAM_VALUE(size_t{2048}); - break; + return ReturnValue(size_t{2048}); case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: // Until L0 provides needed info, hardcode default minimum values required // by the SYCL specification. - SET_PARAM_VALUE(size_t{2048}); - break; + return ReturnValue(size_t{2048}); case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: - SET_PARAM_VALUE(size_t{ZeDeviceImageProperties.maxImageBufferSize}); - break; + return ReturnValue(size_t{ZeDeviceImageProperties.maxImageBufferSize}); case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: - SET_PARAM_VALUE(size_t{ZeDeviceImageProperties.maxImageArraySlices}); - break; + return ReturnValue(size_t{ZeDeviceImageProperties.maxImageArraySlices}); // Handle SIMD widths. // TODO: can we do better than this? 
case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: - SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 1); - break; + return ReturnValue(Device->ZeDeviceProperties.physicalEUSimdWidth / 1); case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: - SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 2); - break; + return ReturnValue(Device->ZeDeviceProperties.physicalEUSimdWidth / 2); case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: - SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 4); - break; + return ReturnValue(Device->ZeDeviceProperties.physicalEUSimdWidth / 4); case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: - SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 8); - break; + return ReturnValue(Device->ZeDeviceProperties.physicalEUSimdWidth / 8); case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: - SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 4); - break; + return ReturnValue(Device->ZeDeviceProperties.physicalEUSimdWidth / 4); case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: - SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 8); - break; + return ReturnValue(Device->ZeDeviceProperties.physicalEUSimdWidth / 8); case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: - SET_PARAM_VALUE(Device->ZeDeviceProperties.physicalEUSimdWidth / 2); - break; + return ReturnValue(Device->ZeDeviceProperties.physicalEUSimdWidth / 2); case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { // Max_num_sub_Groups = maxTotalGroupSize/min(set of subGroupSizes); uint32_t MinSubGroupSize = @@ -1109,22 +1080,19 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, if 
(MinSubGroupSize > Device->ZeDeviceComputeProperties.subGroupSizes[I]) MinSubGroupSize = Device->ZeDeviceComputeProperties.subGroupSizes[I]; } - SET_PARAM_VALUE(Device->ZeDeviceComputeProperties.maxTotalGroupSize / - MinSubGroupSize); - break; + return ReturnValue(Device->ZeDeviceComputeProperties.maxTotalGroupSize / + MinSubGroupSize); } case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { // TODO: Not supported yet. Needs to be updated after support is added. - SET_PARAM_VALUE(pi_bool{false}); - break; + return ReturnValue(pi_bool{false}); } case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the // expected return is size_t datatype. size_t can be 8 bytes of data. - SET_PARAM_VALUE_VLA(Device->ZeDeviceComputeProperties.subGroupSizes, - Device->ZeDeviceComputeProperties.numSubGroupSizes, - size_t); - break; + return getInfoArray(Device->ZeDeviceComputeProperties.numSubGroupSizes, + ParamValueSize, ParamValue, ParamValueSizeRet, + Device->ZeDeviceComputeProperties.subGroupSizes); } case PI_DEVICE_INFO_IL_VERSION: { // Set to a space separated list of IL version strings of the form @@ -1140,8 +1108,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, SpirvVersionMinor); // returned string to contain only len number of characters. 
std::string ILVersion(SpirvVersionString, Len); - SET_PARAM_VALUE_STR(ILVersion.c_str()); - break; + return ReturnValue(ILVersion.c_str()); } case PI_DEVICE_INFO_USM_HOST_SUPPORT: case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: @@ -1154,8 +1121,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, Supported = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; } - SET_PARAM_VALUE(Supported); - break; + return ReturnValue(Supported); } default: zePrint("Unsupported ParamName in piGetDeviceInfo\n"); @@ -1293,13 +1259,15 @@ pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, assert(Context); - if (ParamName == PI_CONTEXT_INFO_DEVICES) { - SET_PARAM_VALUE(Context->Device); - } else if (ParamName == PI_CONTEXT_INFO_NUM_DEVICES) { - SET_PARAM_VALUE(pi_uint32{1}); - } else if (ParamName == PI_CONTEXT_INFO_REFERENCE_COUNT) { - SET_PARAM_VALUE(pi_uint32{Context->RefCount}); - } else { + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + switch (ParamName) { + case PI_CONTEXT_INFO_DEVICES: + return ReturnValue(Context->Device); + case PI_CONTEXT_INFO_NUM_DEVICES: + return ReturnValue(pi_uint32{1}); + case PI_CONTEXT_INFO_REFERENCE_COUNT: + return ReturnValue(pi_uint32{Context->RefCount}); + default: // TODO: implement other parameters die("piGetContextInfo: unsuppported ParamName."); } @@ -1390,17 +1358,15 @@ pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, assert(Queue); + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); // TODO: consider support for queue properties and size switch (ParamName) { case PI_QUEUE_INFO_CONTEXT: - SET_PARAM_VALUE(Queue->Context); - break; + return ReturnValue(Queue->Context); case PI_QUEUE_INFO_DEVICE: - SET_PARAM_VALUE(Queue->Context->Device); - break; + return ReturnValue(Queue->Context->Device); case PI_QUEUE_INFO_REFERENCE_COUNT: - SET_PARAM_VALUE(pi_uint32{Queue->RefCount}); - break; + return 
ReturnValue(pi_uint32{Queue->RefCount}); case PI_QUEUE_INFO_PROPERTIES: die("PI_QUEUE_INFO_PROPERTIES in piQueueGetInfo not implemented\n"); break; @@ -1789,23 +1755,20 @@ pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName, size_t *ParamValueSizeRet) { assert(Program); + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); switch (ParamName) { case PI_PROGRAM_INFO_REFERENCE_COUNT: - SET_PARAM_VALUE(pi_uint32{Program->RefCount}); - break; + return ReturnValue(pi_uint32{Program->RefCount}); case PI_PROGRAM_INFO_NUM_DEVICES: // L0 Module is always for a single device. - SET_PARAM_VALUE(pi_uint32{1}); - break; + return ReturnValue(pi_uint32{1}); case PI_PROGRAM_INFO_DEVICES: - SET_PARAM_VALUE(Program->Context->Device); - break; + return ReturnValue(Program->Context->Device); case PI_PROGRAM_INFO_BINARY_SIZES: { size_t SzBinary = 0; ZE_CALL(zeModuleGetNativeBinary(Program->ZeModule, &SzBinary, nullptr)); // This is an array of 1 element, initialize if it were scalar. - SET_PARAM_VALUE(size_t{SzBinary}); - break; + return ReturnValue(size_t{SzBinary}); } case PI_PROGRAM_INFO_BINARIES: { size_t SzBinary = 0; @@ -1816,8 +1779,7 @@ pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName, case PI_PROGRAM_INFO_NUM_KERNELS: { uint32_t NumKernels = 0; ZE_CALL(zeModuleGetKernelNames(Program->ZeModule, &NumKernels, nullptr)); - SET_PARAM_VALUE(size_t{NumKernels}); - break; + return ReturnValue(size_t{NumKernels}); } case PI_PROGRAM_INFO_KERNEL_NAMES: try { @@ -1834,13 +1796,12 @@ pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName, PINames += PNames[I]; } delete[] PNames; - SET_PARAM_VALUE_STR(PINames.c_str()); + return ReturnValue(PINames.c_str()); } catch (const std::bad_alloc &) { return PI_OUT_OF_HOST_MEMORY; } catch (...) 
{ return PI_ERROR_UNKNOWN; } - break; default: die("piProgramGetInfo: not implemented"); } @@ -1907,23 +1868,24 @@ pi_result piProgramGetBuildInfo(pi_program Program, pi_device Device, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); if (ParamName == CL_PROGRAM_BINARY_TYPE) { // TODO: is this the only supported binary type in L0? // We should probably return CL_PROGRAM_BINARY_TYPE_NONE if asked // before the program was compiled. - SET_PARAM_VALUE(cl_program_binary_type{CL_PROGRAM_BINARY_TYPE_EXECUTABLE}); - } else if (ParamName == CL_PROGRAM_BUILD_OPTIONS) { + return ReturnValue( + cl_program_binary_type{CL_PROGRAM_BINARY_TYPE_EXECUTABLE}); + } + if (ParamName == CL_PROGRAM_BUILD_OPTIONS) { // TODO: how to get module build options out of L0? // For the programs that we compiled we can remember the options // passed with piProgramCompile/piProgramBuild, but what can we // return for programs that were built outside and registered // with piProgramRegister? 
- SET_PARAM_VALUE_STR(""); - } else { - zePrint("piProgramGetBuildInfo: unsupported ParamName\n"); - return PI_INVALID_VALUE; + return ReturnValue(""); } - return PI_SUCCESS; + zePrint("piProgramGetBuildInfo: unsupported ParamName\n"); + return PI_INVALID_VALUE; } pi_result piProgramRetain(pi_program Program) { @@ -2028,22 +1990,18 @@ pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName, ZeKernelProperties.version = ZE_KERNEL_PROPERTIES_VERSION_CURRENT; ZE_CALL(zeKernelGetProperties(Kernel->ZeKernel, &ZeKernelProperties)); + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); switch (ParamName) { case PI_KERNEL_INFO_CONTEXT: - SET_PARAM_VALUE(pi_context{Kernel->Program->Context}); - break; + return ReturnValue(pi_context{Kernel->Program->Context}); case PI_KERNEL_INFO_PROGRAM: - SET_PARAM_VALUE(pi_program{Kernel->Program}); - break; + return ReturnValue(pi_program{Kernel->Program}); case PI_KERNEL_INFO_FUNCTION_NAME: - SET_PARAM_VALUE_STR(ZeKernelProperties.name); - break; + return ReturnValue(ZeKernelProperties.name); case PI_KERNEL_INFO_NUM_ARGS: - SET_PARAM_VALUE(pi_uint32{ZeKernelProperties.numKernelArgs}); - break; + return ReturnValue(pi_uint32{ZeKernelProperties.numKernelArgs}); case PI_KERNEL_INFO_REFERENCE_COUNT: - SET_PARAM_VALUE(pi_uint32{Kernel->RefCount}); - break; + return ReturnValue(pi_uint32{Kernel->RefCount}); case PI_KERNEL_INFO_ATTRIBUTES: try { uint32_t Size; @@ -2053,14 +2011,14 @@ pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName, ZE_CALL(zeKernelGetAttribute(Kernel->ZeKernel, ZE_KERNEL_ATTR_SOURCE_ATTRIBUTE, &Size, attributes)); - SET_PARAM_VALUE_STR(attributes); + auto Res = ReturnValue(attributes); delete[] attributes; + return Res; } catch (const std::bad_alloc &) { return PI_OUT_OF_HOST_MEMORY; } catch (...) 
{ return PI_ERROR_UNKNOWN; } - break; default: zePrint("Unsupported ParamName in piKernelGetInfo: ParamName=%d(0x%x)\n", ParamName, ParamName); @@ -2086,6 +2044,7 @@ pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, ZeKernelProperties.version = ZE_KERNEL_PROPERTIES_VERSION_CURRENT; ZE_CALL(zeKernelGetProperties(Kernel->ZeKernel, &ZeKernelProperties)); + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); switch (ParamName) { case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { // TODO: To revisit after level_zero/issues/262 is resolved @@ -2094,15 +2053,13 @@ pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, } WorkSize = {{ZeDeviceComputeProperties.maxGroupSizeX, ZeDeviceComputeProperties.maxGroupSizeY, ZeDeviceComputeProperties.maxGroupSizeZ}}; - SET_PARAM_VALUE(WorkSize); - break; + return ReturnValue(WorkSize); } case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { uint32_t X, Y, Z; ZE_CALL(zeKernelSuggestGroupSize(Kernel->ZeKernel, 10000, 10000, 10000, &X, &Y, &Z)); - SET_PARAM_VALUE(size_t{X * Y * Z}); - break; + return ReturnValue(size_t{X * Y * Z}); } case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { struct { @@ -2110,28 +2067,24 @@ pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, } WgSize = {{ZeKernelProperties.requiredGroupSizeX, ZeKernelProperties.requiredGroupSizeY, ZeKernelProperties.requiredGroupSizeZ}}; - SET_PARAM_VALUE(WgSize); - break; + return ReturnValue(WgSize); } case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { // TODO: Assume 0 for now, replace with ze_kernel_properties_t::localMemSize // once released in RT. 
- SET_PARAM_VALUE(pi_uint32{0}); - break; + return ReturnValue(pi_uint32{0}); } case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { ze_device_properties_t ZeDeviceProperties; ZeDeviceProperties.version = ZE_DEVICE_PROPERTIES_VERSION_CURRENT; ZE_CALL(zeDeviceGetProperties(ZeDevice, &ZeDeviceProperties)); - SET_PARAM_VALUE(size_t{ZeDeviceProperties.physicalEUSimdWidth}); - break; + return ReturnValue(size_t{ZeDeviceProperties.physicalEUSimdWidth}); } case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: // TODO: Assume 0 for now, replace with // ze_kernel_properties_t::privateMemSize once released in RT. - SET_PARAM_VALUE(pi_uint32{0}); - break; + return ReturnValue(pi_uint32{0}); default: zePrint("Unknown ParamName in piKernelGetGroupInfo: ParamName=%d(0x%x)\n", ParamName, ParamName); @@ -2301,32 +2254,29 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, size_t *ParamValueSizeRet) { assert(Event); + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); switch (ParamName) { case PI_EVENT_INFO_COMMAND_QUEUE: - SET_PARAM_VALUE(pi_queue{Event->Queue}); - break; + return ReturnValue(pi_queue{Event->Queue}); case PI_EVENT_INFO_CONTEXT: - SET_PARAM_VALUE(pi_context{Event->Queue->Context}); - break; + return ReturnValue(pi_context{Event->Queue->Context}); case PI_EVENT_INFO_COMMAND_TYPE: - SET_PARAM_VALUE(pi_cast(Event->CommandType)); - break; + return ReturnValue(pi_cast(Event->CommandType)); case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: { ze_result_t ZeResult; ZeResult = ZE_CALL_NOCHECK(zeEventQueryStatus(Event->ZeEvent)); if (ZeResult == ZE_RESULT_SUCCESS) { - SET_PARAM_VALUE(pi_int32{CL_COMPLETE}); // Untie from OpenCL - } else { - // TODO: We don't know if the status is queueed, submitted or running. - // For now return "running", as others are unlikely to be of - // interest. 
- SET_PARAM_VALUE(pi_int32{CL_RUNNING}); + return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet, + pi_int32{CL_COMPLETE}); // Untie from OpenCL } - break; + // TODO: We don't know if the status is queueed, submitted or running. + // For now return "running", as others are unlikely to be of + // interest. + return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet, + pi_int32{CL_RUNNING}); } case PI_EVENT_INFO_REFERENCE_COUNT: - SET_PARAM_VALUE(pi_uint32{Event->RefCount}); - break; + return ReturnValue(pi_uint32{Event->RefCount}); default: zePrint("Unsupported ParamName in piEventGetInfo: ParamName=%d(%x)\n", ParamName, ParamName); @@ -2344,23 +2294,27 @@ pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName, uint64_t ZeTimerResolution = Event->Queue->Context->Device->ZeDeviceProperties.timerResolution; - if (ParamName == PI_PROFILING_INFO_COMMAND_START) { + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + switch (ParamName) { + case PI_PROFILING_INFO_COMMAND_START: { uint64_t ContextStart; ZE_CALL(zeEventGetTimestamp( Event->ZeEvent, ZE_EVENT_TIMESTAMP_CONTEXT_START, &ContextStart)); ContextStart *= ZeTimerResolution; - SET_PARAM_VALUE(uint64_t{ContextStart}); - } else if (ParamName == PI_PROFILING_INFO_COMMAND_END) { + return ReturnValue(uint64_t{ContextStart}); + } + case PI_PROFILING_INFO_COMMAND_END: { uint64_t ContextEnd; ZE_CALL(zeEventGetTimestamp(Event->ZeEvent, ZE_EVENT_TIMESTAMP_CONTEXT_END, &ContextEnd)); ContextEnd *= ZeTimerResolution; - SET_PARAM_VALUE(uint64_t{ContextEnd}); - } else if (ParamName == PI_PROFILING_INFO_COMMAND_QUEUED || - ParamName == PI_PROFILING_INFO_COMMAND_SUBMIT) { + return ReturnValue(uint64_t{ContextEnd}); + } + case PI_PROFILING_INFO_COMMAND_QUEUED: + case PI_PROFILING_INFO_COMMAND_SUBMIT: // TODO: Support these when L0 supported is added. 
- SET_PARAM_VALUE(uint64_t{0}); - } else { + return ReturnValue(uint64_t{0}); + default: zePrint("piEventGetProfilingInfo: not supported ParamName\n"); return PI_INVALID_VALUE; } @@ -3589,6 +3543,7 @@ pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, Ptr, &ZeMemoryAllocationProperties, &ZeDeviceHandle)); + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); switch (ParamName) { case PI_MEM_ALLOC_TYPE: { pi_usm_type MemAllocaType; @@ -3609,8 +3564,7 @@ pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, zePrint("piextUSMGetMemAllocInfo: unexpected usm memory type\n"); return PI_INVALID_VALUE; } - SET_PARAM_VALUE(MemAllocaType); - break; + return ReturnValue(MemAllocaType); } case PI_MEM_ALLOC_DEVICE: { // TODO: this wants pi_device, but we didn't remember it, and cannot @@ -3622,15 +3576,13 @@ pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, void *Base; ZE_CALL(zeDriverGetMemAddressRange(Context->Device->Platform->ZeDriver, Ptr, &Base, nullptr)); - SET_PARAM_VALUE(Base); - break; + return ReturnValue(Base); } case PI_MEM_ALLOC_SIZE: { size_t Size; ZE_CALL(zeDriverGetMemAddressRange(Context->Device->Platform->ZeDriver, Ptr, nullptr, &Size)); - SET_PARAM_VALUE(Size); - break; + return ReturnValue(Size); } default: zePrint("piextUSMGetMemAllocInfo: unsupported ParamName\n"); diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 80b2ea06a399b..21e217cf5ee35 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -212,12 +212,9 @@ bool findPlugins(vector_class> &PluginNames) { // search is done for libpi_opencl.so/pi_opencl.dll file in LD_LIBRARY_PATH // env only. 
// - PluginNames.emplace_back(std::make_pair( - OPENCL_PLUGIN_NAME, backend::opencl)); - PluginNames.emplace_back(std::make_pair( - LEVEL0_PLUGIN_NAME, backend::level0)); - PluginNames.emplace_back( - std::make_pair(CUDA_PLUGIN_NAME, backend::cuda)); + PluginNames.emplace_back(OPENCL_PLUGIN_NAME, backend::opencl); + PluginNames.emplace_back(LEVEL0_PLUGIN_NAME, backend::level0); + PluginNames.emplace_back(CUDA_PLUGIN_NAME, backend::cuda); return true; } From a1f740d37ba9a6d34cce1c9efc89f75239491720 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 27 May 2020 23:55:45 -0700 Subject: [PATCH 15/21] Apply suggestion for MaxNumEventsPerPool Signed-off-by: Artur Gainullin --- sycl/plugins/level_zero/pi_level0.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index c20a0d5d2a3b2..2760bcd219a0a 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -171,10 +171,11 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool, // Maximum number of events that can be present in an event ZePool is captured // here. Setting it to 256 gave best possible performance for several // benchmarks. - static const char *MaxNumEventsPerPoolEnv = - std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); - static const pi_uint32 MaxNumEventsPerPool = - MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; + static const pi_uint32 MaxNumEventsPerPool = [] { + const auto MaxNumEventsPerPoolEnv = + std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + return MaxNumEventsPerPoolEnv ? 
std::atoi(MaxNumEventsPerPoolEnv) : 256; + }(); if (MaxNumEventsPerPool == 0) { zePrint("Zero size can't be specified in the " From 3d70ee39b3eddfd3a039c11bbcb5c06629709687 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 28 May 2020 12:43:23 -0700 Subject: [PATCH 16/21] Fix check for l0-loader target in cmake file Signed-off-by: Artur Gainullin --- sycl/plugins/level_zero/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt index 62a6284571251..a6af23eba317b 100755 --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -77,7 +77,7 @@ else() ) endif() -if (l0-loader) +if (TARGET l0-loader) add_dependencies(pi_level0 l0-loader) endif() add_dependencies(sycl-toolchain pi_level0) From 8c75284ca2c4abca0d48a91f53d732056eae1b0b Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 28 May 2020 14:06:35 -0700 Subject: [PATCH 17/21] Minor fix Signed-off-by: Artur Gainullin --- sycl/plugins/level_zero/pi_level0.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index 2760bcd219a0a..c0904759e773f 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -500,7 +500,7 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, std::call_once(OnceFlag, [&ZeResult]() { ZeResult = ZE_CALL_NOCHECK(zeInit(ZE_INIT_FLAG_NONE)); }); - } catch (std::system_error &err) { + } catch (...) { // if any condition prevents calls to call_once from executing as // specified ZeResult = ZE_RESULT_ERROR_UNINITIALIZED; From 6b02205ee5169cfab28b324f58954d4ba11d4135 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 29 May 2020 11:16:10 -0700 Subject: [PATCH 18/21] Remove segfault handling during zeInit. 
Handling of the segfault which could happen during zeInit should be handled using sigaction. Removed current implementation, new implementation will be added separately. Signed-off-by: Artur Gainullin --- sycl/plugins/level_zero/pi_level0.cpp | 58 +++------------------------ 1 file changed, 5 insertions(+), 53 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index c0904759e773f..b634c3da0f067 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -432,39 +432,6 @@ void _pi_event::deleteZeEventList(ze_event_handle_t *ZeEventList) { delete[] ZeEventList; } -#ifndef _WIN32 -// Recover from Linux SIGSEGV signal. -// We can't reliably catch C++ exceptions thrown from signal -// handler so use setjmp/longjmp. -#include -#include -jmp_buf ReturnHere; -static void piSignalHandler(int SigNum) { - // We are somewhere the signall was raised, so go back to - // where we started tracking. - longjmp(ReturnHere, 0); -} -// Only handle segfault now, but can be extended. -#define __TRY() \ - signal(SIGSEGV, &piSignalHandler); \ - if (!setjmp(ReturnHere)) { -#define __CATCH() \ - } \ - else { -#define __FINALLY() \ - } \ - signal(SIGSEGV, SIG_DFL); - -#else // _WIN32 -// TODO: on Windows we could use structured exception handling. -// Just dummy implementation now (meaning no error handling). -#define __TRY() if (true) { -#define __CATCH() \ - } \ - else { -#define __FINALLY() } -#endif // _WIN32 - extern "C" { // Forward declararitons @@ -491,27 +458,12 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, return PI_INVALID_VALUE; } - ze_result_t ZeResult; // This is a good time to initialize L0. - // We can still safely recover if something goes wrong during the init. - __TRY() { - // We should not call zeInit multiple times ever. - try { - std::call_once(OnceFlag, [&ZeResult]() { - ZeResult = ZE_CALL_NOCHECK(zeInit(ZE_INIT_FLAG_NONE)); - }); - } catch (...) 
{ - // if any condition prevents calls to call_once from executing as - // specified - ZeResult = ZE_RESULT_ERROR_UNINITIALIZED; - } - } - __CATCH() { - // SegFault = true; - zePrint("L0 raised segfault: assume no Platforms\n"); - ZeResult = ZE_RESULT_ERROR_UNINITIALIZED; - } - __FINALLY() + // TODO: We can still safely recover if something goes wrong during the init. + // Implement handling segfault using sigaction. + // TODO: We should not call zeInit multiples times ever, so + // this code should be changed. + ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit(ZE_INIT_FLAG_NONE)); // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { From aebdfe04c317e48663beec53a74c492451c97fbe Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Mon, 1 Jun 2020 10:19:46 -0700 Subject: [PATCH 19/21] Use unordered_map for errors mapping Signed-off-by: Artur Gainullin --- sycl/plugins/level_zero/pi_level0.cpp | 60 ++++++++++----------------- 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index b634c3da0f067..f69668f076b96 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -239,46 +239,30 @@ constexpr char ZE_SUPPORTED_EXTENSIONS[] = static pi_result mapError(ze_result_t ZeResult) { // TODO: these mapping need to be clarified and synced with the PI API return // values, which is TBD. 
- switch (ZeResult) { - case ZE_RESULT_SUCCESS: - return PI_SUCCESS; - case ZE_RESULT_ERROR_DEVICE_LOST: - return PI_DEVICE_NOT_FOUND; - case ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS: - return PI_INVALID_OPERATION; - case ZE_RESULT_ERROR_NOT_AVAILABLE: - return PI_INVALID_OPERATION; - case ZE_RESULT_ERROR_UNINITIALIZED: - return PI_INVALID_PLATFORM; - case ZE_RESULT_ERROR_INVALID_ARGUMENT: - return PI_INVALID_VALUE; - case ZE_RESULT_ERROR_INVALID_NULL_POINTER: - return PI_INVALID_VALUE; - case ZE_RESULT_ERROR_INVALID_SIZE: - return PI_INVALID_VALUE; - case ZE_RESULT_ERROR_UNSUPPORTED_SIZE: - return PI_INVALID_VALUE; - case ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: - return PI_INVALID_VALUE; - case ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: - return PI_INVALID_EVENT; - case ZE_RESULT_ERROR_INVALID_ENUMERATION: - return PI_INVALID_VALUE; - case ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION: - return PI_INVALID_VALUE; - case ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: - return PI_INVALID_VALUE; - case ZE_RESULT_ERROR_INVALID_NATIVE_BINARY: - return PI_INVALID_BINARY; - case ZE_RESULT_ERROR_INVALID_KERNEL_NAME: - return PI_INVALID_KERNEL_NAME; - case ZE_RESULT_ERROR_INVALID_FUNCTION_NAME: - return PI_BUILD_PROGRAM_FAILURE; - case ZE_RESULT_ERROR_OVERLAPPING_REGIONS: - return PI_INVALID_OPERATION; - default: + std::unordered_map ErrorMapping = { + {ZE_RESULT_SUCCESS, PI_SUCCESS}, + {ZE_RESULT_ERROR_DEVICE_LOST, PI_DEVICE_NOT_FOUND}, + {ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, PI_INVALID_OPERATION}, + {ZE_RESULT_ERROR_NOT_AVAILABLE, PI_INVALID_OPERATION}, + {ZE_RESULT_ERROR_UNINITIALIZED, PI_INVALID_PLATFORM}, + {ZE_RESULT_ERROR_INVALID_ARGUMENT, PI_INVALID_VALUE}, + {ZE_RESULT_ERROR_INVALID_NULL_POINTER, PI_INVALID_VALUE}, + {ZE_RESULT_ERROR_INVALID_SIZE, PI_INVALID_VALUE}, + {ZE_RESULT_ERROR_UNSUPPORTED_SIZE, PI_INVALID_VALUE}, + {ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT, PI_INVALID_VALUE}, + {ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT, PI_INVALID_EVENT}, + 
{ZE_RESULT_ERROR_INVALID_ENUMERATION, PI_INVALID_VALUE}, + {ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION, PI_INVALID_VALUE}, + {ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT, PI_INVALID_VALUE}, + {ZE_RESULT_ERROR_INVALID_NATIVE_BINARY, PI_INVALID_BINARY}, + {ZE_RESULT_ERROR_INVALID_KERNEL_NAME, PI_INVALID_KERNEL_NAME}, + {ZE_RESULT_ERROR_INVALID_FUNCTION_NAME, PI_BUILD_PROGRAM_FAILURE}, + {ZE_RESULT_ERROR_OVERLAPPING_REGIONS, PI_INVALID_OPERATION}}; + auto It = ErrorMapping.find(ZeResult); + if (It == ErrorMapping.end()) { return PI_ERROR_UNKNOWN; } + return It->second; } // Forward declarations From 3a92906b91b6854ed716904b7629b19b95e2251e Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Mon, 1 Jun 2020 15:54:41 -0700 Subject: [PATCH 20/21] Fixes to address fails after merge with master branch * Include Level Zero to pi unit testing * Provide correct flag during buffer creation in EnqueueMemTest unit test * Wait after piEnqueueMemBufferFill in EnqueueMemTest unit test because otherwise it is not guaranteed that buffer will be filled with data before reading to the host. * Event provided to pi call can be null if we don't want to track status of the enqueued command. Handle this case in L0 plugin. 
* Handle PI_DEVICE_TYPE_DEFAULT in L0 plugin properly Signed-off-by: Artur Gainullin --- sycl/plugins/level_zero/pi_level0.cpp | 163 ++++++++++++++++---------- sycl/unittests/pi/BackendString.hpp | 3 +- sycl/unittests/pi/EnqueueMemTest.cpp | 14 ++- 3 files changed, 110 insertions(+), 70 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index f69668f076b96..624558e6a12d6 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -573,8 +573,9 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, // Get number of devices supporting L0 uint32_t ZeDeviceCount = 0; const bool AskingForGPU = (DeviceType & PI_DEVICE_TYPE_GPU); + const bool AskingForDefault = (DeviceType == PI_DEVICE_TYPE_DEFAULT); ZE_CALL(zeDeviceGet(ZeDriver, &ZeDeviceCount, nullptr)); - if (ZeDeviceCount == 0 || !AskingForGPU) { + if (ZeDeviceCount == 0 || !(AskingForGPU || AskingForDefault)) { if (NumDevices) *NumDevices = 0; return PI_SUCCESS; @@ -2122,15 +2123,18 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) return Res; - auto Res = piEventCreate(Kernel->Program->Context, Event); - if (Res != PI_SUCCESS) - return Res; + ze_event_handle_t ZeEvent = nullptr; + if (Event) { + auto Res = piEventCreate(Kernel->Program->Context, Event); + if (Res != PI_SUCCESS) + return Res; - (*Event)->Queue = Queue; - (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL; - (*Event)->ZeCommandList = ZeCommandList; + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_NDRANGE_KERNEL; + (*Event)->ZeCommandList = ZeCommandList; - ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + ZeEvent = (*Event)->ZeEvent; + } ze_event_handle_t *ZeEventWaitList = _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); @@ -2176,6 +2180,7 @@ pi_result piEventCreate(pi_context Context, pi_event 
*RetEvent) { ZE_CALL(zeEventCreate(ZeEventPool, &ZeEventDesc, &ZeEvent)); try { + assert(RetEvent); *RetEvent = new _pi_event(ZeEvent, ZeEventPool, Context, PI_COMMAND_TYPE_USER); } catch (const std::bad_alloc &) { @@ -2537,15 +2542,18 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) return Res; - auto Res = piEventCreate(Queue->Context, Event); - if (Res != PI_SUCCESS) - return Res; + ze_event_handle_t ZeEvent = nullptr; + if (Event) { + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; - (*Event)->Queue = Queue; - (*Event)->CommandType = CommandType; - (*Event)->ZeCommandList = ZeCommandList; + (*Event)->Queue = Queue; + (*Event)->CommandType = CommandType; + (*Event)->ZeCommandList = ZeCommandList; - ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + ZeEvent = (*Event)->ZeEvent; + } ze_event_handle_t *ZeEventWaitList = _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); @@ -2592,15 +2600,18 @@ static pi_result enqueueMemCopyRectHelper( if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) return Res; - auto Res = piEventCreate(Queue->Context, Event); - if (Res != PI_SUCCESS) - return Res; + ze_event_handle_t ZeEvent = nullptr; + if (Event) { + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; - (*Event)->Queue = Queue; - (*Event)->CommandType = CommandType; - (*Event)->ZeCommandList = ZeCommandList; + (*Event)->Queue = Queue; + (*Event)->CommandType = CommandType; + (*Event)->ZeCommandList = ZeCommandList; - ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + ZeEvent = (*Event)->ZeEvent; + } ze_event_handle_t *ZeEventWaitList = _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); @@ -2749,15 +2760,18 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, if (auto Res = 
Queue->Context->Device->createCommandList(&ZeCommandList)) return Res; - auto Res = piEventCreate(Queue->Context, Event); - if (Res != PI_SUCCESS) - return Res; + ze_event_handle_t ZeEvent = nullptr; + if (Event) { + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; - (*Event)->Queue = Queue; - (*Event)->CommandType = CommandType; - (*Event)->ZeCommandList = ZeCommandList; + (*Event)->Queue = Queue; + (*Event)->CommandType = CommandType; + (*Event)->ZeCommandList = ZeCommandList; - ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + ZeEvent = (*Event)->ZeEvent; + } ze_event_handle_t *ZeEventWaitList = _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); @@ -2824,13 +2838,18 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap, if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) return Res; - auto Res = piEventCreate(Queue->Context, Event); - if (Res != PI_SUCCESS) - return Res; + ze_event_handle_t ZeEvent = nullptr; + if (Event) { + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; - (*Event)->Queue = Queue; - (*Event)->CommandType = PI_COMMAND_TYPE_MEM_BUFFER_MAP; - (*Event)->ZeCommandList = ZeCommandList; + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_MEM_BUFFER_MAP; + (*Event)->ZeCommandList = ZeCommandList; + + ZeEvent = (*Event)->ZeEvent; + } ze_event_handle_t *ZeEventWaitList = _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); @@ -2865,7 +2884,6 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap, RetMap)); } - ze_event_handle_t ZeEvent = (*Event)->ZeEvent; ZE_CALL(zeCommandListAppendMemoryCopy( ZeCommandList, *RetMap, pi_cast(Buffer->getZeHandle()) + Offset, Size, ZeEvent)); @@ -2891,13 +2909,18 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr, // of unmap completion. 
assert(Event); - auto Res = piEventCreate(Queue->Context, Event); - if (Res != PI_SUCCESS) - return Res; + ze_event_handle_t ZeEvent = nullptr; + if (Event) { + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; - (*Event)->Queue = Queue; - (*Event)->CommandType = PI_COMMAND_TYPE_MEM_BUFFER_UNMAP; - (*Event)->ZeCommandList = ZeCommandList; + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_MEM_BUFFER_UNMAP; + (*Event)->ZeCommandList = ZeCommandList; + + ZeEvent = (*Event)->ZeEvent; + } ze_event_handle_t *ZeEventWaitList = _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); @@ -2914,7 +2937,6 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr, if (pi_result Res = MemObj->removeMapping(MappedPtr, MapInfo)) return Res; - ze_event_handle_t ZeEvent = (*Event)->ZeEvent; ZE_CALL(zeCommandListAppendMemoryCopy( ZeCommandList, pi_cast(MemObj->getZeHandle()) + MapInfo.Offset, MappedPtr, MapInfo.Size, ZeEvent)); @@ -2998,15 +3020,18 @@ enqueueMemImageCommandHelper(pi_command_type CommandType, pi_queue Queue, if (auto Res = Queue->Context->Device->createCommandList(&ZeCommandList)) return Res; - auto Res = piEventCreate(Queue->Context, Event); - if (Res != PI_SUCCESS) - return Res; + ze_event_handle_t ZeEvent = nullptr; + if (Event) { + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; - (*Event)->Queue = Queue; - (*Event)->CommandType = CommandType; - (*Event)->ZeCommandList = ZeCommandList; + (*Event)->Queue = Queue; + (*Event)->CommandType = CommandType; + (*Event)->ZeCommandList = ZeCommandList; - ze_event_handle_t ZeEvent = (*Event)->ZeEvent; + ZeEvent = (*Event)->ZeEvent; + } ze_event_handle_t *ZeEventWaitList = _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList); @@ -3349,13 +3374,18 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, return Res; // TODO: do we need to create a unique command 
type for this? - auto Res = piEventCreate(Queue->Context, Event); - if (Res != PI_SUCCESS) - return Res; + ze_event_handle_t ZeEvent = nullptr; + if (Event) { + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; + + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_USER; + (*Event)->ZeCommandList = ZeCommandList; - (*Event)->Queue = Queue; - (*Event)->CommandType = PI_COMMAND_TYPE_USER; - (*Event)->ZeCommandList = ZeCommandList; + ZeEvent = (*Event)->ZeEvent; + } ze_event_handle_t *ZeEventWaitList = _pi_event::createZeEventList(NumEventsInWaitlist, EventsWaitlist); @@ -3368,7 +3398,7 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, // TODO: L0 does not have a completion "event" with the prefetch API, // so manually add command to signal our event. - ZE_CALL(zeCommandListAppendSignalEvent(ZeCommandList, (*Event)->ZeEvent)); + ZE_CALL(zeCommandListAppendSignalEvent(ZeCommandList, ZeEvent)); if (auto Res = Queue->executeCommandList(ZeCommandList, false)) return Res; @@ -3433,20 +3463,25 @@ pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, return Res; // TODO: do we need to create a unique command type for this? - auto Res = piEventCreate(Queue->Context, Event); - if (Res != PI_SUCCESS) - return Res; + ze_event_handle_t ZeEvent = nullptr; + if (Event) { + auto Res = piEventCreate(Queue->Context, Event); + if (Res != PI_SUCCESS) + return Res; + + (*Event)->Queue = Queue; + (*Event)->CommandType = PI_COMMAND_TYPE_USER; + (*Event)->ZeCommandList = ZeCommandList; - (*Event)->Queue = Queue; - (*Event)->CommandType = PI_COMMAND_TYPE_USER; - (*Event)->ZeCommandList = ZeCommandList; + ZeEvent = (*Event)->ZeEvent; + } ZE_CALL(zeCommandListAppendMemAdvise( ZeCommandList, Queue->Context->Device->ZeDevice, Ptr, Length, ZeAdvice)); // TODO: L0 does not have a completion "event" with the advise API, // so manually add command to signal our event. 
- ZE_CALL(zeCommandListAppendSignalEvent(ZeCommandList, (*Event)->ZeEvent)); + ZE_CALL(zeCommandListAppendSignalEvent(ZeCommandList, ZeEvent)); Queue->executeCommandList(ZeCommandList, false); return PI_SUCCESS; diff --git a/sycl/unittests/pi/BackendString.hpp b/sycl/unittests/pi/BackendString.hpp index ee3d212fc70ee..cea0eee8b8338 100644 --- a/sycl/unittests/pi/BackendString.hpp +++ b/sycl/unittests/pi/BackendString.hpp @@ -15,9 +15,10 @@ inline const char *GetBackendString(cl::sycl::backend backend) { PI_BACKEND_STR(cuda); PI_BACKEND_STR(host); PI_BACKEND_STR(opencl); + PI_BACKEND_STR(level0); #undef PI_BACKEND_STR default: return "Unknown Plugin"; } } -} // namespace pi \ No newline at end of file +} // namespace pi diff --git a/sycl/unittests/pi/EnqueueMemTest.cpp b/sycl/unittests/pi/EnqueueMemTest.cpp index 319b34cadcc28..7e98bab095d14 100644 --- a/sycl/unittests/pi/EnqueueMemTest.cpp +++ b/sycl/unittests/pi/EnqueueMemTest.cpp @@ -50,10 +50,11 @@ class EnqueueMemTest : public testing::TestWithParam { _context, _device, 0, &_queue)), PI_SUCCESS); - ASSERT_EQ((plugin.call_nocheck( - _context, 0, _numElementsX * _numElementsY * sizeof(pi_int32), - nullptr, &_mem)), - PI_SUCCESS); + ASSERT_EQ( + (plugin.call_nocheck( + _context, PI_MEM_FLAGS_ACCESS_RW, + _numElementsX * _numElementsY * sizeof(pi_int32), nullptr, &_mem)), + PI_SUCCESS); } void TearDown() override { @@ -84,9 +85,12 @@ class EnqueueMemTest : public testing::TestWithParam { 0, nullptr, nullptr)), PI_SUCCESS); + pi_event event; ASSERT_EQ((plugin.call_nocheck( _queue, _mem, &pattern, sizeof(T), 0, sizeof(inValues), 0, - nullptr, nullptr)), + nullptr, &event)), + PI_SUCCESS); + ASSERT_EQ((plugin.call_nocheck(1, &event)), PI_SUCCESS); T outValues[_numElementsX] = {}; From d2164f76e237649137f70051bb0a8cb24ca2eb04 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Tue, 2 Jun 2020 10:57:59 -0700 Subject: [PATCH 21/21] Make ErrorMapping static Signed-off-by: Artur Gainullin --- 
sycl/plugins/level_zero/pi_level0.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level0.cpp b/sycl/plugins/level_zero/pi_level0.cpp index 624558e6a12d6..6a313e754f1dc 100755 --- a/sycl/plugins/level_zero/pi_level0.cpp +++ b/sycl/plugins/level_zero/pi_level0.cpp @@ -239,7 +239,7 @@ constexpr char ZE_SUPPORTED_EXTENSIONS[] = static pi_result mapError(ze_result_t ZeResult) { // TODO: these mapping need to be clarified and synced with the PI API return // values, which is TBD. - std::unordered_map ErrorMapping = { + static std::unordered_map ErrorMapping = { {ZE_RESULT_SUCCESS, PI_SUCCESS}, {ZE_RESULT_ERROR_DEVICE_LOST, PI_DEVICE_NOT_FOUND}, {ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS, PI_INVALID_OPERATION},