Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Our current prototype implementation can be found here:
[https://github.com/reble/llvm/tree/sycl-graph-develop](https://github.com/reble/llvm/tree/sycl-graph-develop).

Limitations include:
* LevelZero backend support only.
* LevelZero backend support only. A fallback emulation mode is used for correctness on other backends.
* Accessors and reductions are currently not supported.

### Other Material
Expand All @@ -34,11 +34,6 @@ TDB

See [Get Started Guide](./sycl/doc/GetStartedGuide.md).

SYCL Graph support is enabled with:
* Configuration script: `configure.py --enable-sycl-graph`.
* CMake: `cmake -DSYCL_ENABLE_GRAPH`.

A fallback emulation mode is used otherwise that enables the graph API but eagerly submits kernels.
### Report a problem

Submit an [issue](https://github.com/intel/llvm/issues) or initiate a
Expand Down
6 changes: 0 additions & 6 deletions buildbot/configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ def do_configure(args):

if sys.platform != "darwin":
sycl_enabled_plugins.append("level_zero")

sycl_enable_graph = 'OFF'

# lld is needed on Windows or for the HIP plugin on AMD
if platform.system() == 'Windows' or (args.hip and args.hip_platform == 'AMD'):
Expand Down Expand Up @@ -110,9 +108,6 @@ def do_configure(args):

if args.use_lld:
llvm_enable_lld = 'ON'

if args.enable_sycl_graph:
sycl_enable_graph = 'ON'

# CI Default conditionally appends to options, keep it at the bottom of
# args handling
Expand Down Expand Up @@ -171,7 +166,6 @@ def do_configure(args):
"-DLLVM_ENABLE_SPHINX={}".format(llvm_enable_sphinx),
"-DBUILD_SHARED_LIBS={}".format(llvm_build_shared_libs),
"-DSYCL_ENABLE_XPTI_TRACING={}".format(sycl_enable_xpti_tracing),
"-DSYCL_ENABLE_GRAPH={}".format(sycl_enable_graph),
"-DLLVM_ENABLE_LLD={}".format(llvm_enable_lld),
"-DXPTI_ENABLE_WERROR={}".format(xpti_enable_werror),
"-DSYCL_CLANG_EXTRA_FLAGS={}".format(sycl_clang_extra_flags),
Expand Down
4 changes: 0 additions & 4 deletions sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,6 @@ endif()
# of the SYCL runtime and expect enabling
option(SYCL_ENABLE_XPTI_TRACING "Enable tracing of SYCL constructs" OFF)

# Create a soft option for enabling or disabling the experimental support
# for SYCl Graph
option(SYCL_ENABLE_GRAPH "Enable experimental SYCL Graph support" OFF)

if(MSVC)
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
# Skip asynchronous C++ exceptions catching and assume "extern C" functions
Expand Down
2 changes: 2 additions & 0 deletions sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,8 @@ typedef enum {
PI_EXT_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 0x20006,
PI_EXT_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 0x20007,
PI_EXT_INTEL_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 0x20008,
// Supports command-buffer extension entry-points
PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT = 0x20009,
} _pi_device_info;

typedef enum {
Expand Down
6 changes: 6 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2121,6 +2121,12 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
return PI_ERROR_INVALID_VALUE;

case PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT: {
// Using CUDA-Graphs as a backend for PI command-buffers no yet supported
return getInfo<pi_bool>(param_value_size, param_value, param_value_size_ret,
false);
}

default:
__SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
}
Expand Down
3 changes: 3 additions & 0 deletions sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,9 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
CASE_PI_UNSUPPORTED(PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_2D)
CASE_PI_UNSUPPORTED(PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D)

case PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT:
return ReturnValue(pi_bool{false});

default:
DIE_NO_IMPLEMENTATION;
}
Expand Down
6 changes: 6 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1984,6 +1984,12 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name,
PI_ERROR_INVALID_ARG_VALUE);
return PI_ERROR_PLUGIN_SPECIFIC_ERROR;

case PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT {
// Using HIP-Graphs as a backend for PI command-buffers no yet supported
return getInfo<pi_bool>(param_value_size, param_value, param_value_size_ret,
false);
}

default:
__SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
}
Expand Down
9 changes: 9 additions & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,15 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName,
std::memcpy(paramValue, &result, sizeof(cl_bool));
return PI_SUCCESS;
}

case PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT: {
// Using cl_khr_command_buffer extensions as a backend for PI
// command-buffers no yet supported
cl_bool result = false;
std::memcpy(paramValue, &result, sizeof(cl_bool));
return PI_SUCCESS;
}

default:
cl_int result = clGetDeviceInfo(
cast<cl_device_id>(device), cast<cl_device_info>(paramName),
Expand Down
3 changes: 3 additions & 0 deletions sycl/plugins/unified_runtime/pi2ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -791,6 +791,9 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
case PI_DEVICE_INFO_IMAGE_SRGB:
InfoType = (ur_device_info_t)UR_DEVICE_INFO_IMAGE_SRGB;
break;
case PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT:
InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_COMMAND_BUFFER_SUPPORT;
break;
case PI_DEVICE_INFO_BACKEND_VERSION: {
// TODO: return some meaningful for backend_version below
ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1017,6 +1017,8 @@ ur_result_t urDeviceGetInfo(
return ReturnValue(pi_bool{false});
case UR_DEVICE_INFO_IMAGE_SRGB:
return ReturnValue(pi_bool{false});
case UR_EXT_DEVICE_INFO_COMMAND_BUFFER_SUPPORT:
return ReturnValue(pi_bool{true});

// TODO: Implement.
default:
Expand Down
3 changes: 3 additions & 0 deletions sycl/plugins/unified_runtime/ur/ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ const int UR_EXT_DEVICE_INFO_FREE_MEMORY = UR_EXT_DEVICE_INFO_END - 13;
// const int ZER_EXT_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE =
// UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE;
const int UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT = UR_EXT_DEVICE_INFO_END - 15;
// TODO Use UR extension detection once extension mechanism developed
const int UR_EXT_DEVICE_INFO_COMMAND_BUFFER_SUPPORT =
UR_EXT_DEVICE_INFO_END - 16;

const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION =
(ur_device_info_t)0x103D;
Expand Down
180 changes: 103 additions & 77 deletions sycl/source/detail/graph_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
#include <sycl/feature_test.hpp>
#include <sycl/queue.hpp>

// Developer switch to use emulation mode on all backends, even those that
// report native support, this is useful for debugging.
#define FORCE_EMULATION_MODE 0

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {

Expand Down Expand Up @@ -337,12 +341,14 @@ exec_graph_impl::~exec_graph_impl() {
for (auto Iter : MPiCommandBuffers) {
const sycl::detail::plugin &Plugin =
sycl::detail::getSyclObjImpl(MContext)->getPlugin();
auto CmdBuf = Iter.second;
pi_result Res =
Plugin.call_nocheck<sycl::detail::PiApiKind::piextCommandBufferRelease>(
CmdBuf);
(void)Res;
assert(Res == pi_result::PI_SUCCESS);
if (auto CmdBuf = Iter.second; CmdBuf) {
pi_result Res =
Plugin
.call_nocheck<sycl::detail::PiApiKind::piextCommandBufferRelease>(
CmdBuf);
(void)Res;
assert(Res == pi_result::PI_SUCCESS);
}
}
}

Expand All @@ -355,81 +361,82 @@ sycl::event exec_graph_impl::enqueue(
NewEvent->setStateIncomplete();
return NewEvent;
});
#if SYCL_EXT_ONEAPI_GRAPH
auto NewEvent = CreateNewEvent();
RT::PiEvent *OutEvent = &NewEvent->getHandleRef();
auto CommandBuffer = MPiCommandBuffers[Queue->get_device()];

// If we have no requirements for accessors for the command buffer, enqueue it
// directly
if (MRequirements.empty()) {
pi_result Res =
Queue->getPlugin()
.call_nocheck<sycl::detail::PiApiKind::piextEnqueueCommandBuffer>(
CommandBuffer, Queue->getHandleRef(), RawEvents.size(),
RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
if (Res != pi_result::PI_SUCCESS) {
throw sycl::exception(
errc::event, "Failed to enqueue event for command buffer submission");
}
} else {
std::unique_ptr<sycl::detail::CG> CommandGroup =
std::make_unique<sycl::detail::CGExecCommandBuffer>(CommandBuffer,
MRequirements);

NewEvent = sycl::detail::Scheduler::getInstance().addCG(
std::move(CommandGroup), Queue);
}

#else
std::vector<std::shared_ptr<sycl::detail::event_impl>> ScheduledEvents;
for (auto &NodeImpl : MSchedule) {
std::vector<RT::PiEvent> RawEvents;

// If the node has no requirements for accessors etc. then we skip the
// scheduler and enqueue directly.
if (NodeImpl->MCGType == sycl::detail::CG::Kernel &&
NodeImpl->MCommandGroup->MRequirements.size() +
static_cast<sycl::detail::CGExecKernel *>(
NodeImpl->MCommandGroup.get())
->MStreams.size() ==
0) {
sycl::detail::CGExecKernel *CG =
static_cast<sycl::detail::CGExecKernel *>(
NodeImpl->MCommandGroup.get());
auto NewEvent = CreateNewEvent();
RT::PiEvent *OutEvent = &NewEvent->getHandleRef();
pi_int32 Res =
sycl::
detail::enqueueImpKernel(Queue, CG->MNDRDesc, CG->MArgs,
nullptr /* TODO: Handle KernelBundles */,
CG->MSyclKernel, CG->MKernelName,
CG->MOSModuleHandle, RawEvents, OutEvent,
nullptr /* TODO: Pass mem allocation func
for accessors */
,
PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT /* TODO: Extract from handler*/);
auto CommandBuffer = MPiCommandBuffers[Queue->get_device()];
sycl::detail::EventImplPtr NewEvent;

if (CommandBuffer) {
NewEvent = CreateNewEvent();
RT::PiEvent *OutEvent = &NewEvent->getHandleRef();

// If we have no requirements for accessors for the command buffer, enqueue
// it directly
if (MRequirements.empty()) {
pi_result Res =
Queue->getPlugin()
.call_nocheck<sycl::detail::PiApiKind::piextEnqueueCommandBuffer>(
CommandBuffer, Queue->getHandleRef(), RawEvents.size(),
RawEvents.empty() ? nullptr : &RawEvents[0], OutEvent);
if (Res != pi_result::PI_SUCCESS) {
throw sycl::exception(
sycl::errc::kernel,
"Error during emulated graph command group submission.");
errc::event,
"Failed to enqueue event for command buffer submission");
}
ScheduledEvents.push_back(NewEvent);
} else {
std::unique_ptr<sycl::detail::CG> CommandGroup =
std::make_unique<sycl::detail::CGExecCommandBuffer>(CommandBuffer,
MRequirements);

sycl::detail::EventImplPtr EventImpl =
sycl::detail::Scheduler::getInstance().addCG(
std::move(NodeImpl->getCGCopy()), Queue);

ScheduledEvents.push_back(EventImpl);
NewEvent = sycl::detail::Scheduler::getInstance().addCG(
std::move(CommandGroup), Queue);
}
} else {
std::vector<std::shared_ptr<sycl::detail::event_impl>> ScheduledEvents;
for (auto &NodeImpl : MSchedule) {
std::vector<RT::PiEvent> RawEvents;

// If the node has no requirements for accessors etc. then we skip the
// scheduler and enqueue directly.
if (NodeImpl->MCGType == sycl::detail::CG::Kernel &&
NodeImpl->MCommandGroup->MRequirements.size() +
static_cast<sycl::detail::CGExecKernel *>(
NodeImpl->MCommandGroup.get())
->MStreams.size() ==
0) {
sycl::detail::CGExecKernel *CG =
static_cast<sycl::detail::CGExecKernel *>(
NodeImpl->MCommandGroup.get());
NewEvent = CreateNewEvent();
RT::PiEvent *OutEvent = &NewEvent->getHandleRef();
pi_int32 Res = sycl::detail::enqueueImpKernel(
Queue, CG->MNDRDesc, CG->MArgs,
// TODO: Handler KernelBundles
nullptr, CG->MSyclKernel, CG->MKernelName, CG->MOSModuleHandle,
RawEvents, OutEvent,
// TODO: Pass accessor mem allocations
nullptr,
// TODO: Extract from handler
PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT);
if (Res != pi_result::PI_SUCCESS) {
throw sycl::exception(
sycl::errc::kernel,
"Error during emulated graph command group submission.");
}
ScheduledEvents.push_back(NewEvent);
} else {

sycl::detail::EventImplPtr EventImpl =
sycl::detail::Scheduler::getInstance().addCG(
std::move(NodeImpl->getCGCopy()), Queue);

ScheduledEvents.push_back(EventImpl);
}
}
// Create an event which has all kernel events as dependencies
NewEvent = std::make_shared<sycl::detail::event_impl>(Queue);
NewEvent->setStateIncomplete();
NewEvent->getPreparedDepsEvents() = ScheduledEvents;
}
// Create an event which has all kernel events as dependencies
sycl::detail::EventImplPtr NewEvent =
std::make_shared<sycl::detail::event_impl>(Queue);
NewEvent->setStateIncomplete();
NewEvent->getPreparedDepsEvents() = ScheduledEvents;
#endif

sycl::event QueueEvent =
sycl::detail::createSyclObjFromImpl<sycl::event>(NewEvent);
Expand Down Expand Up @@ -558,11 +565,30 @@ command_graph<graph_state::executable>::command_graph(
void command_graph<graph_state::executable>::finalize_impl() {
// Create PI command-buffers for each device in the finalized context
impl->schedule();
#if SYCL_EXT_ONEAPI_GRAPH
for (auto device : impl->get_context().get_devices()) {
impl->create_pi_command_buffers(device);
}

auto Context = impl->get_context();
for (auto Device : impl->get_context().get_devices()) {
pi_bool CmdBufSupport;

const sycl::detail::plugin &Plugin =
sycl::detail::getSyclObjImpl(Context)->getPlugin();

auto DeviceImpl = sycl::detail::getSyclObjImpl(Device);
Plugin.call<sycl::detail::PiApiKind::piDeviceGetInfo>(
DeviceImpl->getHandleRef(),
PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_SUPPORT, sizeof(pi_bool),
&CmdBufSupport, nullptr);

#if FORCE_EMULATION_MODE
// Above query should still succeed in emulation mode, but ignore the
// result and use emulation.
CmdBufSupport = false;
#endif

if (CmdBufSupport) {
impl->create_pi_command_buffers(Device);
}
}
}

void command_graph<graph_state::executable>::update(
Expand Down
3 changes: 0 additions & 3 deletions sycl/source/feature_test.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
#define SYCL_EXT_CODEPLAY_KERNEL_FUSION 1
#endif
#define SYCL_EXT_INTEL_CACHE_CONFIG 1
#cmakedefine01 SYCL_ENABLE_GRAPH
#if SYCL_ENABLE_GRAPH
#define SYCL_EXT_ONEAPI_GRAPH 1
#endif

#ifndef __has_include
#define __has_include(x) 0
Expand Down
3 changes: 1 addition & 2 deletions sycl/test-e2e/Graph/vendor_test_macro.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// REQUIRES: level_zero, gpu, TEMPORARY_DISABLED
// Disabled as emulation mode doesn't set macro
// REQUIRES: level_zero, gpu

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
Expand Down