1 change: 1 addition & 0 deletions sycl/include/CL/sycl/detail/pi.def
@@ -103,6 +103,7 @@ _PI_API(piSamplerGetInfo)
_PI_API(piSamplerRetain)
_PI_API(piSamplerRelease)
// Queue commands
_PI_API(piKernelLaunch)
_PI_API(piEnqueueKernelLaunch)
_PI_API(piEnqueueNativeKernel)
_PI_API(piEnqueueEventsWait)
3 changes: 3 additions & 0 deletions sycl/include/CL/sycl/detail/pi.h
@@ -626,6 +626,7 @@ constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE =
constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = CL_QUEUE_ON_DEVICE;
constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT =
CL_QUEUE_ON_DEVICE_DEFAULT;
constexpr pi_queue_properties PI_QUEUE_LAZY_EXECUTION = 1 << 10;

using pi_result = _pi_result;
using pi_platform_info = _pi_platform_info;
@@ -1488,6 +1489,8 @@ __SYCL_EXPORT pi_result piSamplerRelease(pi_sampler sampler);
//
// Queue Commands
//
__SYCL_EXPORT pi_result piKernelLaunch(pi_queue queue);
Collaborator comment:
I have an idea for an alternative approach here that doesn't involve adding a new PI entry point: use piQueueFlush instead. It already exists in PI with the semantics of starting execution of lazily scheduled work, assuming it behaves the same as clFlush. I'd then expect a flush to happen on event wait as well as queue wait.

piQueueFlush is also more generic than kernel execution commands alone, so if in the future we wanted more than kernel launches to be lazily executed, we wouldn't need to add another entry point. For example, piEnqueueMemBufferCopy commands could be lazily enqueued and then flushed, rather than adding a piMemBufferCopy entry point to match piEnqueueMemBufferCopy the way piKernelLaunch has been added here to match piEnqueueKernelLaunch.
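To illustrate the suggestion, a rough sketch of the runtime-side hook (the IsLazyQueue flag and the function name are hypothetical and not part of this patch; piQueueFlush is the existing PI entry point):

// Hypothetical wait-point hook: when the queue was created with the lazy
// property, deferred work is started with the existing flush entry point
// rather than through a new piKernelLaunch call.
pi_result startDeferredWork(pi_queue Queue, bool IsLazyQueue) {
  if (!IsLazyQueue)
    return PI_SUCCESS; // nothing was held back; commands ran eagerly
  return piQueueFlush(Queue); // clFlush-like: begin executing queued work
}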


__SYCL_EXPORT pi_result piEnqueueKernelLaunch(
pi_queue queue, pi_kernel kernel, pi_uint32 work_dim,
const size_t *global_work_offset, const size_t *global_work_size,
3 changes: 2 additions & 1 deletion sycl/include/CL/sycl/detail/property_helper.hpp
@@ -35,8 +35,9 @@ enum DataLessPropKind {
UseDefaultStream = 8,
DiscardEvents = 9,
DeviceReadOnly = 10,
LazyExecution = 11,
// Indicates the last known dataless property.
LastKnownDataLessPropKind = 10,
LastKnownDataLessPropKind = 11,
// Exceeding 32 may cause ABI breaking change on some of OSes.
DataLessPropKindSize = 32
};
1 change: 1 addition & 0 deletions sycl/include/CL/sycl/feature_test.hpp.in
@@ -47,6 +47,7 @@ namespace sycl {
#define SYCL_EXT_ONEAPI_FREE_FUNCTION_QUERIES 1
#define SYCL_EXT_ONEAPI_GROUP_ALGORITHMS 1
#define SYCL_EXT_ONEAPI_GROUP_SORT 1
#define SYCL_EXT_ONEAPI_LAZY_QUEUE 1
#define SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY 1
#define SYCL_EXT_ONEAPI_ND_RANGE_REDUCTIONS 1
#define SYCL_EXT_ONEAPI_DEFAULT_CONTEXT 1
8 changes: 8 additions & 0 deletions sycl/include/CL/sycl/properties/queue_properties.hpp
@@ -28,6 +28,8 @@ namespace property {
namespace queue {
class discard_events : public ::cl::sycl::detail::DataLessProperty<
::cl::sycl::detail::DiscardEvents> {};
class lazy_execution : public ::cl::sycl::detail::DataLessProperty<
::cl::sycl::detail::LazyExecution> {};
} // namespace queue
} // namespace property

@@ -63,6 +65,9 @@ template <>
struct is_property<ext::oneapi::property::queue::discard_events>
: std::true_type {};
template <>
struct is_property<ext::oneapi::property::queue::lazy_execution>
: std::true_type {};
template <>
struct is_property<property::queue::cuda::use_default_stream> : std::true_type {
};
template <>
@@ -78,6 +83,9 @@ template <>
struct is_property_of<ext::oneapi::property::queue::discard_events, queue>
: std::true_type {};
template <>
struct is_property_of<ext::oneapi::property::queue::lazy_execution, queue>
: std::true_type {};
template <>
struct is_property_of<property::queue::cuda::use_default_stream, queue>
: std::true_type {};
template <>
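For context, a minimal sketch of how the new property would be attached to a queue (standard queue constructor taking a property_list; the LazyTask kernel name is illustrative and not part of this diff):

#include <CL/sycl.hpp>

int main() {
  // Queue that defers execution of submitted work until an explicit wait.
  sycl::queue Q{sycl::default_selector{},
                sycl::property_list{
                    sycl::ext::oneapi::property::queue::lazy_execution{}}};

  Q.submit([](sycl::handler &cgh) { cgh.single_task<class LazyTask>([] {}); });
  Q.wait(); // deferred work is expected to start here
  return 0;
}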
223 changes: 223 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/graph.hpp
@@ -0,0 +1,223 @@
//==--------- graph.hpp --- SYCL graph extension ---------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <CL/sycl/detail/defines_elementary.hpp>

#include <cstdlib>
#include <functional>
#include <list>
#include <memory>
#include <set>
#include <vector>

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace ext {
namespace oneapi {
namespace experimental {
namespace detail {

struct node_impl;

struct graph_impl;

using node_ptr = std::shared_ptr<node_impl>;

using graph_ptr = std::shared_ptr<graph_impl>;

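// Callable that replays a user command-group function, first registering the
// recorded dependency events on the handler.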
class wrapper {
using T = std::function<void(sycl::handler &)>;
T my_func;
std::vector<sycl::event> my_deps;

public:
wrapper(T t, const std::vector<sycl::event> &deps)
: my_func(t), my_deps(deps){};

void operator()(sycl::handler &cgh) {
cgh.depends_on(my_deps);
std::invoke(my_func, cgh);
}
};

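// A single graph node: stores the command-group function, the links to
// predecessor/successor nodes, and the event of its most recent submission.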
struct node_impl : std::enable_shared_from_this<node_impl> {
bool is_scheduled;

graph_ptr my_graph;
sycl::event my_event;

std::vector<node_ptr> my_successors;
std::vector<node_ptr> my_predecessors;

std::function<void(sycl::handler &)> my_body;

void exec(sycl::queue q) {
std::vector<sycl::event> deps;
for (auto i : my_predecessors)
deps.push_back(i->get_event());
my_event = q.submit(wrapper{my_body, deps});
}

void register_successor(node_ptr n) {
my_successors.push_back(n);
n->register_predecessor(shared_from_this());
}

void register_predecessor(node_ptr n) { my_predecessors.push_back(n); }

sycl::event get_event(void) { return my_event; }

template <typename T>
node_impl(graph_ptr g, T cgf)
: is_scheduled(false), my_graph(g), my_body(cgf) {}

// Recursively adding nodes to execution stack:
void topology_sort(std::list<node_ptr> &schedule) {
is_scheduled = true;
for (auto i : my_successors) {
if (!i->is_scheduled)
i->topology_sort(schedule);
}
schedule.push_front(shared_from_this());
}
};

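// Whole-graph state: the set of root nodes plus a cached topological schedule
// that is invalidated whenever a root is added or removed.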
struct graph_impl {
std::set<node_ptr> my_roots;
std::list<node_ptr> my_schedule;

graph_ptr parent;

void exec(sycl::queue q) {
if (my_schedule.empty()) {
for (auto n : my_roots) {
n->topology_sort(my_schedule);
}
}
for (auto n : my_schedule)
n->exec(q);
}

void exec_and_wait(sycl::queue q) {
exec(q);
q.wait();
}

void add_root(node_ptr n) {
my_roots.insert(n);
for (auto n : my_schedule)
n->is_scheduled = false;
my_schedule.clear();
}

void remove_root(node_ptr n) {
my_roots.erase(n);
for (auto n : my_schedule)
n->is_scheduled = false;
my_schedule.clear();
}

graph_impl() {}
};

} // namespace detail

class node;

class graph;

class executable_graph;

struct node {
// TODO: add properties to distinguish between empty, host, device nodes.
detail::node_ptr my_node;
detail::graph_ptr my_graph;

template <typename T>
node(detail::graph_ptr g, T cgf)
: my_graph(g), my_node(new detail::node_impl(g, cgf)){};
void register_successor(node n) { my_node->register_successor(n.my_node); }
void exec(sycl::queue q, sycl::event = sycl::event()) { my_node->exec(q); }

void set_root() { my_graph->add_root(my_node); }

// TODO: Add query functions: is_root, ...
};

class executable_graph {
public:
int my_tag;
sycl::queue my_queue;

void exec_and_wait(); // { my_queue.wait(); }

executable_graph(detail::graph_ptr g, sycl::queue q)
: my_queue(q), my_tag(rand()) {
g->exec(my_queue);
}
};

class graph {
public:
// Adding empty node with [0..n] predecessors:
node add_empty_node(const std::vector<node> &dep = {});

// Adding node for host task
template <typename T>
node add_host_node(T hostTaskCallable, const std::vector<node> &dep = {});

// Adding device node:
template <typename T>
node add_device_node(T cgf, const std::vector<node> &dep = {});

// Adding dependency between two nodes.
void make_edge(node sender, node receiver);

// TODO: Extend queue to directly submit graph
void exec_and_wait(sycl::queue q);

executable_graph exec(sycl::queue q) {
return executable_graph{my_graph, q};
};

graph() : my_graph(new detail::graph_impl()) {}

// Creating a subgraph (with predecessors)
graph(graph &parent, const std::vector<node> &dep = {}) {}

bool is_subgraph();

private:
detail::graph_ptr my_graph;
};

inline void executable_graph::exec_and_wait() { my_queue.wait(); }

template <typename T>
node graph::add_device_node(T cgf, const std::vector<node> &dep) {
node _node(my_graph, cgf);
if (!dep.empty()) {
for (auto n : dep)
this->make_edge(n, _node);
} else {
_node.set_root();
}
return _node;
}

inline void graph::make_edge(node sender, node receiver) {
sender.register_successor(receiver);     // register successor
my_graph->remove_root(receiver.my_node); // receiver is no longer a root
}

inline void graph::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); }

} // namespace experimental
} // namespace oneapi
} // namespace ext
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
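For readers skimming the new header, a small usage sketch of the API it declares (kernel names and queue setup are illustrative only; add_device_node, make_edge, and exec_and_wait are the functions defined above):

#include <CL/sycl.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>

namespace sycl_ext = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue q;
  sycl_ext::graph g;

  // A node with no dependencies is added to the graph's root set.
  auto a = g.add_device_node(
      [](sycl::handler &cgh) { cgh.single_task<class NodeA>([] {}); });

  // Passing {a} as the dependency list calls make_edge(a, b), which records
  // the edge and removes b from the root set.
  auto b = g.add_device_node(
      [](sycl::handler &cgh) { cgh.single_task<class NodeB>([] {}); }, {a});

  // Topologically sorts the nodes, submits each command group to q in order,
  // and waits for completion.
  g.exec_and_wait(q);
  return 0;
}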