Skip to content
Closed
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1acf57e
Inital version of sycl graph prototype
reble Feb 18, 2022
d286c71
Adding initial sycl graph doc
reble Feb 18, 2022
656f5c3
Adding lazy execution property to queue
reble Feb 15, 2022
0bad787
fix merge
reble Feb 22, 2022
a8b5b32
Update pi_level_zero.cpp
reble Feb 22, 2022
2b50af4
update extension proposal started to incorporate feedback
reble Mar 11, 2022
047839b
typo
reble Mar 11, 2022
f957996
fix typos and syntax issues
reble May 3, 2022
0d8a5f4
Apply suggestions from code review
reble Mar 14, 2022
50d49a1
Propagate lazy queue property
julianmi May 3, 2022
9b46c4b
fix formatting issues
reble May 6, 2022
7d81618
fix issue introd. by recent merge
reble May 6, 2022
7917086
fix formatting
reble May 10, 2022
a3164de
update API to recent proposal
reble Oct 12, 2022
8850b18
fix rebase issue
reble Oct 12, 2022
446ac53
revert changes to level-zero plugin
reble Oct 18, 2022
fa7494d
starting to rework lazy execution logic
reble Oct 18, 2022
7581915
bugfix
reble Oct 18, 2022
38da3c6
add basic tests
reble Oct 18, 2022
fa58aa3
renaming macro and bugfix
reble Oct 20, 2022
4478390
clang-format
reble Nov 1, 2022
383459c
Renaming variables
reble Nov 1, 2022
f71ea49
Common changes from record & replay API (#32)
EwanC Nov 21, 2022
df971e5
[SYCL] Minor graph classes refactor (#36)
Bensuo Nov 24, 2022
2cf9d0f
Cosmetic changes
reble Nov 30, 2022
9f127d7
[SYCL] Repro for reduction fail
Nov 18, 2022
578692f
[SYCL] PIMPL refactor
Nov 24, 2022
7bb11ce
[SYCL] Use handler to execute graph
Nov 30, 2022
3073cfc
[SYCL] Clean-up lazy queue PI changes
Dec 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,7 @@ constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE = (1 << 1);
constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = (1 << 2);
constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = (1 << 3);
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS = (1 << 4);
constexpr pi_queue_properties PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION = (1 << 11);

using pi_result = _pi_result;
using pi_platform_info = _pi_platform_info;
Expand Down
3 changes: 2 additions & 1 deletion sycl/include/sycl/detail/property_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@ enum DataLessPropKind {
UseDefaultStream = 8,
DiscardEvents = 9,
DeviceReadOnly = 10,
LazyExecution = 11,
// Indicates the last known dataless property.
LastKnownDataLessPropKind = 10,
LastKnownDataLessPropKind = 11,
// Exceeding 32 may cause ABI breaking change on some of OSes.
DataLessPropKindSize = 32
};
Expand Down
222 changes: 222 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/graph.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
//==--------- graph.hpp --- SYCL graph extension ---------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/detail/defines_elementary.hpp>

#include <list>
#include <set>

//__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace ext {
namespace oneapi {
namespace experimental {
namespace detail {

struct node_impl;

struct graph_impl;

using node_ptr = std::shared_ptr<node_impl>;

using graph_ptr = std::shared_ptr<graph_impl>;

class wrapper {
using T = std::function<void(sycl::handler &)>;
T my_func;
std::vector<sycl::event> my_deps;

public:
wrapper(T t, const std::vector<sycl::event> &deps)
: my_func(t), my_deps(deps){};

void operator()(sycl::handler &cgh) {
cgh.depends_on(my_deps);
std::invoke(my_func, cgh);
}
};

struct node_impl {
bool is_scheduled;

graph_ptr my_graph;
sycl::event my_event;

std::vector<node_ptr> my_successors;
std::vector<node_ptr> my_predecessors;

std::function<void(sycl::handler &)> my_body;

void exec(sycl::queue q) {
std::vector<sycl::event> __deps;
for (auto i : my_predecessors)
__deps.push_back(i->get_event());
my_event = q.submit(wrapper{my_body, __deps});
}

void register_successor(node_ptr n) {
my_successors.push_back(n);
n->register_predecessor(node_ptr(this));
}

void register_predecessor(node_ptr n) { my_predecessors.push_back(n); }

sycl::event get_event(void) { return my_event; }

template <typename T>
node_impl(graph_ptr g, T cgf)
: is_scheduled(false), my_graph(g), my_body(cgf) {}

// Recursively adding nodes to execution stack:
void topology_sort(std::list<node_ptr> &schedule) {
is_scheduled = true;
for (auto i : my_successors) {
if (!i->is_scheduled)
i->topology_sort(schedule);
}
schedule.push_front(node_ptr(this));
}
};

struct graph_impl {
std::set<node_ptr> my_roots;
std::list<node_ptr> my_schedule;
// TODO: Change one time initialization to per executable object
bool first;

graph_ptr parent;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can't see this used anywhere

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intended use are nested graphs. This is WIP. I'll add a comment.


void exec(sycl::queue q) {
if (my_schedule.empty()) {
for (auto n : my_roots) {
n->topology_sort(my_schedule);
}
}
for (auto n : my_schedule)
n->exec(q);
}

void exec_and_wait(sycl::queue q) {
if(first) {
exec(q);
first=false;
}
q.wait();
}

void add_root(node_ptr n) {
my_roots.insert(n);
for (auto n : my_schedule)
n->is_scheduled = false;
my_schedule.clear();
}

void remove_root(node_ptr n) {
my_roots.erase(n);
for (auto n : my_schedule)
n->is_scheduled = false;
my_schedule.clear();
}

graph_impl() : first(true) {}
};

} // namespace detail

struct node {
detail::node_ptr my_node;
detail::graph_ptr my_graph;

template <typename T>
node(detail::graph_ptr g, T cgf)
: my_graph(g), my_node(new detail::node_impl(g, cgf)){};
void register_successor(node n) { my_node->register_successor(n.my_node); }
void exec(sycl::queue q, sycl::event = sycl::event()) { my_node->exec(q); }

void set_root() { my_graph->add_root(my_node); }
};

enum class graph_state{
modifiable,
executable
};

template<graph_state State=graph_state::modifiable>
class command_graph {
public:
// Adding empty node with [0..n] predecessors:
node add(const std::vector<node> &dep = {});

// Adding device node:
template <typename T>
node add(T cgf, const std::vector<node> &dep = {});

// Adding dependency between two nodes.
void make_edge(node sender, node receiver);

// TODO: Extend queue to directly submit graph
void exec_and_wait(sycl::queue q);

command_graph<graph_state::executable> finalize(const sycl::context &syclContext) const;

command_graph() : my_graph(new detail::graph_impl()) {}

private:
detail::graph_ptr my_graph;
};

template<>
class command_graph<graph_state::executable>{
public:
int my_tag;
const sycl::context& my_ctx;

void exec_and_wait(sycl::queue q);

command_graph() = delete;

command_graph(detail::graph_ptr g, const sycl::context& ctx)
: my_graph(g) , my_ctx(ctx), my_tag(rand()) {}

private:
detail::graph_ptr my_graph;
};

template<> template<typename T>
node command_graph<graph_state::modifiable>::add(T cgf, const std::vector<node> &dep) {
node _node(my_graph, cgf);
if (!dep.empty()) {
for (auto n : dep)
this->make_edge(n, _node);
} else {
_node.set_root();
}
return _node;
}

template<>
void command_graph<graph_state::modifiable>::make_edge(node sender, node receiver) {
sender.register_successor(receiver); // register successor
my_graph->remove_root(receiver.my_node); // remove receiver from root node
// list
}

template<>
command_graph<graph_state::executable> command_graph<graph_state::modifiable>::finalize(const sycl::context &ctx) const {
return command_graph<graph_state::executable>{ this->my_graph, ctx };
}

void command_graph<graph_state::executable>::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); };

} // namespace experimental
} // namespace oneapi
} // namespace ext
} // namespace sycl
//} // __SYCL_INLINE_NAMESPACE(cl)
1 change: 1 addition & 0 deletions sycl/include/sycl/feature_test.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
#define SYCL_EXT_ONEAPI_FREE_FUNCTION_QUERIES 1
#define SYCL_EXT_ONEAPI_GROUP_ALGORITHMS 1
#define SYCL_EXT_ONEAPI_GROUP_SORT 1
#define SYCL_EXT_ONEAPI_LAZY_QUEUE 1
#define SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY 1
#define SYCL_EXT_ONEAPI_ND_RANGE_REDUCTIONS 1
#define SYCL_EXT_ONEAPI_DEFAULT_CONTEXT 1
Expand Down
8 changes: 8 additions & 0 deletions sycl/include/sycl/properties/queue_properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ namespace property {
namespace queue {
class discard_events
: public ::sycl::detail::DataLessProperty<::sycl::detail::DiscardEvents> {};
class lazy_execution
: public ::sycl::detail::DataLessProperty<::sycl::detail::LazyExecution> {};
} // namespace queue
} // namespace property

Expand Down Expand Up @@ -65,6 +67,9 @@ template <>
struct is_property<ext::oneapi::property::queue::discard_events>
: std::true_type {};
template <>
struct is_property<ext::oneapi::property::queue::lazy_execution>
: std::true_type {};
template <>
struct is_property<property::queue::cuda::use_default_stream> : std::true_type {
};
template <>
Expand All @@ -80,6 +85,9 @@ template <>
struct is_property_of<ext::oneapi::property::queue::discard_events, queue>
: std::true_type {};
template <>
struct is_property_of<ext::oneapi::property::queue::lazy_execution, queue>
: std::true_type {};
template <>
struct is_property_of<property::queue::cuda::use_default_stream, queue>
: std::true_type {};
template <>
Expand Down
70 changes: 69 additions & 1 deletion sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1306,6 +1306,45 @@ pi_result resetCommandLists(pi_queue Queue) {
pi_result _pi_context::getAvailableCommandList(
pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine,
bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) {

// This is a hack. TODO: Proper CommandList allocation per Executable Graph.
if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) {
// TODO: Create new Command List.
if(Queue->LazyCommandListMap.empty()) {
const bool UseCopyEngine = false;
// Adding createCommandList() to LazyCommandListMap
ze_fence_handle_t ZeFence;
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
ze_command_list_handle_t ZeCommandList;

uint32_t QueueGroupOrdinal;
auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
auto &ZeCommandQueue =
//ForcedCmdQueue ? *ForcedCmdQueue :
QGroup.getZeQueue(&QueueGroupOrdinal);
//if (ForcedCmdQueue)
// QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue);

ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;

ZE_CALL(zeCommandListCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice,
&ZeCommandListDesc, &ZeCommandList));

ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
std::tie(CommandList, std::ignore) = Queue->LazyCommandListMap.insert(
std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
ZeCommandList, {ZeFence, false, ZeCommandQueue, QueueGroupOrdinal}));

Queue->insertActiveBarriers(CommandList, UseCopyEngine);
//
CommandList->second.ZeFenceInUse = true;
} else {
CommandList = Queue->LazyCommandListMap.begin();
}
return PI_SUCCESS;
}

// Immediate commandlists have been pre-allocated and are always available.
if (Queue->Device->useImmediateCommandLists()) {
CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
Expand Down Expand Up @@ -1544,6 +1583,9 @@ void _pi_queue::CaptureIndirectAccesses() {
pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
bool IsBlocking,
bool OKToBatchCommand) {
// When executing a Graph, defer execution
if( this->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) return PI_SUCCESS;

bool UseCopyEngine = CommandList->second.isCopy(this);

// If the current LastCommandEvent is the nullptr, then it means
Expand Down Expand Up @@ -3509,7 +3551,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE |
PI_QUEUE_ON_DEVICE_DEFAULT |
PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS)),
PI_EXT_ONEAPI_QUEUE_DISCARD_EVENTS |
PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION)),
PI_ERROR_INVALID_VALUE);

PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
Expand Down Expand Up @@ -3783,6 +3826,31 @@ pi_result piQueueFinish(pi_queue Queue) {
// Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList,
// so this can be left as a no-op.
pi_result piQueueFlush(pi_queue Queue) {
if( Queue->Properties & PI_EXT_ONEAPI_QUEUE_LAZY_EXECUTION ) {
pi_command_list_ptr_t CommandList{};
// TODO:
CommandList = Queue->LazyCommandListMap.begin();

auto &ZeCommandQueue = CommandList->second.ZeQueue;
// Scope of the lock must be till the end of the function, otherwise new mem
// allocs can be created between the moment when we made a snapshot and the
// moment when command list is closed and executed. But mutex is locked only
// if indirect access tracking enabled, because std::defer_lock is used.
// unique_lock destructor at the end of the function will unlock the mutex
// if it was locked (which happens only if IndirectAccessTrackingEnabled is
// true).
std::unique_lock<pi_shared_mutex> ContextsLock(
Queue->Device->Platform->ContextsMutex, std::defer_lock);

// Close the command list and have it ready for dispatch.
ZE_CALL(zeCommandListClose, (CommandList->first));

// Offload command list to the GPU for asynchronous execution
auto ZeCommandList = CommandList->first;
auto ZeResult = ZE_CALL_NOCHECK(
zeCommandQueueExecuteCommandLists,
(ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence));
}
(void)Queue;
return PI_SUCCESS;
}
Expand Down
2 changes: 2 additions & 0 deletions sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,8 @@ struct _pi_queue : _pi_object {

// Map of all command lists used in this queue.
pi_command_list_map_t CommandListMap;
// TODO: Assign Graph related command lists to command_graph object
pi_command_list_map_t LazyCommandListMap;

// Helper data structure to hold all variables related to batching
typedef struct CommandBatch {
Expand Down
Loading