From 7c62056b70631eedda99b24c10595d2948ebdf40 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Fri, 18 Feb 2022 15:16:27 -0600
Subject: [PATCH 01/18] Inital version of sycl graph prototype

---
 .../sycl/ext/oneapi/experimental/graph.hpp    | 212 ++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 sycl/include/sycl/ext/oneapi/experimental/graph.hpp
diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
new file mode 100644
index 0000000000000..08a7d094e9054
--- /dev/null
+++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
@@ -0,0 +1,212 @@
+//==--------- graph.hpp --- SYCL graph extension ---------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <CL/sycl/detail/defines_elementary.hpp>
+
+#include <set>
+#include <list>
+
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+namespace ext {
+namespace oneapi {
+namespace experimental {
+namespace detail {
+
+struct node_impl;
+
+struct graph_impl;
+
+using node_ptr = std::shared_ptr<node_impl>;
+
+using graph_ptr = std::shared_ptr<graph_impl>;
+
+class wrapper {
+    using T = std::function<void(sycl::handler&)>;
+    T my_func;
+    std::vector<sycl::event> my_deps;
+public:
+    wrapper(T t, const std::vector<sycl::event>& deps) : my_func(t), my_deps(deps) {};
+
+    void operator()(sycl::handler& cgh) {
+        cgh.depends_on(my_deps);
+        std::invoke(my_func,cgh);
+    }
+};
+
+struct node_impl {
+    bool is_scheduled;
+
+    graph_ptr my_graph;
+    sycl::event my_event;
+
+    std::vector<node_ptr> my_successors;
+    std::vector<node_ptr> my_predecessors;
+
+    std::function<void(sycl::handler&)> my_body;
+
+    void exec( sycl::queue q ) {
+        std::vector<sycl::event> __deps;
+        for(auto i:my_predecessors) __deps.push_back(i->get_event());
+        my_event = q.submit(wrapper{my_body,__deps});
+    }
+
+    void register_successor(node_ptr n) {
+        my_successors.push_back(n);
+        n->register_predecessor(node_ptr(this));
+    }
+
+    void register_predecessor(node_ptr n) { my_predecessors.push_back(n); }
+
+    sycl::event get_event(void) {return my_event;}
+
+    template<typename T>
+    node_impl(graph_ptr g, T cgf) : is_scheduled(false), my_graph(g), my_body(cgf) {}
+
+    // Recursively adding nodes to execution stack:
+    void topology_sort(std::list<node_ptr>& schedule) {
+        is_scheduled = true;
+        for(auto i:my_successors) {
+            if(!i->is_scheduled) i->topology_sort(schedule);
+        }
+        schedule.push_front(node_ptr(this));
+    }
+};
+
+struct graph_impl {
+    std::set<node_ptr> my_roots;
+    std::list<node_ptr> my_schedule;
+
+    graph_ptr parent;
+
+    void exec( sycl::queue q ) {
+        if( my_schedule.empty() ) {
+            for(auto n : my_roots) {
+                n->topology_sort(my_schedule);
+            }
+        }
+        for(auto n : my_schedule) n->exec(q);
+    }
+
+    void exec_and_wait( sycl::queue q ) {
+        exec(q);
+        q.wait();
+    }
+
+    void add_root(node_ptr n) {
+        my_roots.insert(n);
+        for(auto n : my_schedule) n->is_scheduled=false;
+        my_schedule.clear();
+    }
+
+    void remove_root(node_ptr n) {
+        my_roots.erase(n);
+        for(auto n : my_schedule) n->is_scheduled=false;
+        my_schedule.clear();
+    }
+
+    graph_impl() {}
+};
+
+} // namespace detail
+
+class node;
+
+class graph;
+
+class executable_graph;
+
+struct node {
+    // TODO: add properties to distinguish between empty, host, device nodes.
+    detail::node_ptr my_node;
+    detail::graph_ptr my_graph;
+
+    template<typename T>
+    node(detail::graph_ptr g, T cgf) : my_graph(g), my_node(new detail::node_impl(g,cgf)) {};
+    void register_successor(node n) { my_node->register_successor(n.my_node); }
+    void exec( sycl::queue q, sycl::event = sycl::event() ) { my_node->exec(q); }
+
+    void set_root() {  my_graph->add_root(my_node);}
+
+    // TODO: Add query functions: is_root, ...
+};
+
+class executable_graph {
+public:
+    int my_tag;
+    sycl::queue my_queue;
+
+    void exec_and_wait();// { my_queue.wait(); }
+
+    executable_graph(detail::graph_ptr g, sycl::queue q) : my_queue(q), my_tag(rand()) {
+      g->exec(my_queue);
+    }
+};
+
+class graph {
+public:
+    // Adding empty node with [0..n] predecessors:
+    node add_empty_node(const std::vector<node>& dep = {});
+
+    // Adding node for host task 
+    template<typename T>
+    node add_host_node(T hostTaskCallable, const std::vector<node>& dep = {});
+
+    // Adding device node:
+    template<typename T>
+    node add_device_node(T cgf, const std::vector<node>& dep = {});
+
+    // Adding dependency between two nodes.
+    void make_edge(node sender, node receiver);
+
+    // TODO: Extend queue to directly submit graph
+    void exec_and_wait( sycl::queue q );
+
+    executable_graph exec( sycl::queue q ) { return executable_graph{my_graph,q};};
+
+    graph() : my_graph(new detail::graph_impl()) {}
+
+    // Creating a subgraph (with predecessors)
+    graph(graph& parent, const std::vector<node>& dep = {}) {}
+
+    bool is_subgraph();
+
+private:
+    detail::graph_ptr my_graph;
+};
+
+void executable_graph::exec_and_wait() { my_queue.wait(); }
+
+template<typename T>
+node graph::add_device_node(T cgf , const std::vector<node>& dep) {
+    node _node(my_graph,cgf);
+    if( !dep.empty() ) {
+        for(auto n : dep) this->make_edge(n,_node);
+    } else {
+        _node.set_root();
+    }
+    return _node;
+}
+
+void graph::make_edge(node sender, node receiver) {
+    sender.register_successor(receiver);//register successor
+    my_graph->remove_root(receiver.my_node); //remove receiver from root node list
+}
+
+void graph::exec_and_wait( sycl::queue q ) {
+    my_graph->exec_and_wait(q);
+};
+
+} // namespace experimental
+} // namespace oneapi
+} // namespace ext
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)
+

From 59bb7da776178997cfe8953e39e1432639a02f8c Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Fri, 18 Feb 2022 15:15:10 -0600
Subject: [PATCH 02/18] Adding initial sycl graph doc

---
 .../SYCL_EXT_ONEAPI_GRAPH.asciidoc            | 290 ++++++++++++++++++
 1 file changed, 290 insertions(+)
 create mode 100644 sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
new file mode 100644
index 0000000000000..3bb7051730b7d
--- /dev/null
+++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
@@ -0,0 +1,290 @@
+= SYCL_EXT_ONEAPI_GRAPH
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+
+:blank: pass:[ +]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+== Notice
+
+Copyright (c) 2022 Intel Corporation.  All rights reserved.
+
+IMPORTANT: This specification is a draft.
+
+NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
+trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc.
+used by permission by Khronos.
+
+NOTE: This document is better viewed when rendered as html with asciidoctor.
+GitHub does not render image icons.
+
+This extension is written against the SYCL 2020 revision 4 specification. All
+references below to the "core SYCL specification" or to section numbers in the
+SYCL specification refer to that revision.
+
+NOTE: This extension is experimental: interfaces are subject to change later.
+
+== Introduction
+
+This extension introduces an interface that enables a lazy execution and easy replay of a kernel graph by separating
+Its definition and execution. 
+
+== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification section 6.3.3 "Feature test macros".  Therefore, an
+implementation supporting this extension must predefine the macro
+`SYCL_EXT_ONEAPI_GRAPH` to one of the values defined in the table below.
+Applications can test for the existence of this macro to determine if the
+implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's APIs the implementation supports.
+
+Table 1. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro.
+[%header,cols="1,5"]
+|===
+|Value |Description
+|1     |Initial extension version. Base features are supported.
+|===
+
+== SYCL Graph Terminology
+
+Table 2. Terminology.
+|===
+|Concept|Description
+|graph| Class that stores structured work units and their dependencies
+|node| The unit of work. Can have different attributes.
+|edge| Dependency between work units. Happens before relation.
+|===
+
+== Node
+
+Node is a class that can encapsulate SYCL kernel functions or host tasks for deferred execution.
+A graph has to be created first, the structure of a graph is defined second by adding nodes and edges.
+
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental {
+
+	class node{
+	};
+}
+----
+
+NOTE:
+
+== Edge
+
+A dependency between two nodes representing a happens before relationship.
+
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental {
+
+	// Adding dependency between two nodes.
+    	void make_edge(node sender, node receiver);
+}
+----
+
+== Graph
+
+Graph is a class that represents a directed acyclic graph of nodes. 
+A graph can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last.
+Member functions as listed in Table 2 and 3 can be used to add nodes to a graph.
+
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental {
+
+  class graph {
+  };
+
+}
+----
+
+=== Executable Graph
+
+`executable_graph` represents a user generated device and context specific execution object that can be submitted to a queue for execution. 
+The structure of an `executable_graph` object, such as adding nodes or edges, can not be changed.
+Each `executable_graph` object can only be executed once at the same time on its assigned queue.
+
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental {
+
+  class executable_graph {
+  };
+
+}
+----
+
+
+Table 3. Constructors of the `graph` class.
+|===
+|Constructor|Description
+
+|`graph()`
+|Creates a `graph` object
+
+|`graph(graph& parent)`
+|Creates a nested `graph` object
+
+|===
+
+Table 4. Member functions of the `graph` class.
+|===
+|Member function|Description
+
+|`node add_empty_node(const std::vector<node>& dep = {});`
+|This node holds no task that is scheduled for execution. It's intended use is a synchronization point inside a graph, this node can significantly reduce the number of edges ( O(n) vs. O(n^2) ) .
+
+|`template<typename T>
+    	node add_host_node(T hostTaskCallable, const std::vector<node>& dep = {});`
+|This node captures a host task, a native C++ callable which is scheduled by the SYCL runtime.
+
+|`template<typename T>
+    	node add_device_node(T cgf, const std::vector<node>& dep = {});`
+|This node captures a SYCL function for invoking kernels, with all restrictions that apply as described in the spec.
+
+|`template<typename T>
+    	executable_graph make_executable(const queue& syclQueue);`
+|Returns a queue specific graph object that can be submitted to a queue.
+
+|`template<typename T>
+    	executable_graph make_executable(const device& syclDevice, const context& syclContext);`
+|Returns a device and context specific graph object that can be submitted to a queue.
+
+|===
+
+Table 5. Member functions of the `graph` class (memory operations).
+|===
+|Member function|Description
+
+|`node add_memcpy_node(void* dest, const void* src, size_t numBytes, const std::vector<node>& dep = {});`
+|Adding a node that encapsulates a `memcpy` operation.
+
+|`node add_memset_node(void* ptr, int value, size_t numBytes, const std::vector<node>& dep = {});`
+|Adding a node that encapsulates a `memset` operation.
+
+|`node add_malloc_node(void *data, size_t numBytes, usm::alloc kind, const std::vector<node>& dep = {});`
+|Adding a node that encapsulates a `malloc` operation.
+
+|`node add_free_node(void *data, const std::vector<node>& dep = {});`
+|Adding a node that encapsulates a `free` operation.
+
+|===
+
+
+== Examples
+
+1. Dot product
+
+[source,c++]
+----
+...
+
+#include <sycl/ext/oneapi/experimental/graph.hpp>
+
+int main() {
+  const size_t n = 10;
+  float alpha = 1.0f;
+  float beta = 2.0f;
+  float gamma = 3.0f;
+
+#ifndef POC_IMPL
+  sycl::queue q;
+#else
+  sycl::property_list p{sycl::ext::oneapi::property::queue::lazy_execution{}};
+  sycl::queue q{p};
+#endif
+
+  sycl::ext::oneapi::experimental::graph g;
+
+  float *x = sycl::malloc_shared<float>(n, q);
+  float *y = sycl::malloc_shared<float>(n, q);
+  float *z = sycl::malloc_shared<float>(n, q);
+
+  float *dotp = sycl::malloc_shared<float>(1, q);
+
+  for (int i = 0; i < n; i++) {
+    x[i] = 1.0f;
+    y[i] = 2.0f;
+    z[i] = 3.0f;
+  }
+
+  auto node_a = g.add_device_node([&](sycl::handler &h) {
+    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
+      const size_t i = it[0];
+      x[i] = alpha * x[i] + beta * y[i];
+    });
+  });
+
+  auto node_b = g.add_device_node([&](sycl::handler &h) {
+    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
+      const size_t i = it[0];
+      z[i] = gamma * z[i] + beta * y[i];
+    });
+  });
+
+  auto node_c = g.add_device_node(
+      [&](sycl::handler &h) {
+        h.parallel_for(sycl::range<1>{n},
+                       sycl::reduction(dotp, 0.0f, std::plus()),
+                       [=](sycl::id<1> it, auto &sum) {
+                         const size_t i = it[0];
+                         sum += x[i] * z[i];
+                       });
+      },
+      {node_a, node_b});
+
+  auto exec = g.make_exec(q);
+
+#ifndef POC_IMPL
+  q.submit(exec).wait();
+#else
+  exec.exec_and_wait();
+#endif
+
+  sycl::free(x, q);
+  sycl::free(y, q);
+  sycl::free(z, q);
+  sycl::free(dotp, q);
+
+  return 0;
+}
+
+
+...
+----
+
+== Issues for later investigations
+
+. Explicit memory movement can cause POC to stall.
+
+== Non-implemented features
+Please, note that the following features are not yet implemented:
+
+. Level Zero backend only
+. Memory operation nodes not implemented
+. Host node not implemented
+. Submit overload of a queue. `submit(graph)` Use a combination of `executable_graph::exec_and_wait()` and queue property `sycl::ext::oneapi::property::queue::lazy_execution{}` instead.
+
+== Revision History
+
+[cols="5,15,15,70"]
+[grid="rows"]
+[options="header"]
+|========================================
+|Rev|Date|Author|Changes
+|1|2022-02-11|Pablo Reble|Initial public working draft
+|========================================

From 528017a77952d97bee1b4d5b3e723d3f6efb5412 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Fri, 7 Jan 2022 14:37:00 -0600
Subject: [PATCH 03/18] Reusing command list for re-execution (WIP)

---
 sycl/include/CL/sycl/detail/pi.def        |  1 +
 sycl/include/CL/sycl/detail/pi.h          |  2 +
 sycl/plugins/level_zero/pi_level_zero.cpp | 45 ++++++++++++++++++++++-
 sycl/plugins/level_zero/pi_level_zero.hpp |  2 +
 4 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def
index c9a68c6cadec3..b26c7e5588867 100644
--- a/sycl/include/CL/sycl/detail/pi.def
+++ b/sycl/include/CL/sycl/detail/pi.def
@@ -103,6 +103,7 @@ _PI_API(piSamplerGetInfo)
 _PI_API(piSamplerRetain)
 _PI_API(piSamplerRelease)
 // Queue commands
+_PI_API(piKernelLaunch)
 _PI_API(piEnqueueKernelLaunch)
 _PI_API(piEnqueueNativeKernel)
 _PI_API(piEnqueueEventsWait)
diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
index 4d3e841f731e1..4cfd760d7a8ab 100644
--- a/sycl/include/CL/sycl/detail/pi.h
+++ b/sycl/include/CL/sycl/detail/pi.h
@@ -1453,6 +1453,8 @@ __SYCL_EXPORT pi_result piSamplerRelease(pi_sampler sampler);
 //
 // Queue Commands
 //
+__SYCL_EXPORT pi_result piKernelLaunch(pi_queue queue);
+
 __SYCL_EXPORT pi_result piEnqueueKernelLaunch(
     pi_queue queue, pi_kernel kernel, pi_uint32 work_dim,
     const size_t *global_work_offset, const size_t *global_work_size,
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 9e9b304bc84ed..61351246bf05c 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -1454,6 +1454,11 @@ pi_result _pi_queue::getOrCreateCopyCommandQueue(
   return PI_SUCCESS;
 }
 
+bool _pi_queue::isEagerExec() {
+  return false;
+  // (this->PiQueueProperties & (1<<5)) == 0)
+}
+
 // This function will return one of possibly multiple available copy queues.
 // Currently, a round robin strategy is used.
 // This function also sends back the value of CopyQueueIndex and
@@ -4773,7 +4778,7 @@ pi_result piKernelRelease(pi_kernel Kernel) {
 }
 
 pi_result
-piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+piEnqueueKernel(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
                       const size_t *GlobalWorkOffset,
                       const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
                       pi_uint32 NumEventsInWaitList,
@@ -4906,14 +4911,52 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
   if (IndirectAccessTrackingEnabled)
     Queue->KernelsToBeSubmitted.push_back(Kernel);
 
+#if 0
   // Execute command list asynchronously, as the event will be used
   // to track down its completion.
   if (auto Res = Queue->executeCommandList(CommandList, false, true))
     return Res;
+#endif
 
   return PI_SUCCESS;
 }
 
+pi_result
+piKernelLaunch(pi_queue Queue) {
+    
+    //TODO: Make sure (re-)execute specific command list.
+
+    // Get a new command list to be used on this call
+      pi_command_list_ptr_t CommandList{};
+      if (auto Res = Queue->Context->getAvailableCommandList(
+              Queue, CommandList, false /* PreferCopyEngine */,
+              true /* AllowBatching */))
+        return Res;
+    
+    // Execute command list asynchronously, as the event will be used
+    // to track down its completion.
+    if (auto Res = Queue->executeCommandList(CommandList, false, true))
+      return Res;
+
+    return PI_SUCCESS;
+}
+
+pi_result
+piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+                      const size_t *GlobalWorkOffset,
+                      const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+                      pi_uint32 NumEventsInWaitList,
+                      const pi_event *EventWaitList, pi_event *Event) {
+    auto Res =
+    piEnqueueKernel(Queue,Kernel,WorkDim,GlobalWorkOffset,GlobalWorkSize,LocalWorkSize,NumEventsInWaitList,EventWaitList,Event);
+#if 0
+    if(Res == PI_SUCCESS && Queue->isEagerExec()) {
+        return piLazyKernelLaunch(Queue);
+    }
+#endif
+    return Res;
+}
+
 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
                                             pi_context Context,
                                             pi_program Program,
diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp
index 3bff3389cae1e..d73cf4f427fed 100644
--- a/sycl/plugins/level_zero/pi_level_zero.hpp
+++ b/sycl/plugins/level_zero/pi_level_zero.hpp
@@ -743,6 +743,8 @@ struct _pi_queue : _pi_object {
   // Returns true if the queue is a in-order queue.
   bool isInOrderQueue() const;
 
+  bool isEagerExec();
+
   // adjust the queue's batch size, knowing that the current command list
   // is being closed with a full batch.
   // For copy commands, IsCopy is set to 'true'.

From aee48a541b7582725535d5ef852b385ee805ada7 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Tue, 15 Feb 2022 17:18:32 -0600
Subject: [PATCH 04/18] Adding lazy execution property to queue

---
 .../CL/sycl/detail/property_helper.hpp        |  3 +-
 sycl/include/CL/sycl/feature_test.hpp.in      |  1 +
 .../CL/sycl/properties/queue_properties.hpp   |  8 +++++
 sycl/plugins/level_zero/pi_level_zero.cpp     | 33 ++++++++++++++-----
 sycl/plugins/level_zero/pi_level_zero.hpp     | 11 +++++--
 sycl/source/detail/queue_impl.cpp             |  8 +++++
 6 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/sycl/include/CL/sycl/detail/property_helper.hpp b/sycl/include/CL/sycl/detail/property_helper.hpp
index 12bc497ee2a70..9dc34de890c69 100644
--- a/sycl/include/CL/sycl/detail/property_helper.hpp
+++ b/sycl/include/CL/sycl/detail/property_helper.hpp
@@ -34,8 +34,9 @@ enum DataLessPropKind {
   InitializeToIdentity = 7,
   UseDefaultStream = 8,
   DiscardEvents = 9,
+  LazyExecution = 10,
   // Indicates the last known dataless property.
-  LastKnownDataLessPropKind = 9,
+  LastKnownDataLessPropKind = 10,
   // Exceeding 32 may cause ABI breaking change on some of OSes.
   DataLessPropKindSize = 32
 };
diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in
index 30164956bb0f0..c3444c94e98de 100644
--- a/sycl/include/CL/sycl/feature_test.hpp.in
+++ b/sycl/include/CL/sycl/feature_test.hpp.in
@@ -47,6 +47,7 @@ namespace sycl {
 #define SYCL_EXT_ONEAPI_FREE_FUNCTION_QUERIES 1
 #define SYCL_EXT_ONEAPI_GROUP_ALGORITHMS 1
 #define SYCL_EXT_ONEAPI_GROUP_SORT 1
+#define SYCL_EXT_ONEAPI_LAZY_QUEUE 1
 #define SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY 1
 #define SYCL_EXT_ONEAPI_ND_RANGE_REDUCTIONS 1
 #define SYCL_EXT_ONEAPI_DEFAULT_CONTEXT 1
diff --git a/sycl/include/CL/sycl/properties/queue_properties.hpp b/sycl/include/CL/sycl/properties/queue_properties.hpp
index 76a3bfaea9373..c9fb6e88d890c 100644
--- a/sycl/include/CL/sycl/properties/queue_properties.hpp
+++ b/sycl/include/CL/sycl/properties/queue_properties.hpp
@@ -28,6 +28,8 @@ namespace property {
 namespace queue {
 class discard_events : public ::cl::sycl::detail::DataLessProperty<
                            ::cl::sycl::detail::DiscardEvents> {};
+class lazy_execution : public ::cl::sycl::detail::DataLessProperty<
+                           ::cl::sycl::detail::LazyExecution> {};
 } // namespace queue
 } // namespace property
 
@@ -63,6 +65,9 @@ template <>
 struct is_property<ext::oneapi::property::queue::discard_events>
     : std::true_type {};
 template <>
+struct is_property<ext::oneapi::property::queue::lazy_execution>
+    : std::true_type {};
+template <>
 struct is_property<property::queue::cuda::use_default_stream> : std::true_type {
 };
 template <>
@@ -78,6 +83,9 @@ template <>
 struct is_property_of<ext::oneapi::property::queue::discard_events, queue>
     : std::true_type {};
 template <>
+struct is_property_of<ext::oneapi::property::queue::lazy_execution, queue>
+    : std::true_type {};
+template <>
 struct is_property_of<property::queue::cuda::use_default_stream, queue>
     : std::true_type {};
 template <>
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 61351246bf05c..a962f54f9980c 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -838,6 +838,11 @@ bool _pi_queue::isInOrderQueue() const {
   return ((this->Properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0);
 }
 
+bool _pi_queue::isEagerExec() const {
+  // If lazy exec queue property is not set, then it's an eager queue.
+  return ((this->PiQueueProperties & (1<<10) ) == 0);
+}
+
 pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,
                                       bool MakeAvailable) {
   bool UseCopyEngine = CommandList->second.isCopy();
@@ -1041,7 +1046,9 @@ _pi_queue::_pi_queue(ze_command_queue_handle_t Queue,
 pi_result
 _pi_context::getAvailableCommandList(pi_queue Queue,
                                      pi_command_list_ptr_t &CommandList,
-                                     bool UseCopyEngine, bool AllowBatching) {
+                                     bool UseCopyEngine, bool AllowBatching, bool Graph) {
+_pi_result pi_result = PI_OUT_OF_RESOURCES;
+if(!Graph && (Queue->CommandListMap.size() == 0)) {
   auto &CommandBatch =
       UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch;
   // Handle batching of commands
@@ -1064,7 +1071,7 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
   // the command lists, and later are then added to the command queue.
   // Each command list is paired with an associated fence to track when the
   // command list is available for reuse.
-  _pi_result pi_result = PI_OUT_OF_RESOURCES;
+  //_pi_result pi_result = PI_OUT_OF_RESOURCES;
   ZeStruct<ze_fence_desc_t> ZeFenceDesc;
 
   auto &ZeCommandListCache =
@@ -1172,6 +1179,11 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
     pi_result = PI_SUCCESS;
   }
 
+} else {
+  CommandList = Queue->CommandListMap.begin();
+  pi_result = PI_SUCCESS;
+}
+
   return pi_result;
 }
 
@@ -1230,7 +1242,7 @@ void _pi_queue::adjustBatchSizeForPartialBatch(bool IsCopy) {
 
 pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
                                         bool IsBlocking,
-                                        bool OKToBatchCommand) {
+                                        bool OKToBatchCommand, bool Graph) {
   int Index = CommandList->second.CopyQueueIndex;
   bool UseCopyEngine = (Index != -1);
   if (UseCopyEngine)
@@ -1374,6 +1386,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
   }
 
   // Close the command list and have it ready for dispatch.
+  // TODO: Close command list only once before initial execution, but works as is.
+  //if(!Graph)
   ZE_CALL(zeCommandListClose, (CommandList->first));
   // Offload command list to the GPU for asynchronous execution
   auto ZeCommandList = CommandList->first;
@@ -4923,19 +4937,22 @@ piEnqueueKernel(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
 
 pi_result
 piKernelLaunch(pi_queue Queue) {
-    
+   
+    const bool Graph = !(Queue->isEagerExec());
+    //const bool Graph = true;
+ 
     //TODO: Make sure (re-)execute specific command list.
 
     // Get a new command list to be used on this call
       pi_command_list_ptr_t CommandList{};
       if (auto Res = Queue->Context->getAvailableCommandList(
               Queue, CommandList, false /* PreferCopyEngine */,
-              true /* AllowBatching */))
+              true /* AllowBatching */, Graph /* Shortcut for Graph */))
         return Res;
     
     // Execute command list asynchronously, as the event will be used
     // to track down its completion.
-    if (auto Res = Queue->executeCommandList(CommandList, false, true))
+    if (auto Res = Queue->executeCommandList(CommandList, false, true, Graph))
       return Res;
 
     return PI_SUCCESS;
@@ -4949,9 +4966,9 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
                       const pi_event *EventWaitList, pi_event *Event) {
     auto Res =
     piEnqueueKernel(Queue,Kernel,WorkDim,GlobalWorkOffset,GlobalWorkSize,LocalWorkSize,NumEventsInWaitList,EventWaitList,Event);
-#if 0
+#if 1
     if(Res == PI_SUCCESS && Queue->isEagerExec()) {
-        return piLazyKernelLaunch(Queue);
+        return piKernelLaunch(Queue);
     }
 #endif
     return Res;
diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp
index d73cf4f427fed..9a294cb0ad649 100644
--- a/sycl/plugins/level_zero/pi_level_zero.hpp
+++ b/sycl/plugins/level_zero/pi_level_zero.hpp
@@ -541,6 +541,10 @@ struct _pi_context : _pi_object {
   std::unordered_map<ze_device_handle_t, std::list<ze_command_list_handle_t>>
       ZeCopyCommandListCache;
 
+  // Single command list for graph api
+
+  std::list<ze_command_list_handle_t> ZeGraphCommandList;
+
   // Retrieves a command list for executing on this device along with
   // a fence to be used in tracking the execution of this command list.
   // If a command list has been created on this device which has
@@ -558,7 +562,8 @@ struct _pi_context : _pi_object {
   pi_result getAvailableCommandList(pi_queue Queue,
                                     pi_command_list_ptr_t &CommandList,
                                     bool UseCopyEngine = false,
-                                    bool AllowBatching = false);
+                                    bool AllowBatching = false,
+                                    bool Graph = false);
 
   // Get index of the free slot in the available pool. If there is no available
   // pool then create new one. The HostVisible parameter tells if we need a
@@ -743,7 +748,7 @@ struct _pi_queue : _pi_object {
   // Returns true if the queue is a in-order queue.
   bool isInOrderQueue() const;
 
-  bool isEagerExec();
+  bool isEagerExec() const;
 
   // adjust the queue's batch size, knowing that the current command list
   // is being closed with a full batch.
@@ -782,7 +787,7 @@ struct _pi_queue : _pi_object {
   // of the value of OKToBatchCommand
   pi_result executeCommandList(pi_command_list_ptr_t CommandList,
                                bool IsBlocking = false,
-                               bool OKToBatchCommand = false);
+                               bool OKToBatchCommand = false, bool Graph = false);
 
   // If there is an open command list associated with this queue,
   // close it, execute it, and reset the corresponding OpenCommandList.
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index f786a87ef98c0..1fada9904c6cb 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -274,6 +274,14 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
   TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId);
 #endif
 
+#if 1
+  if(has_property<ext::oneapi::property::queue::lazy_execution>()){
+    const detail::plugin &Plugin = getPlugin();
+    if (Plugin.getBackend() == backend::ext_oneapi_level_zero)
+        Plugin.call<detail::PiApiKind::piKernelLaunch>(getHandleRef());
+  }
+#endif
+
   std::vector<std::weak_ptr<event_impl>> WeakEvents;
   std::vector<event> SharedEvents;
   {

From 24fa5a9a24188dd31281b8e2bfb3f0be68b5cea4 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Mon, 21 Feb 2022 22:25:38 -0600
Subject: [PATCH 05/18] fix merge

---
 sycl/plugins/level_zero/pi_level_zero.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index a962f54f9980c..5d6ebb4046c57 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -840,7 +840,7 @@ bool _pi_queue::isInOrderQueue() const {
 
 bool _pi_queue::isEagerExec() const {
   // If lazy exec queue property is not set, then it's an eager queue.
-  return ((this->PiQueueProperties & (1<<10) ) == 0);
+  return ((this->Properties & (1<<10) ) == 0);
 }
 
 pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,

From b7ce27191f6d721997236584946456d6c9e8a347 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo@reble.org>
Date: Tue, 22 Feb 2022 10:46:54 -0600
Subject: [PATCH 06/18] Update pi_level_zero.cpp

Fix merge conflict
---
 sycl/plugins/level_zero/pi_level_zero.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 5d6ebb4046c57..fbf8f0ae69601 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -1468,11 +1468,6 @@ pi_result _pi_queue::getOrCreateCopyCommandQueue(
   return PI_SUCCESS;
 }
 
-bool _pi_queue::isEagerExec() {
-  return false;
-  // (this->PiQueueProperties & (1<<5)) == 0)
-}
-
 // This function will return one of possibly multiple available copy queues.
 // Currently, a round robin strategy is used.
 // This function also sends back the value of CopyQueueIndex and

From f3d30edf7c555ef7b19b2505768c75cec9a6d23e Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Fri, 11 Mar 2022 16:42:43 +0100
Subject: [PATCH 07/18] update extension proposal started to incorporate
 feedback

---
 .../SYCL_EXT_ONEAPI_GRAPH.asciidoc            | 92 ++++++++-----------
 1 file changed, 38 insertions(+), 54 deletions(-)

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
index 3bb7051730b7d..efe81d24b767d 100644
--- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
+++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
@@ -69,15 +69,15 @@ Table 2. Terminology.
 
 == Node
 
-Node is a class that can encapsulate SYCL kernel functions or host tasks for deferred execution.
+Node is a class that encapsulates tasks like SYCL kernel functions or host tasks for deferred execution.
 A graph has to be created first, the structure of a graph is defined second by adding nodes and edges.
 
 [source,c++]
 ----
 namespace sycl::ext::oneapi::experimental {
 
-	class node{
-	};
+  class node{
+  };
 }
 ----
 
@@ -85,7 +85,7 @@ NOTE:
 
 == Edge
 
-A dependency between two nodes representing a happens before relationship.
+A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be accociated to different graphs.
 
 [source,c++]
 ----
@@ -99,45 +99,46 @@ namespace sycl::ext::oneapi::experimental {
 == Graph
 
 Graph is a class that represents a directed acyclic graph of nodes. 
-A graph can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last.
+A graph can have different states, can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. The execution of a graph has been completed when all leaf node tasks have been completed.
 Member functions as listed in Table 2 and 3 can be used to add nodes to a graph.
 
 [source,c++]
 ----
 namespace sycl::ext::oneapi::experimental {
 
-  class graph {
+  enum class graph_state{
+    modifiable,
+    executable
   };
 
+  template<graph_state State>
+  class graph {
+  public:
+    operator graph<graph_state::executable>();
+  };
+  
+  graph<graph_state::modifiable> make_graph();
+  
+  graph<graph_state::modifiable> compile(const graph<graph_state::executable> Graph);
+  
 }
-----
 
-=== Executable Graph
+sycl::event sycl::queue(const graph<graph_state::executable> Graph);
 
-`executable_graph` represents a user generated device and context specific execution object that can be submitted to a queue for execution. 
-The structure of an `executable_graph` object, such as adding nodes or edges, can not be changed.
-Each `executable_graph` object can only be executed once at the same time on its assigned queue.
-
-[source,c++]
 ----
-namespace sycl::ext::oneapi::experimental {
-
-  class executable_graph {
-  };
 
-}
-----
+=== Executable Graph
 
+A `graph` object in `graph_state::executable` represents a user generated device and context specific execution object that is submitted to a queue for execution.
+The structure of such a `graph` object in this state is immutable and can not be changed, so are the tasks assigned with each node.
+Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error.
 
-Table 3. Constructors of the `graph` class.
+Table 3. Constructor of the `graph` class.
 |===
 |Constructor|Description
 
 |`graph()`
-|Creates a `graph` object
-
-|`graph(graph& parent)`
-|Creates a nested `graph` object
+|Creates a `graph` object. It's default state is `graph_state::modifiable`.
 
 |===
 
@@ -145,24 +146,12 @@ Table 4. Member functions of the `graph` class.
 |===
 |Member function|Description
 
-|`node add_empty_node(const std::vector<node>& dep = {});`
-|This node holds no task that is scheduled for execution. It's intended use is a synchronization point inside a graph, this node can significantly reduce the number of edges ( O(n) vs. O(n^2) ) .
-
-|`template<typename T>
-    	node add_host_node(T hostTaskCallable, const std::vector<node>& dep = {});`
-|This node captures a host task, a native C++ callable which is scheduled by the SYCL runtime.
+|`node add_node(const std::vector<node>& dep = {});`
+|This creates an empty node which is associated to no task. It's intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later.
 
 |`template<typename T>
-    	node add_device_node(T cgf, const std::vector<node>& dep = {});`
-|This node captures a SYCL function for invoking kernels, with all restrictions that apply as described in the spec.
-
-|`template<typename T>
-    	executable_graph make_executable(const queue& syclQueue);`
-|Returns a queue specific graph object that can be submitted to a queue.
-
-|`template<typename T>
-    	executable_graph make_executable(const device& syclDevice, const context& syclContext);`
-|Returns a device and context specific graph object that can be submitted to a queue.
+    	node add_node(T cgf, const std::vector<node>& dep = {});`
+|This node captures a command group function object containing host task which is scheduled by the SYCL runtime or a SYCL function for invoking kernels with all restrictions that apply as described in the spec.
 
 |===
 
@@ -187,6 +176,8 @@ Table 5. Member functions of the `graph` class (memory operations).
 
 == Examples
 
+NOTE: The examples below demonstrate intended usage of the extension, but are not compatible with the proof-of-concept implementation. The proof-of-concept implementation currently requires different syntax, as described in the "Non-implemented features" section at the end of this document.
+
 1. Dot product
 
 [source,c++]
@@ -201,14 +192,9 @@ int main() {
   float beta = 2.0f;
   float gamma = 3.0f;
 
-#ifndef POC_IMPL
   sycl::queue q;
-#else
-  sycl::property_list p{sycl::ext::oneapi::property::queue::lazy_execution{}};
-  sycl::queue q{p};
-#endif
 
-  sycl::ext::oneapi::experimental::graph g;
+  auto g = sycl::ext::oneapi::experimental::make_graph();
 
   float *x = sycl::malloc_shared<float>(n, q);
   float *y = sycl::malloc_shared<float>(n, q);
@@ -222,21 +208,21 @@ int main() {
     z[i] = 3.0f;
   }
 
-  auto node_a = g.add_device_node([&](sycl::handler &h) {
+  auto node_a = g.add_node([&](sycl::handler &h) {
     h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
       const size_t i = it[0];
       x[i] = alpha * x[i] + beta * y[i];
     });
   });
 
-  auto node_b = g.add_device_node([&](sycl::handler &h) {
+  auto node_b = g.add_node([&](sycl::handler &h) {
     h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
       const size_t i = it[0];
       z[i] = gamma * z[i] + beta * y[i];
     });
   });
 
-  auto node_c = g.add_device_node(
+  auto node_c = g.add_node(
       [&](sycl::handler &h) {
         h.parallel_for(sycl::range<1>{n},
                        sycl::reduction(dotp, 0.0f, std::plus()),
@@ -247,13 +233,9 @@ int main() {
       },
       {node_a, node_b});
 
-  auto exec = g.make_exec(q);
+  auto exec = compile(q);
 
-#ifndef POC_IMPL
   q.submit(exec).wait();
-#else
-  exec.exec_and_wait();
-#endif
 
   sycl::free(x, q);
   sycl::free(y, q);
@@ -278,6 +260,7 @@ Please, note that the following features are not yet implemented:
 . Memory operation nodes not implemented
 . Host node not implemented
 . Submit overload of a queue. `submit(graph)` Use a combination of `executable_graph::exec_and_wait()` and queue property `sycl::ext::oneapi::property::queue::lazy_execution{}` instead.
+. `class graph<graph_state>` Use dedicated `class graph` (equivalent to `graph_state == modifiable`) and `class executable_graph` (equivalent to `graph_state == executable`) instead.
 
 == Revision History
 
@@ -287,4 +270,5 @@ Please, note that the following features are not yet implemented:
 |========================================
 |Rev|Date|Author|Changes
 |1|2022-02-11|Pablo Reble|Initial public working draft
+|2|2022-03-11|Pablo Reble|Incorporate feedback from PR
 |========================================

From 0e96d12ff3d438abeeab69a325cc9dbee4578b5d Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Fri, 11 Mar 2022 20:47:16 +0100
Subject: [PATCH 08/18] typo

---
 sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
index efe81d24b767d..28e1d78f5de1b 100644
--- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
+++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
@@ -85,7 +85,7 @@ NOTE:
 
 == Edge
 
-A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be accociated to different graphs.
+A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be associated to different graphs.
 
 [source,c++]
 ----

From 8f1a8dc559d7e3f592c8bcd1b089e9ce6a82a27d Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo@reble.org>
Date: Mon, 14 Mar 2022 14:08:02 +0100
Subject: [PATCH 09/18] Apply suggestions from code review

Co-authored-by: Ronan Keryell <ronan@keryell.fr>
---
 .../extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
index 28e1d78f5de1b..960d55e7c282f 100644
--- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
+++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
@@ -129,7 +129,7 @@ sycl::event sycl::queue(const graph<graph_state::executable> Graph);
 
 === Executable Graph
 
-A `graph` object in `graph_state::executable` represents a user generated device and context specific execution object that is submitted to a queue for execution.
+A `graph` object in `graph_state::executable` represents a user-generated device and context specific execution object that is submitted to a queue for execution.
 The structure of such a `graph` object in this state is immutable and can not be changed, so are the tasks assigned with each node.
 Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error.
 
@@ -147,7 +147,7 @@ Table 4. Member functions of the `graph` class.
 |Member function|Description
 
 |`node add_node(const std::vector<node>& dep = {});`
-|This creates an empty node which is associated to no task. It's intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later.
+|This creates an empty node which is associated to no task. Its intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later.
 
 |`template<typename T>
     	node add_node(T cgf, const std::vector<node>& dep = {});`

From a8c72654e7ab1ab34a7da1d0c775c0667b2eb0dd Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Mon, 2 May 2022 21:06:42 -0500
Subject: [PATCH 10/18] fix typos and syntax issues

---
 .../experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc        | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
index 28e1d78f5de1b..4b0a5ea805d35 100644
--- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
+++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
@@ -38,7 +38,7 @@ NOTE: This extension is experimental: interfaces are subject to change later.
 == Introduction
 
 This extension introduces an interface that enables a lazy execution and easy replay of a kernel graph by separating
-Its definition and execution. 
+its definition and execution.
 
 == Feature test macro
 
@@ -64,7 +64,7 @@ Table 2. Terminology.
 |Concept|Description
 |graph| Class that stores structured work units and their dependencies
 |node| The unit of work. Can have different attributes.
-|edge| Dependency between work units. Happens before relation.
+|edge| Dependency between work units. Happens-before relation.
 |===
 
 == Node
@@ -85,7 +85,7 @@ NOTE:
 
 == Edge
 
-A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be associated to different graphs.
+A dependency between two nodes representing a happens-before relationship. `sender` and `receiver` may be associated to different graphs.
 
 [source,c++]
 ----
@@ -119,7 +119,7 @@ namespace sycl::ext::oneapi::experimental {
   
   graph<graph_state::modifiable> make_graph();
   
-  graph<graph_state::modifiable> compile(const graph<graph_state::executable> Graph);
+  graph<graph_state::executable> compile(const graph<graph_state::modifiable> Graph);
   
 }
 
@@ -130,7 +130,7 @@ sycl::event sycl::queue(const graph<graph_state::executable> Graph);
 === Executable Graph
 
 A `graph` object in `graph_state::executable` represents a user generated device and context specific execution object that is submitted to a queue for execution.
-The structure of such a `graph` object in this state is immutable and can not be changed, so are the tasks assigned with each node.
+The structure of such a `graph` object in this state is immutable and cannot be changed, so are the tasks assigned with each node.
 Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error.
 
 Table 3. Constructor of the `graph` class.

From 60507c10acf56c5624a41f95b4f7637d9b046cb1 Mon Sep 17 00:00:00 2001
From: Julian Miller <julian.miller@intel.com>
Date: Tue, 3 May 2022 11:29:34 -0500
Subject: [PATCH 11/18] Propagate lazy queue property

---
 sycl/include/CL/sycl/detail/pi.h          | 1 +
 sycl/plugins/level_zero/pi_level_zero.cpp | 9 +++++----
 sycl/source/detail/queue_impl.hpp         | 3 +++
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h
index 4cfd760d7a8ab..2eeadb295d086 100644
--- a/sycl/include/CL/sycl/detail/pi.h
+++ b/sycl/include/CL/sycl/detail/pi.h
@@ -602,6 +602,7 @@ constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE =
 constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = CL_QUEUE_ON_DEVICE;
 constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT =
     CL_QUEUE_ON_DEVICE_DEFAULT;
+constexpr pi_queue_properties PI_QUEUE_LAZY_EXECUTION = 1 << 10;
 
 using pi_result = _pi_result;
 using pi_platform_info = _pi_platform_info;
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index fbf8f0ae69601..c4d30a3c65d96 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -3088,10 +3088,11 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
                         pi_queue_properties Properties, pi_queue *Queue) {
 
   // Check that unexpected bits are not set.
-  PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
-                             PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE |
-                             PI_QUEUE_ON_DEVICE_DEFAULT)),
-            PI_INVALID_VALUE);
+  PI_ASSERT(
+      !(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
+                       PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE |
+                       PI_QUEUE_ON_DEVICE_DEFAULT | PI_QUEUE_LAZY_EXECUTION)),
+      PI_INVALID_VALUE);
 
   ze_device_handle_t ZeDevice;
   ze_command_queue_handle_t ZeComputeCommandQueue;
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 1d4d66a2b8b20..89a1c09c66de7 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -305,6 +305,9 @@ class queue_impl {
             ext::oneapi::cuda::property::queue::use_default_stream>()) {
       CreationFlags |= __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
     }
+    if (has_property<ext::oneapi::property::queue::lazy_execution>()) {
+      CreationFlags |= PI_QUEUE_LAZY_EXECUTION;
+    }
     RT::PiQueue Queue{};
     RT::PiContext Context = MContext->getHandleRef();
     RT::PiDevice Device = MDevice->getHandleRef();

From 9209b57584dd4ad2bfcb060c726755f34bff8420 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Fri, 6 May 2022 10:30:29 -0500
Subject: [PATCH 12/18] fix formatting issues

---
 .../sycl/ext/oneapi/experimental/graph.hpp    | 235 +++++++++---------
 sycl/plugins/level_zero/pi_level_zero.cpp     |  92 +++----
 sycl/plugins/level_zero/pi_level_zero.hpp     |   3 +-
 3 files changed, 171 insertions(+), 159 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
index 08a7d094e9054..2d61c03be02e6 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
@@ -10,8 +10,8 @@
 
 #include <CL/sycl/detail/defines_elementary.hpp>
 
-#include <set>
 #include <list>
+#include <set>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
@@ -29,90 +29,98 @@ using node_ptr = std::shared_ptr<node_impl>;
 using graph_ptr = std::shared_ptr<graph_impl>;
 
 class wrapper {
-    using T = std::function<void(sycl::handler&)>;
-    T my_func;
-    std::vector<sycl::event> my_deps;
+  using T = std::function<void(sycl::handler &)>;
+  T my_func;
+  std::vector<sycl::event> my_deps;
+
 public:
-    wrapper(T t, const std::vector<sycl::event>& deps) : my_func(t), my_deps(deps) {};
+  wrapper(T t, const std::vector<sycl::event> &deps)
+      : my_func(t), my_deps(deps){};
 
-    void operator()(sycl::handler& cgh) {
-        cgh.depends_on(my_deps);
-        std::invoke(my_func,cgh);
-    }
+  void operator()(sycl::handler &cgh) {
+    cgh.depends_on(my_deps);
+    std::invoke(my_func, cgh);
+  }
 };
 
 struct node_impl {
-    bool is_scheduled;
+  bool is_scheduled;
 
-    graph_ptr my_graph;
-    sycl::event my_event;
+  graph_ptr my_graph;
+  sycl::event my_event;
 
-    std::vector<node_ptr> my_successors;
-    std::vector<node_ptr> my_predecessors;
+  std::vector<node_ptr> my_successors;
+  std::vector<node_ptr> my_predecessors;
 
-    std::function<void(sycl::handler&)> my_body;
+  std::function<void(sycl::handler &)> my_body;
 
-    void exec( sycl::queue q ) {
-        std::vector<sycl::event> __deps;
-        for(auto i:my_predecessors) __deps.push_back(i->get_event());
-        my_event = q.submit(wrapper{my_body,__deps});
-    }
+  void exec(sycl::queue q) {
+    std::vector<sycl::event> __deps;
+    for (auto i : my_predecessors)
+      __deps.push_back(i->get_event());
+    my_event = q.submit(wrapper{my_body, __deps});
+  }
 
-    void register_successor(node_ptr n) {
-        my_successors.push_back(n);
-        n->register_predecessor(node_ptr(this));
-    }
+  void register_successor(node_ptr n) {
+    my_successors.push_back(n);
+    n->register_predecessor(node_ptr(this));
+  }
 
-    void register_predecessor(node_ptr n) { my_predecessors.push_back(n); }
+  void register_predecessor(node_ptr n) { my_predecessors.push_back(n); }
 
-    sycl::event get_event(void) {return my_event;}
+  sycl::event get_event(void) { return my_event; }
 
-    template<typename T>
-    node_impl(graph_ptr g, T cgf) : is_scheduled(false), my_graph(g), my_body(cgf) {}
+  template <typename T>
+  node_impl(graph_ptr g, T cgf)
+      : is_scheduled(false), my_graph(g), my_body(cgf) {}
 
-    // Recursively adding nodes to execution stack:
-    void topology_sort(std::list<node_ptr>& schedule) {
-        is_scheduled = true;
-        for(auto i:my_successors) {
-            if(!i->is_scheduled) i->topology_sort(schedule);
-        }
-        schedule.push_front(node_ptr(this));
+  // Recursively adding nodes to execution stack:
+  void topology_sort(std::list<node_ptr> &schedule) {
+    is_scheduled = true;
+    for (auto i : my_successors) {
+      if (!i->is_scheduled)
+        i->topology_sort(schedule);
     }
+    schedule.push_front(node_ptr(this));
+  }
 };
 
 struct graph_impl {
-    std::set<node_ptr> my_roots;
-    std::list<node_ptr> my_schedule;
-
-    graph_ptr parent;
-
-    void exec( sycl::queue q ) {
-        if( my_schedule.empty() ) {
-            for(auto n : my_roots) {
-                n->topology_sort(my_schedule);
-            }
-        }
-        for(auto n : my_schedule) n->exec(q);
-    }
+  std::set<node_ptr> my_roots;
+  std::list<node_ptr> my_schedule;
 
-    void exec_and_wait( sycl::queue q ) {
-        exec(q);
-        q.wait();
-    }
+  graph_ptr parent;
 
-    void add_root(node_ptr n) {
-        my_roots.insert(n);
-        for(auto n : my_schedule) n->is_scheduled=false;
-        my_schedule.clear();
+  void exec(sycl::queue q) {
+    if (my_schedule.empty()) {
+      for (auto n : my_roots) {
+        n->topology_sort(my_schedule);
+      }
     }
-
-    void remove_root(node_ptr n) {
-        my_roots.erase(n);
-        for(auto n : my_schedule) n->is_scheduled=false;
-        my_schedule.clear();
-    }
-
-    graph_impl() {}
+    for (auto n : my_schedule)
+      n->exec(q);
+  }
+
+  void exec_and_wait(sycl::queue q) {
+    exec(q);
+    q.wait();
+  }
+
+  void add_root(node_ptr n) {
+    my_roots.insert(n);
+    for (auto n : my_schedule)
+      n->is_scheduled = false;
+    my_schedule.clear();
+  }
+
+  void remove_root(node_ptr n) {
+    my_roots.erase(n);
+    for (auto n : my_schedule)
+      n->is_scheduled = false;
+    my_schedule.clear();
+  }
+
+  graph_impl() {}
 };
 
 } // namespace detail
@@ -124,89 +132,92 @@ class graph;
 class executable_graph;
 
 struct node {
-    // TODO: add properties to distinguish between empty, host, device nodes.
-    detail::node_ptr my_node;
-    detail::graph_ptr my_graph;
+  // TODO: add properties to distinguish between empty, host, device nodes.
+  detail::node_ptr my_node;
+  detail::graph_ptr my_graph;
 
-    template<typename T>
-    node(detail::graph_ptr g, T cgf) : my_graph(g), my_node(new detail::node_impl(g,cgf)) {};
-    void register_successor(node n) { my_node->register_successor(n.my_node); }
-    void exec( sycl::queue q, sycl::event = sycl::event() ) { my_node->exec(q); }
+  template <typename T>
+  node(detail::graph_ptr g, T cgf)
+      : my_graph(g), my_node(new detail::node_impl(g, cgf)){};
+  void register_successor(node n) { my_node->register_successor(n.my_node); }
+  void exec(sycl::queue q, sycl::event = sycl::event()) { my_node->exec(q); }
 
-    void set_root() {  my_graph->add_root(my_node);}
+  void set_root() { my_graph->add_root(my_node); }
 
-    // TODO: Add query functions: is_root, ...
+  // TODO: Add query functions: is_root, ...
 };
 
 class executable_graph {
 public:
-    int my_tag;
-    sycl::queue my_queue;
+  int my_tag;
+  sycl::queue my_queue;
 
-    void exec_and_wait();// { my_queue.wait(); }
+  void exec_and_wait(); // { my_queue.wait(); }
 
-    executable_graph(detail::graph_ptr g, sycl::queue q) : my_queue(q), my_tag(rand()) {
-      g->exec(my_queue);
-    }
+  executable_graph(detail::graph_ptr g, sycl::queue q)
+      : my_queue(q), my_tag(rand()) {
+    g->exec(my_queue);
+  }
 };
 
 class graph {
 public:
-    // Adding empty node with [0..n] predecessors:
-    node add_empty_node(const std::vector<node>& dep = {});
+  // Adding empty node with [0..n] predecessors:
+  node add_empty_node(const std::vector<node> &dep = {});
 
-    // Adding node for host task 
-    template<typename T>
-    node add_host_node(T hostTaskCallable, const std::vector<node>& dep = {});
+  // Adding node for host task
+  template <typename T>
+  node add_host_node(T hostTaskCallable, const std::vector<node> &dep = {});
 
-    // Adding device node:
-    template<typename T>
-    node add_device_node(T cgf, const std::vector<node>& dep = {});
+  // Adding device node:
+  template <typename T>
+  node add_device_node(T cgf, const std::vector<node> &dep = {});
 
-    // Adding dependency between two nodes.
-    void make_edge(node sender, node receiver);
+  // Adding dependency between two nodes.
+  void make_edge(node sender, node receiver);
 
-    // TODO: Extend queue to directly submit graph
-    void exec_and_wait( sycl::queue q );
+  // TODO: Extend queue to directly submit graph
+  void exec_and_wait(sycl::queue q);
 
-    executable_graph exec( sycl::queue q ) { return executable_graph{my_graph,q};};
+  executable_graph exec(sycl::queue q) {
+    return executable_graph{my_graph, q};
+  };
 
-    graph() : my_graph(new detail::graph_impl()) {}
+  graph() : my_graph(new detail::graph_impl()) {}
 
-    // Creating a subgraph (with predecessors)
-    graph(graph& parent, const std::vector<node>& dep = {}) {}
+  // Creating a subgraph (with predecessors)
+  graph(graph &parent, const std::vector<node> &dep = {}) {}
 
-    bool is_subgraph();
+  bool is_subgraph();
 
 private:
-    detail::graph_ptr my_graph;
+  detail::graph_ptr my_graph;
 };
 
 void executable_graph::exec_and_wait() { my_queue.wait(); }
 
-template<typename T>
-node graph::add_device_node(T cgf , const std::vector<node>& dep) {
-    node _node(my_graph,cgf);
-    if( !dep.empty() ) {
-        for(auto n : dep) this->make_edge(n,_node);
-    } else {
-        _node.set_root();
-    }
-    return _node;
+template <typename T>
+node graph::add_device_node(T cgf, const std::vector<node> &dep) {
+  node _node(my_graph, cgf);
+  if (!dep.empty()) {
+    for (auto n : dep)
+      this->make_edge(n, _node);
+  } else {
+    _node.set_root();
+  }
+  return _node;
 }
 
 void graph::make_edge(node sender, node receiver) {
-    sender.register_successor(receiver);//register successor
-    my_graph->remove_root(receiver.my_node); //remove receiver from root node list
+  sender.register_successor(receiver);     // register successor
+  my_graph->remove_root(receiver.my_node); // remove receiver from root node
+                                           // list
 }
 
-void graph::exec_and_wait( sycl::queue q ) {
-    my_graph->exec_and_wait(q);
-};
+void graph::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); };
 
 } // namespace experimental
 } // namespace oneapi
 } // namespace ext
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
-
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 4dad99a7ffca6..a5f02604716d6 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -874,7 +874,7 @@ bool _pi_queue::isInOrderQueue() const {
 
 bool _pi_queue::isEagerExec() const {
   // If lazy exec queue property is not set, then it's an eager queue.
-  return ((this->Properties & (1<<11) ) == 0);
+  return ((this->Properties & (1 << 11)) == 0);
 }
 
 pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,
@@ -1140,10 +1140,9 @@ _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
 }
 
 // Retrieve an available command list to be used in a PI call.
-pi_result
-_pi_context::getAvailableCommandList(pi_queue Queue,
-                                     pi_command_list_ptr_t &CommandList,
-                                     bool UseCopyEngine, bool AllowBatching, bool Graph) {
+pi_result _pi_context::getAvailableCommandList(
+    pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine,
+    bool AllowBatching, bool Graph) {
   // Immediate commandlists have been pre-allocated and are always available.
   if (UseImmediateCommandLists) {
     CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
@@ -1269,13 +1268,13 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
             ZeCommandList, {ZeFence, true, ZeCommandQueue, QueueGroupOrdinal}));
     pi_result = PI_SUCCESS;
   }
-
-} else {
+}
+else {
   CommandList = Queue->CommandListMap.begin();
   pi_result = PI_SUCCESS;
 }
 
-  return pi_result;
+return pi_result;
 }
 
 void _pi_queue::adjustBatchSizeForFullBatch(bool IsCopy) {
@@ -1356,8 +1355,7 @@ void _pi_queue::CaptureIndirectAccesses() {
 }
 
 pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
-                                        bool IsBlocking,
-                                        bool OKToBatchCommand, 
+                                        bool IsBlocking, bool OKToBatchCommand,
                                         bool Graph) {
   bool UseCopyEngine = CommandList->second.isCopy(this);
 
@@ -1477,7 +1475,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
     }
 
     // Close the command list and have it ready for dispatch.
-    // TODO: Close command list only once before initial execution, but works as is.
+    // TODO: Close command list only once before initial execution, but works as
+    // is.
     ZE_CALL(zeCommandListClose, (CommandList->first));
     // Offload command list to the GPU for asynchronous execution
     auto ZeCommandList = CommandList->first;
@@ -2063,10 +2062,11 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
   }
 
   zePrint("Using events scope: %s\n",
-          EventsScope == AllHostVisible ? "all host-visible"
-          : EventsScope == OnDemandHostVisibleProxy
-              ? "on demand host-visible proxy"
-              : "only last command in a batch is host-visible");
+          EventsScope == AllHostVisible
+              ? "all host-visible"
+              : EventsScope == OnDemandHostVisibleProxy
+                    ? "on demand host-visible proxy"
+                    : "only last command in a batch is host-visible");
   return PI_SUCCESS;
 }
 
@@ -4942,12 +4942,12 @@ pi_result piKernelRelease(pi_kernel Kernel) {
   return PI_SUCCESS;
 }
 
-pi_result
-piEnqueueKernel(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
-                      const size_t *GlobalWorkOffset,
-                      const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
-                      pi_uint32 NumEventsInWaitList,
-                      const pi_event *EventWaitList, pi_event *Event) {
+pi_result piEnqueueKernel(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+                          const size_t *GlobalWorkOffset,
+                          const size_t *GlobalWorkSize,
+                          const size_t *LocalWorkSize,
+                          pi_uint32 NumEventsInWaitList,
+                          const pi_event *EventWaitList, pi_event *Event) {
   PI_ASSERT(Kernel, PI_INVALID_KERNEL);
   PI_ASSERT(Queue, PI_INVALID_QUEUE);
   PI_ASSERT(Event, PI_INVALID_EVENT);
@@ -5122,27 +5122,26 @@ piEnqueueKernel(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
   return PI_SUCCESS;
 }
 
-pi_result
-piKernelLaunch(pi_queue Queue) {
-   
-    const bool Graph = !(Queue->isEagerExec());
-    //const bool Graph = true;
- 
-    //TODO: Make sure (re-)execute specific command list.
+pi_result piKernelLaunch(pi_queue Queue) {
 
-    // Get a new command list to be used on this call
-      pi_command_list_ptr_t CommandList{};
-      if (auto Res = Queue->Context->getAvailableCommandList(
-              Queue, CommandList, false /* PreferCopyEngine */,
-              true /* AllowBatching */, Graph /* Shortcut for Graph */))
-        return Res;
-    
-    // Execute command list asynchronously, as the event will be used
-    // to track down its completion.
-    if (auto Res = Queue->executeCommandList(CommandList, false, true, Graph))
-      return Res;
+  const bool Graph = !(Queue->isEagerExec());
+  // const bool Graph = true;
 
-    return PI_SUCCESS;
+  // TODO: Make sure (re-)execute specific command list.
+
+  // Get a new command list to be used on this call
+  pi_command_list_ptr_t CommandList{};
+  if (auto Res = Queue->Context->getAvailableCommandList(
+          Queue, CommandList, false /* PreferCopyEngine */,
+          true /* AllowBatching */, Graph /* Shortcut for Graph */))
+    return Res;
+
+  // Execute command list asynchronously, as the event will be used
+  // to track down its completion.
+  if (auto Res = Queue->executeCommandList(CommandList, false, true, Graph))
+    return Res;
+
+  return PI_SUCCESS;
 }
 
 pi_result
@@ -5151,14 +5150,15 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
                       const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
                       pi_uint32 NumEventsInWaitList,
                       const pi_event *EventWaitList, pi_event *Event) {
-    auto Res =
-    piEnqueueKernel(Queue,Kernel,WorkDim,GlobalWorkOffset,GlobalWorkSize,LocalWorkSize,NumEventsInWaitList,EventWaitList,Event);
+  auto Res =
+      piEnqueueKernel(Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize,
+                      LocalWorkSize, NumEventsInWaitList, EventWaitList, Event);
 #if 1
-    if(Res == PI_SUCCESS && Queue->isEagerExec()) {
-        return piKernelLaunch(Queue);
-    }
+  if (Res == PI_SUCCESS && Queue->isEagerExec()) {
+    return piKernelLaunch(Queue);
+  }
 #endif
-    return Res;
+  return Res;
 }
 
 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp
index b0a199876bd78..47eac0846f43a 100644
--- a/sycl/plugins/level_zero/pi_level_zero.hpp
+++ b/sycl/plugins/level_zero/pi_level_zero.hpp
@@ -919,7 +919,8 @@ struct _pi_queue : _pi_object {
   // For immediate commandlists, no close and execute is necessary.
   pi_result executeCommandList(pi_command_list_ptr_t CommandList,
                                bool IsBlocking = false,
-                               bool OKToBatchCommand = false, bool Graph = false);
+                               bool OKToBatchCommand = false,
+                               bool Graph = false);
 
   // If there is an open command list associated with this queue,
   // close it, execute it, and reset the corresponding OpenCommandList.

From 7208ad4cd3943763772b5b312d4f07c60337f4dc Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Fri, 6 May 2022 11:54:58 -0500
Subject: [PATCH 13/18] fix issue introd. by recent merge

---
 sycl/plugins/level_zero/pi_level_zero.cpp | 245 +++++++++++-----------
 1 file changed, 123 insertions(+), 122 deletions(-)

diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index a5f02604716d6..c598f3432dd5e 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -1143,138 +1143,139 @@ _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
 pi_result _pi_context::getAvailableCommandList(
     pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine,
     bool AllowBatching, bool Graph) {
-  // Immediate commandlists have been pre-allocated and are always available.
-  if (UseImmediateCommandLists) {
-    CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
-    return PI_SUCCESS;
-  }
-
-  auto &CommandBatch =
-      UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch;
-  // Handle batching of commands
-  // First see if there is an command-list open for batching commands
-  // for this queue.
-  if (Queue->hasOpenCommandList(UseCopyEngine)) {
-    if (AllowBatching) {
-      CommandList = CommandBatch.OpenCommandList;
+  // TODO: Do proper CommandList allocation. This is a hack!
+  if (!Graph) {
+    // Immediate commandlists have been pre-allocated and are always available.
+    if (UseImmediateCommandLists) {
+      CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
       return PI_SUCCESS;
     }
-    // If this command isn't allowed to be batched, then we need to
-    // go ahead and execute what is already in the batched list,
-    // and then go on to process this. On exit from executeOpenCommandList
-    // OpenCommandList will be invalidated.
-    if (auto Res = Queue->executeOpenCommandList(UseCopyEngine))
-      return Res;
-  }
 
-  // Create/Reuse the command list, because in Level Zero commands are added to
-  // the command lists, and later are then added to the command queue.
-  // Each command list is paired with an associated fence to track when the
-  // command list is available for reuse.
-  //_pi_result pi_result = PI_OUT_OF_RESOURCES;
-  ZeStruct<ze_fence_desc_t> ZeFenceDesc;
-  // Initally, we need to check if a command list has already been created
-  // on this device that is available for use. If so, then reuse that
-  // Level-Zero Command List and Fence for this PI call.
-  {
-    // Make sure to acquire the lock before checking the size, or there
-    // will be a race condition.
-    std::lock_guard<std::mutex> lock(Queue->Context->ZeCommandListCacheMutex);
-    // Under mutex since operator[] does insertion on the first usage for every
-    // unique ZeDevice.
-    auto &ZeCommandListCache =
-        UseCopyEngine
-            ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
-            : Queue->Context
-                  ->ZeComputeCommandListCache[Queue->Device->ZeDevice];
-
-    if (ZeCommandListCache.size() > 0) {
-      auto &ZeCommandList = ZeCommandListCache.front();
-      auto it = Queue->CommandListMap.find(ZeCommandList);
-      if (it != Queue->CommandListMap.end()) {
-        CommandList = it;
-        CommandList->second.InUse = true;
-      } else {
-        // If there is a command list available on this context, but it
-        // wasn't yet used in this queue then create a new entry in this
-        // queue's map to hold the fence and other associated command
-        // list information.
-        uint32_t QueueGroupOrdinal;
-        auto &ZeCommandQueue =
-            Queue->getQueueGroup(UseCopyEngine).getZeQueue(&QueueGroupOrdinal);
-
-        ze_fence_handle_t ZeFence;
-        ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
-        CommandList =
-            Queue->CommandListMap
-                .emplace(ZeCommandList,
-                         pi_command_list_info_t{ZeFence, true, ZeCommandQueue,
-                                                QueueGroupOrdinal})
-                .first;
+    auto &CommandBatch =
+        UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch;
+    // Handle batching of commands
+    // First see if there is an command-list open for batching commands
+    // for this queue.
+    if (Queue->hasOpenCommandList(UseCopyEngine)) {
+      if (AllowBatching) {
+        CommandList = CommandBatch.OpenCommandList;
+        return PI_SUCCESS;
       }
-      ZeCommandListCache.pop_front();
-      return PI_SUCCESS;
+      // If this command isn't allowed to be batched, then we need to
+      // go ahead and execute what is already in the batched list,
+      // and then go on to process this. On exit from executeOpenCommandList
+      // OpenCommandList will be invalidated.
+      if (auto Res = Queue->executeOpenCommandList(UseCopyEngine))
+        return Res;
     }
-  }
-
-  // If there are no available command lists in the cache, then we check for
-  // command lists that have already signalled, but have not been added to the
-  // available list yet. Each command list has a fence associated which tracks
-  // if a command list has completed dispatch of its commands and is ready for
-  // reuse. If a command list is found to have been signalled, then the
-  // command list & fence are reset and we return.
-  for (auto it = Queue->CommandListMap.begin();
-       it != Queue->CommandListMap.end(); ++it) {
-    // Make sure this is the command list type needed.
-    if (UseCopyEngine != it->second.isCopy(Queue))
-      continue;
 
-    ze_result_t ZeResult =
-        ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
-    if (ZeResult == ZE_RESULT_SUCCESS) {
-      Queue->resetCommandList(it, false);
-      CommandList = it;
-      CommandList->second.InUse = true;
-      return PI_SUCCESS;
+    // Create/Reuse the command list, because in Level Zero commands are added
+    // to the command lists, and later are then added to the command queue. Each
+    // command list is paired with an associated fence to track when the command
+    // list is available for reuse.
+    _pi_result pi_result = PI_OUT_OF_RESOURCES;
+    ZeStruct<ze_fence_desc_t> ZeFenceDesc;
+    // Initally, we need to check if a command list has already been created
+    // on this device that is available for use. If so, then reuse that
+    // Level-Zero Command List and Fence for this PI call.
+    {
+      // Make sure to acquire the lock before checking the size, or there
+      // will be a race condition.
+      std::lock_guard<std::mutex> lock(Queue->Context->ZeCommandListCacheMutex);
+      // Under mutex since operator[] does insertion on the first usage for
+      // every unique ZeDevice.
+      auto &ZeCommandListCache =
+          UseCopyEngine
+              ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
+              : Queue->Context
+                    ->ZeComputeCommandListCache[Queue->Device->ZeDevice];
+
+      if (ZeCommandListCache.size() > 0) {
+        auto &ZeCommandList = ZeCommandListCache.front();
+        auto it = Queue->CommandListMap.find(ZeCommandList);
+        if (it != Queue->CommandListMap.end()) {
+          CommandList = it;
+          CommandList->second.InUse = true;
+        } else {
+          // If there is a command list available on this context, but it
+          // wasn't yet used in this queue then create a new entry in this
+          // queue's map to hold the fence and other associated command
+          // list information.
+          uint32_t QueueGroupOrdinal;
+          auto &ZeCommandQueue = Queue->getQueueGroup(UseCopyEngine)
+                                     .getZeQueue(&QueueGroupOrdinal);
+
+          ze_fence_handle_t ZeFence;
+          ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
+          CommandList =
+              Queue->CommandListMap
+                  .emplace(ZeCommandList,
+                           pi_command_list_info_t{ZeFence, true, ZeCommandQueue,
+                                                  QueueGroupOrdinal})
+                  .first;
+        }
+        ZeCommandListCache.pop_front();
+        return PI_SUCCESS;
+      }
     }
-  }
-
-  // If there are no available command lists nor signalled command lists, then
-  // we must create another command list if we have not exceed the maximum
-  // command lists we can create.
-  // Once created, this command list & fence are added to the command list fence
-  // map.
-  if (Queue->Device->Platform->ZeGlobalCommandListCount <
-      ZeMaxCommandListCacheSize) {
-    ze_command_list_handle_t ZeCommandList;
-    ze_fence_handle_t ZeFence;
-
-    uint32_t QueueGroupOrdinal;
-    auto &ZeCommandQueue =
-        Queue->getQueueGroup(UseCopyEngine).getZeQueue(&QueueGroupOrdinal);
 
-    ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
-    ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
-
-    ZE_CALL(zeCommandListCreate,
-            (Queue->Context->ZeContext, Queue->Device->ZeDevice,
-             &ZeCommandListDesc, &ZeCommandList));
-    // Increments the total number of command lists created on this platform.
-    Queue->Device->Platform->ZeGlobalCommandListCount++;
+    // If there are no available command lists in the cache, then we check for
+    // command lists that have already signalled, but have not been added to the
+    // available list yet. Each command list has a fence associated which tracks
+    // if a command list has completed dispatch of its commands and is ready for
+    // reuse. If a command list is found to have been signalled, then the
+    // command list & fence are reset and we return.
+    for (auto it = Queue->CommandListMap.begin();
+         it != Queue->CommandListMap.end(); ++it) {
+      // Make sure this is the command list type needed.
+      if (UseCopyEngine != it->second.isCopy(Queue))
+        continue;
+
+      ze_result_t ZeResult =
+          ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence));
+      if (ZeResult == ZE_RESULT_SUCCESS) {
+        Queue->resetCommandList(it, false);
+        CommandList = it;
+        CommandList->second.InUse = true;
+        return PI_SUCCESS;
+      }
+    }
 
-    ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
-    std::tie(CommandList, std::ignore) = Queue->CommandListMap.insert(
-        std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
-            ZeCommandList, {ZeFence, true, ZeCommandQueue, QueueGroupOrdinal}));
-    pi_result = PI_SUCCESS;
+    // If there are no available command lists nor signalled command lists, then
+    // we must create another command list if we have not exceed the maximum
+    // command lists we can create.
+    // Once created, this command list & fence are added to the command list
+    // fence map.
+    if (Queue->Device->Platform->ZeGlobalCommandListCount <
+        ZeMaxCommandListCacheSize) {
+      ze_command_list_handle_t ZeCommandList;
+      ze_fence_handle_t ZeFence;
+
+      uint32_t QueueGroupOrdinal;
+      auto &ZeCommandQueue =
+          Queue->getQueueGroup(UseCopyEngine).getZeQueue(&QueueGroupOrdinal);
+
+      ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
+      ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
+
+      ZE_CALL(zeCommandListCreate,
+              (Queue->Context->ZeContext, Queue->Device->ZeDevice,
+               &ZeCommandListDesc, &ZeCommandList));
+      // Increments the total number of command lists created on this platform.
+      Queue->Device->Platform->ZeGlobalCommandListCount++;
+
+      ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
+      std::tie(CommandList, std::ignore) = Queue->CommandListMap.insert(
+          std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
+              ZeCommandList,
+              {ZeFence, true, ZeCommandQueue, QueueGroupOrdinal}));
+      pi_result = PI_SUCCESS;
+    }
+    return pi_result;
+  } else {
+    CommandList = Queue->CommandListMap.begin();
   }
-}
-else {
-  CommandList = Queue->CommandListMap.begin();
-  pi_result = PI_SUCCESS;
-}
-
-return pi_result;
+  return PI_SUCCESS;
 }
 
 void _pi_queue::adjustBatchSizeForFullBatch(bool IsCopy) {

From 5d4eab16d87aec0cb3be643085f11d8b5c465a95 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Tue, 10 May 2022 11:25:51 -0500
Subject: [PATCH 14/18] fix formatting

---
 sycl/source/detail/queue_impl.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index d812ac8a4e356..7aefc92a3567f 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -277,10 +277,10 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
 #endif
 
 #if 1
-  if(has_property<ext::oneapi::property::queue::lazy_execution>()){
+  if (has_property<ext::oneapi::property::queue::lazy_execution>()) {
     const detail::plugin &Plugin = getPlugin();
     if (Plugin.getBackend() == backend::ext_oneapi_level_zero)
-        Plugin.call<detail::PiApiKind::piKernelLaunch>(getHandleRef());
+      Plugin.call<detail::PiApiKind::piKernelLaunch>(getHandleRef());
   }
 #endif
 

From 5956e20ee944942337d13de6b8c9f6e0f72e9039 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Wed, 25 May 2022 11:33:17 -0500
Subject: [PATCH 15/18] Extend API and incorporate more feedback.

---
 .../SYCL_EXT_ONEAPI_GRAPH.asciidoc            | 224 ++++++++++++++++--
 1 file changed, 202 insertions(+), 22 deletions(-)

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
index fea2d783aad24..4ec435f0cf23f 100644
--- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
+++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
@@ -60,6 +60,7 @@ Table 1. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro.
 == SYCL Graph Terminology
 
 Table 2. Terminology.
+[%header,cols="1,3"]
 |===
 |Concept|Description
 |graph| Class that stores structured work units and their dependencies
@@ -81,8 +82,6 @@ namespace sycl::ext::oneapi::experimental {
 }
 ----
 
-NOTE:
-
 == Edge
 
 A dependency between two nodes representing a happens-before relationship. `sender` and `receiver` may be associated to different graphs.
@@ -100,7 +99,7 @@ namespace sycl::ext::oneapi::experimental {
 
 Graph is a class that represents a directed acyclic graph of nodes. 
 A graph can have different states, can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. The execution of a graph has been completed when all leaf node tasks have been completed.
-Member functions as listed in Table 2 and 3 can be used to add nodes to a graph.
+Member functions as listed in Table 3 to 6 can be used to add nodes to a graph.
 
 [source,c++]
 ----
@@ -123,7 +122,18 @@ namespace sycl::ext::oneapi::experimental {
   
 }
 
-sycl::event sycl::queue(const graph<graph_state::executable> Graph);
+----
+
+The following member functions are added to the queue class.
+
+[source,c++]
+----
+
+namespace sycl {
+
+event submit(const ext::oneapi::experimental::graph<ext::oneapi::experimental::graph_state::executable>& my_graph);
+
+} // namespace sycl
 
 ----
 
@@ -133,46 +143,209 @@ A `graph` object in `graph_state::executable` represents a user generated device
 The structure of such a `graph` object in this state is immutable and cannot be changed, so are the tasks assigned with each node.
 Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error.
 
+=== Graph member and helper functions
+
 Table 3. Constructor of the `graph` class.
+[cols="2a,a"]
 |===
 |Constructor|Description
 
-|`graph()`
-|Creates a `graph` object. It's default state is `graph_state::modifiable`.
+|
+[source,c++]
+----
+/* available only when graph_state == modifiable */`
+graph();
+----
+|Creates a `graph` object.
 
 |===
 
 Table 4. Member functions of the `graph` class.
+[cols="2a,a"]
 |===
 |Member function|Description
 
-|`node add_node(const std::vector<node>& dep = {});`
+|
+[source,c++]
+----
+node add_node(const std::vector<node>& dep = {});
+----
 |This creates an empty node which is associated to no task. Its intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later.
 
-|`template<typename T>
-    	node add_node(T cgf, const std::vector<node>& dep = {});`
+|
+[source,c++]
+----
+template<typename T>
+    node add_node(T cgf, const std::vector<node>& dep = {});
+----
 |This node captures a command group function object containing host task which is scheduled by the SYCL runtime or a SYCL function for invoking kernels with all restrictions that apply as described in the spec.
 
 |===
 
+Memory that is allocated by the following functions is owned by the specific graph. When freed inside the graph, the memory is only accessible before the `free` node is executed and after the `malloc` node is executed.
+
 Table 5. Member functions of the `graph` class (memory operations).
+[cols="2a,a"]
 |===
 |Member function|Description
 
-|`node add_memcpy_node(void* dest, const void* src, size_t numBytes, const std::vector<node>& dep = {});`
+|
+[source,c++]
+----
+node memcpy(void* dest, const void* src, size_t numBytes, const std::vector<node>& dep = {});
+----
 |Adding a node that encapsulates a `memcpy` operation.
 
-|`node add_memset_node(void* ptr, int value, size_t numBytes, const std::vector<node>& dep = {});`
+|
+[source,c++]
+----
+template<typename T> node
+copy(const T* src, T* dest, size_t count, const std::vector<node>& dep = {});
+----
+|Adding a node that encapsulates a `copy` operation.
+
+|
+[source,c++]
+----
+node memset(void* ptr, int value, size_t numBytes, const std::vector<node>& dep = {});
+----
 |Adding a node that encapsulates a `memset` operation.
 
-|`node add_malloc_node(void *data, size_t numBytes, usm::alloc kind, const std::vector<node>& dep = {});`
+|
+[source,c++]
+----
+template<typename T>
+node fill(void* ptr, const T& pattern, size_t count, const std::vector<node>& dep = {});
+----
+|Adding a node that encapsulates a `fill` operation.
+
+|
+[source,c++]
+----
+node malloc(void *data, size_t numBytes, usm::alloc kind, const std::vector<node>& dep = {});
+----
+|Adding a node that encapsulates a `malloc` operation.
+
+|
+[source,c++]
+----
+node malloc_shared(void *data, size_t numBytes, const std::vector<node>& dep = {});
+----
+|Adding a node that encapsulates a `malloc` operation.
+
+|
+[source,c++]
+----
+node malloc_host(void *data, size_t numBytes, const std::vector<node>& dep = {});
+----
 |Adding a node that encapsulates a `malloc` operation.
 
-|`node add_free_node(void *data, const std::vector<node>& dep = {});`
+|
+[source,c++]
+----
+node malloc_device(void *data, size_t numBytes, const std::vector<node>& dep = {});
+----
+|Adding a node that encapsulates a `malloc` operation.
+
+|
+[source,c++]
+----
+node free(void *data, const std::vector<node>& dep = {});
+----
 |Adding a node that encapsulates a `free` operation.
 
 |===
 
+Table 6. Member functions of the `graph` class (convenience shortcuts).
+[cols="2a,a"]
+|===
+|Member function|Description
+
+|
+[source,c++]
+----
+template <typename KernelName, typename KernelType>
+node single_task(const KernelType &kernelFunc, const std::vector<node>& dep = {});
+----
+|Adding a node that encapsulates a `single_task` operation.
+
+|
+[source,c++]
+----
+template <typename KernelName, int Dims, typename... Rest>
+node parallel_for(range<Dims> numWorkItems, Rest&& rest, const std::vector<node>& dep = {});
+----
+|Adding a node that encapsulates a `parallel_for` operation.
+
+|
+[source,c++]
+----
+template <typename KernelName, int Dims, typename... Rest>
+node parallel_for(nd_range<Dims> executionRange, Rest&& rest, const std::vector<node>& dep = {});
+----
+|Adding a node that encapsulates a `parallel_for` operation.
+
+|===
+
+Table 7. Helper functions of the `graph` class.
+[cols="a,a"]
+|===
+|Function name|Description
+
+|
+[source,c++]
+----
+graph<graph_state::modifiable> make_graph();
+----
+|Creates a `graph` object. It's state is `graph_state::modifiable`.
+
+|===
+
+=== Node member functions
+
+Table 8. Constructor of the `node` class.
+[cols="a,a"]
+|===
+|Constructor|Description
+
+|
+[source,c++]
+----
+node();
+----
+|Creates an empty `node` object. That encapsulates no tasks and is not assigned to a graph. Prior to execution it has to be assigned to a graph.
+
+|===
+
+Table 9. Member functions of the `node` class.
+[cols="2a,a"]
+|===
+|Function name|Description
+
+|
+[source,c++]
+----
+void set_graph(graph<graph_state::modifiable>& Graph);
+----
+|Assigns a `node` object to a `graph`.
+
+|
+[source,c++]
+----
+template<typename T>
+void update(T cgf);
+----
+|Update a `node` object.
+
+|
+[source,c++]
+----
+template<typename T>
+void update(T cgf, graph<graph_state::modifiable>& Graph);
+----
+|Update a `node` object and assign it to a task.
+
+|===
 
 == Examples
 
@@ -196,31 +369,35 @@ int main() {
 
   auto g = sycl::ext::oneapi::experimental::make_graph();
 
-  float *x = sycl::malloc_shared<float>(n, q);
-  float *y = sycl::malloc_shared<float>(n, q);
-  float *z = sycl::malloc_shared<float>(n, q);
+  float *x , *y, *z;
+
+  auto n_x = g.malloc_shared<float>(x, n, q);
+  auto n_y = g.malloc_shared<float>(y, n, q);
+  auto n_z = g.malloc_shared<float>(z, n, q);
 
   float *dotp = sycl::malloc_shared<float>(1, q);
 
-  for (int i = 0; i < n; i++) {
+  /* init data by using usm shortcut */
+  auto n_i = g.parallel_for(n, [=](sycl::id<1> it){
+    const size_t i = it[0];
     x[i] = 1.0f;
     y[i] = 2.0f;
     z[i] = 3.0f;
-  }
+  }, {n_x, n_y, n_z});
 
   auto node_a = g.add_node([&](sycl::handler &h) {
     h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
       const size_t i = it[0];
       x[i] = alpha * x[i] + beta * y[i];
     });
-  });
+  }, {n_i});
 
   auto node_b = g.add_node([&](sycl::handler &h) {
     h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
       const size_t i = it[0];
       z[i] = gamma * z[i] + beta * y[i];
     });
-  });
+  }, {n_i});
 
   auto node_c = g.add_node(
       [&](sycl::handler &h) {
@@ -232,13 +409,15 @@ int main() {
                        });
       },
       {node_a, node_b});
+      
+  auto node_f1 = g.free(x, {node_c});
+  auto node_f1 = g.free(y, {node_b});
 
   auto exec = compile(q);
 
   q.submit(exec).wait();
 
-  sycl::free(x, q);
-  sycl::free(y, q);
+  // memory can be freed inside or outside the graph
   sycl::free(z, q);
   sycl::free(dotp, q);
 
@@ -271,4 +450,5 @@ Please, note that the following features are not yet implemented:
 |Rev|Date|Author|Changes
 |1|2022-02-11|Pablo Reble|Initial public working draft
 |2|2022-03-11|Pablo Reble|Incorporate feedback from PR
+|3|2022-05-25|Pablo Reble|Extend API and Example
 |========================================

From be05f1dc588a53092be69be1b292336dc32842c2 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Fri, 30 Sep 2022 14:10:47 -0500
Subject: [PATCH 16/18] Delete SYCL_EXT_ONEAPI_GRAPH.asciidoc

Developing doc in separate branch
---
 .../SYCL_EXT_ONEAPI_GRAPH.asciidoc            | 454 ------------------
 1 file changed, 454 deletions(-)
 delete mode 100644 sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc

diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
deleted file mode 100644
index 4ec435f0cf23f..0000000000000
--- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc
+++ /dev/null
@@ -1,454 +0,0 @@
-= SYCL_EXT_ONEAPI_GRAPH
-:source-highlighter: coderay
-:coderay-linenums-mode: table
-
-// This section needs to be after the document title.
-:doctype: book
-:toc2:
-:toc: left
-:encoding: utf-8
-:lang: en
-
-:blank: pass:[ +]
-
-// Set the default source code type in this document to C++,
-// for syntax highlighting purposes.  This is needed because
-// docbook uses c++ and html5 uses cpp.
-:language: {basebackend@docbook:c++:cpp}
-
-== Notice
-
-Copyright (c) 2022 Intel Corporation.  All rights reserved.
-
-IMPORTANT: This specification is a draft.
-
-NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
-trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc.
-used by permission by Khronos.
-
-NOTE: This document is better viewed when rendered as html with asciidoctor.
-GitHub does not render image icons.
-
-This extension is written against the SYCL 2020 revision 4 specification. All
-references below to the "core SYCL specification" or to section numbers in the
-SYCL specification refer to that revision.
-
-NOTE: This extension is experimental: interfaces are subject to change later.
-
-== Introduction
-
-This extension introduces an interface that enables a lazy execution and easy replay of a kernel graph by separating
-its definition and execution.
-
-== Feature test macro
-
-This extension provides a feature-test macro as described in the core SYCL
-specification section 6.3.3 "Feature test macros".  Therefore, an
-implementation supporting this extension must predefine the macro
-`SYCL_EXT_ONEAPI_GRAPH` to one of the values defined in the table below.
-Applications can test for the existence of this macro to determine if the
-implementation supports this feature, or applications can test the macro's
-value to determine which of the extension's APIs the implementation supports.
-
-Table 1. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro.
-[%header,cols="1,5"]
-|===
-|Value |Description
-|1     |Initial extension version. Base features are supported.
-|===
-
-== SYCL Graph Terminology
-
-Table 2. Terminology.
-[%header,cols="1,3"]
-|===
-|Concept|Description
-|graph| Class that stores structured work units and their dependencies
-|node| The unit of work. Can have different attributes.
-|edge| Dependency between work units. Happens-before relation.
-|===
-
-== Node
-
-Node is a class that encapsulates tasks like SYCL kernel functions or host tasks for deferred execution.
-A graph has to be created first, the structure of a graph is defined second by adding nodes and edges.
-
-[source,c++]
-----
-namespace sycl::ext::oneapi::experimental {
-
-  class node{
-  };
-}
-----
-
-== Edge
-
-A dependency between two nodes representing a happens-before relationship. `sender` and `receiver` may be associated to different graphs.
-
-[source,c++]
-----
-namespace sycl::ext::oneapi::experimental {
-
-	// Adding dependency between two nodes.
-    	void make_edge(node sender, node receiver);
-}
-----
-
-== Graph
-
-Graph is a class that represents a directed acyclic graph of nodes. 
-A graph can have different states, can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. The execution of a graph has been completed when all leaf node tasks have been completed.
-Member functions as listed in Table 3 to 6 can be used to add nodes to a graph.
-
-[source,c++]
-----
-namespace sycl::ext::oneapi::experimental {
-
-  enum class graph_state{
-    modifiable,
-    executable
-  };
-
-  template<graph_state State>
-  class graph {
-  public:
-    operator graph<graph_state::executable>();
-  };
-  
-  graph<graph_state::modifiable> make_graph();
-  
-  graph<graph_state::executable> compile(const graph<graph_state::modifiable> Graph);
-  
-}
-
-----
-
-The following member functions are added to the queue class.
-
-[source,c++]
-----
-
-namespace sycl {
-
-event submit(const ext::oneapi::experimental::graph<ext::oneapi::experimental::graph_state::executable>& my_graph);
-
-} // namespace sycl
-
-----
-
-=== Executable Graph
-
-A `graph` object in `graph_state::executable` represents a user generated device and context specific execution object that is submitted to a queue for execution.
-The structure of such a `graph` object in this state is immutable and cannot be changed, so are the tasks assigned with each node.
-Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error.
-
-=== Graph member and helper functions
-
-Table 3. Constructor of the `graph` class.
-[cols="2a,a"]
-|===
-|Constructor|Description
-
-|
-[source,c++]
-----
-/* available only when graph_state == modifiable */`
-graph();
-----
-|Creates a `graph` object.
-
-|===
-
-Table 4. Member functions of the `graph` class.
-[cols="2a,a"]
-|===
-|Member function|Description
-
-|
-[source,c++]
-----
-node add_node(const std::vector<node>& dep = {});
-----
-|This creates an empty node which is associated to no task. Its intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later.
-
-|
-[source,c++]
-----
-template<typename T>
-    node add_node(T cgf, const std::vector<node>& dep = {});
-----
-|This node captures a command group function object containing host task which is scheduled by the SYCL runtime or a SYCL function for invoking kernels with all restrictions that apply as described in the spec.
-
-|===
-
-Memory that is allocated by the following functions is owned by the specific graph. When freed inside the graph, the memory is only accessible before the `free` node is executed and after the `malloc` node is executed.
-
-Table 5. Member functions of the `graph` class (memory operations).
-[cols="2a,a"]
-|===
-|Member function|Description
-
-|
-[source,c++]
-----
-node memcpy(void* dest, const void* src, size_t numBytes, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `memcpy` operation.
-
-|
-[source,c++]
-----
-template<typename T> node
-copy(const T* src, T* dest, size_t count, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `copy` operation.
-
-|
-[source,c++]
-----
-node memset(void* ptr, int value, size_t numBytes, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `memset` operation.
-
-|
-[source,c++]
-----
-template<typename T>
-node fill(void* ptr, const T& pattern, size_t count, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `fill` operation.
-
-|
-[source,c++]
-----
-node malloc(void *data, size_t numBytes, usm::alloc kind, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `malloc` operation.
-
-|
-[source,c++]
-----
-node malloc_shared(void *data, size_t numBytes, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `malloc` operation.
-
-|
-[source,c++]
-----
-node malloc_host(void *data, size_t numBytes, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `malloc` operation.
-
-|
-[source,c++]
-----
-node malloc_device(void *data, size_t numBytes, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `malloc` operation.
-
-|
-[source,c++]
-----
-node free(void *data, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `free` operation.
-
-|===
-
-Table 6. Member functions of the `graph` class (convenience shortcuts).
-[cols="2a,a"]
-|===
-|Member function|Description
-
-|
-[source,c++]
-----
-template <typename KernelName, typename KernelType>
-node single_task(const KernelType &kernelFunc, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `single_task` operation.
-
-|
-[source,c++]
-----
-template <typename KernelName, int Dims, typename... Rest>
-node parallel_for(range<Dims> numWorkItems, Rest&& rest, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `parallel_for` operation.
-
-|
-[source,c++]
-----
-template <typename KernelName, int Dims, typename... Rest>
-node parallel_for(nd_range<Dims> executionRange, Rest&& rest, const std::vector<node>& dep = {});
-----
-|Adding a node that encapsulates a `parallel_for` operation.
-
-|===
-
-Table 7. Helper functions of the `graph` class.
-[cols="a,a"]
-|===
-|Function name|Description
-
-|
-[source,c++]
-----
-graph<graph_state::modifiable> make_graph();
-----
-|Creates a `graph` object. It's state is `graph_state::modifiable`.
-
-|===
-
-=== Node member functions
-
-Table 8. Constructor of the `node` class.
-[cols="a,a"]
-|===
-|Constructor|Description
-
-|
-[source,c++]
-----
-node();
-----
-|Creates an empty `node` object. That encapsulates no tasks and is not assigned to a graph. Prior to execution it has to be assigned to a graph.
-
-|===
-
-Table 9. Member functions of the `node` class.
-[cols="2a,a"]
-|===
-|Function name|Description
-
-|
-[source,c++]
-----
-void set_graph(graph<graph_state::modifiable>& Graph);
-----
-|Assigns a `node` object to a `graph`.
-
-|
-[source,c++]
-----
-template<typename T>
-void update(T cgf);
-----
-|Update a `node` object.
-
-|
-[source,c++]
-----
-template<typename T>
-void update(T cgf, graph<graph_state::modifiable>& Graph);
-----
-|Update a `node` object and assign it to a task.
-
-|===
-
-== Examples
-
-NOTE: The examples below demonstrate intended usage of the extension, but are not compatible with the proof-of-concept implementation. The proof-of-concept implementation currently requires different syntax, as described in the "Non-implemented features" section at the end of this document.
-
-1. Dot product
-
-[source,c++]
-----
-...
-
-#include <sycl/ext/oneapi/experimental/graph.hpp>
-
-int main() {
-  const size_t n = 10;
-  float alpha = 1.0f;
-  float beta = 2.0f;
-  float gamma = 3.0f;
-
-  sycl::queue q;
-
-  auto g = sycl::ext::oneapi::experimental::make_graph();
-
-  float *x , *y, *z;
-
-  auto n_x = g.malloc_shared<float>(x, n, q);
-  auto n_y = g.malloc_shared<float>(y, n, q);
-  auto n_z = g.malloc_shared<float>(z, n, q);
-
-  float *dotp = sycl::malloc_shared<float>(1, q);
-
-  /* init data by using usm shortcut */
-  auto n_i = g.parallel_for(n, [=](sycl::id<1> it){
-    const size_t i = it[0];
-    x[i] = 1.0f;
-    y[i] = 2.0f;
-    z[i] = 3.0f;
-  }, {n_x, n_y, n_z});
-
-  auto node_a = g.add_node([&](sycl::handler &h) {
-    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
-      const size_t i = it[0];
-      x[i] = alpha * x[i] + beta * y[i];
-    });
-  }, {n_i});
-
-  auto node_b = g.add_node([&](sycl::handler &h) {
-    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
-      const size_t i = it[0];
-      z[i] = gamma * z[i] + beta * y[i];
-    });
-  }, {n_i});
-
-  auto node_c = g.add_node(
-      [&](sycl::handler &h) {
-        h.parallel_for(sycl::range<1>{n},
-                       sycl::reduction(dotp, 0.0f, std::plus()),
-                       [=](sycl::id<1> it, auto &sum) {
-                         const size_t i = it[0];
-                         sum += x[i] * z[i];
-                       });
-      },
-      {node_a, node_b});
-      
-  auto node_f1 = g.free(x, {node_c});
-  auto node_f1 = g.free(y, {node_b});
-
-  auto exec = compile(q);
-
-  q.submit(exec).wait();
-
-  // memory can be freed inside or outside the graph
-  sycl::free(z, q);
-  sycl::free(dotp, q);
-
-  return 0;
-}
-
-
-...
-----
-
-== Issues for later investigations
-
-. Explicit memory movement can cause POC to stall.
-
-== Non-implemented features
-Please, note that the following features are not yet implemented:
-
-. Level Zero backend only
-. Memory operation nodes not implemented
-. Host node not implemented
-. Submit overload of a queue. `submit(graph)` Use a combination of `executable_graph::exec_and_wait()` and queue property `sycl::ext::oneapi::property::queue::lazy_execution{}` instead.
-. `class graph<graph_state>` Use dedicated `class graph` (equivalent to `graph_state == modifiable`) and `class executable_graph` (equivalent to `graph_state == executable`) instead.
-
-== Revision History
-
-[cols="5,15,15,70"]
-[grid="rows"]
-[options="header"]
-|========================================
-|Rev|Date|Author|Changes
-|1|2022-02-11|Pablo Reble|Initial public working draft
-|2|2022-03-11|Pablo Reble|Incorporate feedback from PR
-|3|2022-05-25|Pablo Reble|Extend API and Example
-|========================================

From f03870eb84ed18af61729b5f7cfa02a8bc3d00c9 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Wed, 12 Oct 2022 08:03:55 -0700
Subject: [PATCH 17/18] update API to recent proposal

---
 .../sycl/ext/oneapi/experimental/graph.hpp    | 71 +++++++++----------
 1 file changed, 34 insertions(+), 37 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
index 2d61c03be02e6..304a928f4e7e4 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
@@ -125,12 +125,6 @@ struct graph_impl {
 
 } // namespace detail
 
-class node;
-
-class graph;
-
-class executable_graph;
-
 struct node {
   // TODO: add properties to distinguish between empty, host, device nodes.
   detail::node_ptr my_node;
@@ -147,31 +141,20 @@ struct node {
   // TODO: Add query functions: is_root, ...
 };
 
-class executable_graph {
-public:
-  int my_tag;
-  sycl::queue my_queue;
-
-  void exec_and_wait(); // { my_queue.wait(); }
-
-  executable_graph(detail::graph_ptr g, sycl::queue q)
-      : my_queue(q), my_tag(rand()) {
-    g->exec(my_queue);
-  }
+enum class graph_state{
+  modifiable,
+  executable
 };
 
-class graph {
+template<graph_state State=graph_state::modifiable>
+class command_graph {
 public:
   // Adding empty node with [0..n] predecessors:
-  node add_empty_node(const std::vector<node> &dep = {});
-
-  // Adding node for host task
-  template <typename T>
-  node add_host_node(T hostTaskCallable, const std::vector<node> &dep = {});
+  node add(const std::vector<node> &dep = {});
 
   // Adding device node:
   template <typename T>
-  node add_device_node(T cgf, const std::vector<node> &dep = {});
+  node add(T cgf, const std::vector<node> &dep = {});
 
   // Adding dependency between two nodes.
   void make_edge(node sender, node receiver);
@@ -179,25 +162,33 @@ class graph {
   // TODO: Extend queue to directly submit graph
   void exec_and_wait(sycl::queue q);
 
-  executable_graph exec(sycl::queue q) {
-    return executable_graph{my_graph, q};
-  };
+  command_graph<graph_state::executable> finalize(const sycl::context &syclContext) const;
+
+  command_graph() : my_graph(new detail::graph_impl()) {}
 
-  graph() : my_graph(new detail::graph_impl()) {}
+private:
+  detail::graph_ptr my_graph;
+};
 
-  // Creating a subgraph (with predecessors)
-  graph(graph &parent, const std::vector<node> &dep = {}) {}
+template<>
+class command_graph<graph_state::executable>{
+public:
+  int my_tag;
+  const sycl::context& my_ctx;
 
-  bool is_subgraph();
+  void exec_and_wait(sycl::queue q);
+
+  command_graph() = delete;
+
+  command_graph(detail::graph_ptr g, const sycl::context& ctx)
+      : my_graph(g) , my_ctx(ctx), my_tag(rand()) {}
 
 private:
   detail::graph_ptr my_graph;
 };
 
-void executable_graph::exec_and_wait() { my_queue.wait(); }
-
-template <typename T>
-node graph::add_device_node(T cgf, const std::vector<node> &dep) {
+template<> template<typename T>
+node command_graph<graph_state::modifiable>::add(T cgf, const std::vector<node> &dep) {
   node _node(my_graph, cgf);
   if (!dep.empty()) {
     for (auto n : dep)
@@ -208,13 +199,19 @@ node graph::add_device_node(T cgf, const std::vector<node> &dep) {
   return _node;
 }
 
-void graph::make_edge(node sender, node receiver) {
+template<>
+void command_graph<graph_state::modifiable>::make_edge(node sender, node receiver) {
   sender.register_successor(receiver);     // register successor
   my_graph->remove_root(receiver.my_node); // remove receiver from root node
                                            // list
 }
 
-void graph::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); };
+template<>
+command_graph<graph_state::executable> command_graph<graph_state::modifiable>::finalize(const sycl::context &ctx) const {
+  return command_graph<graph_state::executable>{ this->my_graph, ctx };
+}
+
+void command_graph<graph_state::executable>::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); };
 
 } // namespace experimental
 } // namespace oneapi

From 9fe0962084be36915f6b886fb7668c061eaaee71 Mon Sep 17 00:00:00 2001
From: Pablo Reble <pablo.reble@intel.com>
Date: Wed, 12 Oct 2022 11:33:57 -0700
Subject: [PATCH 18/18] fix rebase issue

---
 sycl/plugins/level_zero/pi_level_zero.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index c598f3432dd5e..058437e0285c2 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -874,7 +874,7 @@ bool _pi_queue::isInOrderQueue() const {
 
 bool _pi_queue::isEagerExec() const {
   // If lazy exec queue property is not set, then it's an eager queue.
-  return ((this->Properties & (1 << 11)) == 0);
+  return ((this->Properties & PI_QUEUE_LAZY_EXECUTION) == 0);
 }
 
 pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,