diff --git a/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp b/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp
new file mode 100644
index 0000000000000..2f5376a3a536c
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp
@@ -0,0 +1,37 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected Fail as exception not implemented yet
+// XFAIL: *
+
+// Tests attempting to add a node to a command_graph while it is being
+// recorded to by a queue is an error.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  bool Success = false;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+  Graph.begin_recording(Queue);
+
+  try {
+    Graph.add([&](handler &CGH) {});
+  } catch (sycl::exception &E) {
+    auto StdErrc = E.code().value();
+    if (StdErrc == static_cast<int>(errc::invalid)) {
+      Success = true;
+    }
+  }
+
+  Graph.end_recording();
+  assert(Success);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
new file mode 100644
index 0000000000000..8ad6a413aea03
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/add_nodes_after_finalize.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp b/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp
new file mode 100644
index 0000000000000..6191a875bbe41
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/basic_buffer.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
new file mode 100644
index 0000000000000..c7adb7f282da4
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/basic_usm.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
new file mode 100644
index 0000000000000..79e53ff4ba9d9
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/basic_usm_host.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
new file mode 100644
index 0000000000000..fa5a2a1f018e6
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/basic_usm_mixed.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
new file mode 100644
index 0000000000000..1b7447940e1fe
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/basic_usm_shared.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp
new file mode 100644
index 0000000000000..26e5473bded66
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/basic_usm_system.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy.cpp
new file mode 100644
index 0000000000000..3c291d4d44393
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/buffer_copy.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/buffer_copy.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_2d.cpp
new file mode 100644
index 0000000000000..446d75316e6e2
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_2d.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/buffer_copy_2d.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target.cpp
new file mode 100644
index 0000000000000..8c233ec8de66e
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/buffer_copy_host2target.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_2d.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_2d.cpp
new file mode 100644
index 0000000000000..9c33e885ce8a5
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_2d.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/buffer_copy_host2target_2d.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_offset.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_offset.cpp
new file mode 100644
index 0000000000000..2c26c24744f0e
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_offset.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/buffer_copy_host2target_offset.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_offsets.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_offsets.cpp
new file mode 100644
index 0000000000000..746b41f4e0a76
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_offsets.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/buffer_copy_offsets.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host.cpp
new file mode 100644
index 0000000000000..e3a9ceb3160a2
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/buffer_copy_target2host.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_2d.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_2d.cpp
new file mode 100644
index 0000000000000..f9945ebf3ee58
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_2d.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/buffer_copy_target2host_2d.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_offset.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_offset.cpp
new file mode 100644
index 0000000000000..c51b9e445137c
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_offset.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/buffer_copy_target2host_offset.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp b/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp
new file mode 100644
index 0000000000000..2c2edd374febc
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/buffer_ordering.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/depends_on.cpp b/sycl/test-e2e/Graph/Explicit/depends_on.cpp
new file mode 100644
index 0000000000000..6454a0eeeaa0e
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/depends_on.cpp
@@ -0,0 +1,49 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests that an event returned from adding a graph node using the queue
+// recording API can be passed to `handler::depends_on` inside a node
+// added using the explicit API. This should create a graph edge.
+
+#include "../graph_common.hpp"
+
+int main() {
+
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *Arr = malloc_device<float>(N, Queue);
+
+  Graph.begin_recording(Queue);
+  // `Event` corresponds to a graph node
+  event Event = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) { Arr[idx] = 42.0f; });
+  });
+  Graph.end_recording(Queue);
+
+  Graph.add([&](handler &CGH) {
+    CGH.depends_on(Event); // creates edge to recorded graph node
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) { Arr[idx] *= 2.0f; });
+  });
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();
+
+  constexpr float ref = 42.0f * 2.0f;
+  std::vector<float> Output(N);
+  Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait();
+  for (int i = 0; i < N; i++)
+    assert(Output[i] == ref);
+
+  sycl::free(Arr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Explicit/dotp_buffer_reduction.cpp b/sycl/test-e2e/Graph/Explicit/dotp_buffer_reduction.cpp
new file mode 100644
index 0000000000000..da9bdc71ba466
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/dotp_buffer_reduction.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as reduction support is not complete.
+// XFAIL: *
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/dotp_buffer_reduction.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/dotp_usm_reduction.cpp b/sycl/test-e2e/Graph/Explicit/dotp_usm_reduction.cpp
new file mode 100644
index 0000000000000..c54477e61dac6
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/dotp_usm_reduction.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as reduction support is not complete.
+// XFAIL: *
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/dotp_usm_reduction.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/double_buffer.cpp b/sycl/test-e2e/Graph/Explicit/double_buffer.cpp
new file mode 100644
index 0000000000000..94e09289577ff
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/double_buffer.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as executable graph update isn't implemented yet
+// XFAIL: *
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/double_buffer.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/empty_node.cpp b/sycl/test-e2e/Graph/Explicit/empty_node.cpp
new file mode 100644
index 0000000000000..687a25b923d78
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/empty_node.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/empty_node.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp
new file mode 100644
index 0000000000000..6948090873181
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp
@@ -0,0 +1,89 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Test submitting the same graph twice with another command in between, this
+// intermediate command depends on the first submission of the graph, and
+// is a dependency of the second submission of the graph.
+
+#include "../graph_common.hpp"
+int main() {
+
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *Arr = malloc_shared<float>(N, Queue);
+
+  // Buffer elements set to 0.5
+  auto E1 = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Arr[i] = 0.5f;
+    });
+  });
+
+  Graph.add([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Arr[i] += 0.25f;
+    });
+  });
+
+  // Buffer elements set to 1.5
+  auto E2 = Queue.submit([&](handler &CGH) {
+    CGH.depends_on(E1);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Arr[i] += 1.0f;
+    });
+  });
+
+  auto ExecGraph = Graph.finalize();
+
+  // Buffer elements set to 3.0
+  auto E3 = Queue.submit([&](handler &CGH) {
+    CGH.depends_on(E2);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Arr[i] *= 2.0f;
+    });
+  });
+
+  // Buffer elements set to 3.25
+  auto E4 = Queue.submit([&](handler &CGH) {
+    CGH.depends_on(E3);
+    CGH.ext_oneapi_graph(ExecGraph);
+  });
+
+  // Buffer elements set to 6.5
+  auto E5 = Queue.submit([&](handler &CGH) {
+    CGH.depends_on(E4);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Arr[i] *= 2.0f;
+    });
+  });
+
+  // Buffer elements set to 6.75
+  Queue.submit([&](handler &CGH) {
+    CGH.depends_on(E5);
+    CGH.ext_oneapi_graph(ExecGraph);
+  });
+
+  Queue.wait();
+
+  for (size_t i = 0; i < N; i++) {
+    assert(Arr[i] == 6.75f);
+  }
+
+  // Free the allocated memory
+  sycl::free(Arr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Explicit/event_status_querying.cpp b/sycl/test-e2e/Graph/Explicit/event_status_querying.cpp
new file mode 100644
index 0000000000000..4d5831d494aa1
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/event_status_querying.cpp
@@ -0,0 +1,9 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out 2>&1 | FileCheck %s
+//
+// CHECK: complete
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/event_status_querying.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/executable_graph_update.cpp b/sycl/test-e2e/Graph/Explicit/executable_graph_update.cpp
new file mode 100644
index 0000000000000..bcb9eeed01071
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/executable_graph_update.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as executable graph update not implemented yet
+// XFAIL: *
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/executable_graph_update.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/executable_graph_update_ordering.cpp b/sycl/test-e2e/Graph/Explicit/executable_graph_update_ordering.cpp
new file mode 100644
index 0000000000000..dacd28e730139
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/executable_graph_update_ordering.cpp
@@ -0,0 +1,15 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as executable graph update and host tasks both aren't
+// implemented.
+// XFAIL: *
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/executable_graph_update_ordering"
diff --git a/sycl/test-e2e/Graph/Explicit/host_task.cpp b/sycl/test-e2e/Graph/Explicit/host_task.cpp
new file mode 100644
index 0000000000000..1f6b7d931f194
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/host_task.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as host tasks aren't implemented yet.
+// XFAIL: *
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/host_task.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
new file mode 100644
index 0000000000000..a414e3f4b8d6c
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/multiple_exec_graphs.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/node_ordering.cpp b/sycl/test-e2e/Graph/Explicit/node_ordering.cpp
new file mode 100644
index 0000000000000..233013ba5cf8f
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/node_ordering.cpp
@@ -0,0 +1,51 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests that graph.make_edge() correctly defines the dependency between two
+// nodes.
+
+#include "../graph_common.hpp"
+
+int main() {
+
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+
+  auto Init = Graph.add([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) { X[idx] = 2.0f; });
+  });
+
+  auto Add = Graph.add([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) { X[idx] += 2.0f; });
+  });
+
+  auto Mult = Graph.add([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) { X[idx] *= 3.0f; });
+  });
+
+  Graph.make_edge(Init, Mult);
+  Graph.make_edge(Mult, Add);
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();
+
+  std::vector<float> Output(N);
+  Queue.memcpy(Output.data(), X, N * sizeof(float)).wait();
+
+  for (int i = 0; i < N; i++)
+    assert(Output[i] == 8.0f);
+
+  sycl::free(X, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp
new file mode 100644
index 0000000000000..68e89093c15b1
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/queue_shortcuts.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
new file mode 100644
index 0000000000000..305831a6abb9b
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/repeated_exec.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/single_node.cpp b/sycl/test-e2e/Graph/Explicit/single_node.cpp
new file mode 100644
index 0000000000000..766f6f08de281
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/single_node.cpp
@@ -0,0 +1,52 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests adding a node to the graph with explicit API works as expected.
+
+#include "../graph_common.hpp"
+
+int main() {
+
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *Arr = malloc_device<float>(N, Queue);
+
+  float ZeroPattern = 0.0f;
+  Queue.fill(Arr, ZeroPattern, N).wait();
+
+  Graph.add([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Arr[i] = 3.14f;
+    });
+  });
+
+  std::vector<float> Output(N);
+  Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait();
+  for (int i = 0; i < N; i++)
+    assert(Output[i] == 0);
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait();
+  for (int i = 0; i < N; i++)
+    assert(Output[i] == 0);
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();
+
+  Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait();
+  for (int i = 0; i < N; i++)
+    assert(Output[i] == 3.14f);
+
+  sycl::free(Arr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Explicit/stream.cpp b/sycl/test-e2e/Graph/Explicit/stream.cpp
new file mode 100644
index 0000000000000..5c3ba8764ff51
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/stream.cpp
@@ -0,0 +1,30 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out %GPU_CHECK_PLACEHOLDER
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out %GPU_CHECK_PLACEHOLDER 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as sycl streams aren't implemented yet
+// XFAIL: *
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/stream.cpp"
+
+// CHECK-DAG: Val: 1
+// CHECK-DAG: Val: 2
+// CHECK-DAG: Val: 3
+// CHECK-DAG: Val: 4
+// CHECK-DAG: Val: 5
+// CHECK-DAG: Val: 6
+// CHECK-DAG: Val: 7
+// CHECK-DAG: Val: 8
+// CHECK-DAG: Val: 9
+// CHECK-DAG: Val: 10
+// CHECK-DAG: Val: 11
+// CHECK-DAG: Val: 12
+// CHECK-DAG: Val: 13
+// CHECK-DAG: Val: 14
+// CHECK-DAG: Val: 15
+// CHECK-DAG: Val: 16
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
new file mode 100644
index 0000000000000..154ea4e3470e3
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/sub_graph.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
new file mode 100644
index 0000000000000..edce73a46ad73
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/sub_graph_execute_without_parent.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
new file mode 100644
index 0000000000000..14e447c04104f
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// XFAIL:*
+// Submit a graph as a subgraph more than once doesn't yet work.
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/sub_graph_multiple_submission.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
new file mode 100644
index 0000000000000..fe906bb7aba14
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/sub_graph_nested.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_reduction.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_reduction.cpp
new file mode 100644
index 0000000000000..73c1ddc5d520c
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_reduction.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as reduction support is not complete.
+// XFAIL: *
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/sub_graph_reduction.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
new file mode 100644
index 0000000000000..cbf768203ec5a
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// XFAIL: *
+// Subgraph doesn't work properly in second parent graph
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/sub_graph_two_parent_graphs.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp
new file mode 100644
index 0000000000000..d5b3ff7412b61
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/temp_buffer_reinterpret.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/usm_copy.cpp b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp
new file mode 100644
index 0000000000000..e0771a3e6d082
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/usm_copy.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp
new file mode 100644
index 0000000000000..463cc77b09871
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/usm_fill.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp
new file mode 100644
index 0000000000000..bf53345453a4b
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/usm_fill_host.cpp"
diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp
new file mode 100644
index 0000000000000..134dda9aaff6c
--- /dev/null
+++ b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_EXPLICIT
+
+#include "../Inputs/usm_fill_shared.cpp"
diff --git a/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp
new file mode 100644
index 0000000000000..527beb526a177
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp
@@ -0,0 +1,90 @@
+// This test adds a new node to an already finalized
+// modifiable graph, before finalizing and executing the graph for a second
+// time.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = unsigned int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size), DataOut(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+  std::iota(DataOut.begin(), DataOut.end(), 1000);
+
+  std::vector<T> ReferenceC(DataC);
+  std::vector<T> ReferenceOut(DataOut);
+  for (unsigned n = 0; n < Iterations * 2; n++) {
+    for (size_t i = 0; i < Size; i++) {
+      ReferenceC[i] += (DataA[i] + DataB[i]);
+      if (n >= Iterations)
+        ReferenceOut[i] += ReferenceC[i] + 1;
+    }
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+  T *PtrOut = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.copy(DataOut.data(), PtrOut, Size);
+  Queue.wait_and_throw();
+
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size),
+                     [=](item<1> id) { PtrC[id] += PtrA[id] + PtrB[id]; });
+  });
+
+  auto GraphExec = Graph.finalize();
+
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeA);
+        CGH.parallel_for(range<1>(Size),
+                         [=](item<1> id) { PtrOut[id] += PtrC[id] + 1; });
+      },
+      NodeA);
+
+  auto GraphExecAdditional = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExecAdditional);
+    });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.copy(PtrOut, DataOut.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+  free(PtrOut, Queue);
+
+  assert(ReferenceC == DataC);
+  assert(ReferenceOut == DataOut);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp b/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp
new file mode 100644
index 0000000000000..0d34e3f51a822
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp
@@ -0,0 +1,56 @@
+// Tests adding nodes to a graph and submitting the graph
+// using buffers accessors for inputs and outputs.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = unsigned short;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  {
+    exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+    buffer<T> BufferA{DataA.data(), range<1>{DataA.size()}};
+    BufferA.set_write_back(false);
+    buffer<T> BufferB{DataB.data(), range<1>{DataB.size()}};
+    BufferB.set_write_back(false);
+    buffer<T> BufferC{DataC.data(), range<1>{DataC.size()}};
+    BufferC.set_write_back(false);
+
+    // Add commands to graph
+    add_nodes(Graph, Queue, Size, BufferA, BufferB, BufferC);
+
+    auto GraphExec = Graph.finalize();
+
+    event Event;
+    for (unsigned n = 0; n < Iterations; n++) {
+      Event = Queue.submit([&](handler &CGH) {
+        CGH.depends_on(Event);
+        CGH.ext_oneapi_graph(GraphExec);
+      });
+    }
+    Queue.wait_and_throw();
+
+    host_accessor HostAccA(BufferA);
+    host_accessor HostAccB(BufferB);
+    host_accessor HostAccC(BufferC);
+
+    for (size_t i = 0; i < Size; i++) {
+      assert(ReferenceA[i] == HostAccA[i]);
+      assert(ReferenceB[i] == HostAccB[i]);
+      assert(ReferenceC[i] == HostAccC[i]);
+    }
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm.cpp
new file mode 100644
index 0000000000000..5a8c9291ff0ef
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/basic_usm.cpp
@@ -0,0 +1,61 @@
+// Tests basic adding of nodes with USM pointers,
+// and submission of the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // Add commands to graph
+  add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC);
+
+  auto GraphExec = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp
new file mode 100644
index 0000000000000..57f9ae372f627
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp
@@ -0,0 +1,65 @@
+// Tests basic adding of nodes with USM host pointers,
+// and submission of the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  if (!Queue.get_device().has(sycl::aspect::usm_host_allocations)) {
+    return 0;
+  }
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_host<T>(Size, Queue);
+  T *PtrB = malloc_host<T>(Size, Queue);
+  T *PtrC = malloc_host<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // Add commands to graph
+  add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC);
+
+  auto GraphExec = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp
new file mode 100644
index 0000000000000..6ee9c7f2030e3
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp
@@ -0,0 +1,68 @@
+// Tests basic adding of nodes with mixed USM pointers,
+// and submission of the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) {
+    return 0;
+  }
+  if (!Queue.get_device().has(sycl::aspect::usm_host_allocations)) {
+    return 0;
+  }
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_shared<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_host<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // Add commands to graph
+  add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC);
+
+  auto GraphExec = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp
new file mode 100644
index 0000000000000..71aa234752bb4
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp
@@ -0,0 +1,65 @@
+// Tests basic adding of nodes with USM shared pointers,
+// and submission of the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) {
+    return 0;
+  }
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_shared<T>(Size, Queue);
+  T *PtrB = malloc_shared<T>(Size, Queue);
+  T *PtrC = malloc_shared<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // Add commands to graph
+  add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC);
+
+  auto GraphExec = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_system.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_system.cpp
new file mode 100644
index 0000000000000..f29c0d3afe69c
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/basic_usm_system.cpp
@@ -0,0 +1,65 @@
+// Tests basic adding of nodes with USM system allocations,
+// and submission of the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  if (!Queue.get_device().has(sycl::aspect::usm_system_allocations)) {
+    return 0;
+  }
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = new T[Size];
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // Add commands to graph
+  add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC);
+
+  auto GraphExec = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  delete[] PtrC;
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp
new file mode 100644
index 0000000000000..56623b53b2d36
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp
@@ -0,0 +1,123 @@
+// Tests adding a buffer copy node and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  const T ModValue = 7;
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  // Create reference data for output
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  for (unsigned i = 0; i < Iterations; i++) {
+    for (size_t j = 0; j < Size; j++) {
+      ReferenceA[j] = ReferenceB[j];
+      ReferenceA[j] += ModValue;
+      ReferenceB[j] = ReferenceA[j];
+      ReferenceB[j] += ModValue;
+      ReferenceC[j] = ReferenceB[j];
+    }
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  buffer BufferA{DataA};
+  BufferA.set_write_back(false);
+  buffer BufferB{DataB};
+  BufferB.set_write_back(false);
+  buffer BufferC{DataC};
+  BufferC.set_write_back(false);
+
+  // Copy from B to A
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access(CGH);
+    auto AccB = BufferB.get_access(CGH);
+    CGH.copy(AccB, AccA);
+  });
+
+  // Read & write A
+  auto NodeB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        auto AccA = BufferA.get_access(CGH);
+        CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+          auto LinID = id.get_linear_id();
+          AccA[LinID] += ModValue;
+        });
+      },
+      NodeA);
+
+  // Read & write B
+  auto NodeModB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        auto AccB = BufferB.get_access(CGH);
+        CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+          auto LinID = id.get_linear_id();
+          AccB[LinID] += ModValue;
+        });
+      },
+      NodeA);
+
+  // memcpy from A to B
+  auto NodeC = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        auto AccA = BufferA.get_access(CGH);
+        auto AccB = BufferB.get_access(CGH);
+        CGH.copy(AccA, AccB);
+      },
+      NodeB, NodeModB);
+
+  // Read and write B
+  auto NodeD = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        auto AccB = BufferB.get_access(CGH);
+        CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+          auto LinID = id.get_linear_id();
+          AccB[LinID] += ModValue;
+        });
+      },
+      NodeC);
+
+  // Copy from B to C
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        auto AccB = BufferB.get_access(CGH);
+        auto AccC = BufferC.get_access(CGH);
+        CGH.copy(AccB, AccC);
+      },
+      NodeD);
+
+  auto GraphExec = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+  Queue.wait_and_throw();
+
+  host_accessor HostAccA(BufferA);
+  host_accessor HostAccB(BufferB);
+  host_accessor HostAccC(BufferC);
+
+  for (size_t i = 0; i < Size; i++) {
+    assert(ReferenceA[i] == HostAccA[i]);
+    assert(ReferenceB[i] == HostAccB[i]);
+    assert(ReferenceC[i] == HostAccC[i]);
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp
new file mode 100644
index 0000000000000..574cf9c84981c
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp
@@ -0,0 +1,121 @@
+// Tests adding buffer 2d copy nodes and submitting
+// the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  const T ModValue = 7;
+  std::vector<T> DataA(Size * Size), DataB(Size * Size), DataC(Size * Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  // Create reference data for output
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  for (unsigned i = 0; i < Iterations; i++) {
+    for (size_t j = 0; j < Size * Size; j++) {
+      ReferenceA[j] = ReferenceB[j];
+      ReferenceA[j] += ModValue;
+      ReferenceB[j] = ReferenceA[j];
+      ReferenceB[j] += ModValue;
+      ReferenceC[j] = ReferenceB[j];
+    }
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  // Make the buffers 2D so we can test the rect copy path
+  buffer BufferA{DataA.data(), range<2>(Size, Size)};
+  BufferA.set_write_back(false);
+  buffer BufferB{DataB.data(), range<2>(Size, Size)};
+  BufferB.set_write_back(false);
+  buffer BufferC{DataC.data(), range<2>(Size, Size)};
+  BufferC.set_write_back(false);
+
+  // Copy from B to A
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access(CGH);
+    auto AccB = BufferB.get_access(CGH);
+    CGH.copy(AccB, AccA);
+  });
+
+  // Read & write A
+  auto NodeB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        auto AccA = BufferA.get_access(CGH);
+        CGH.parallel_for(range<2>(Size, Size),
+                         [=](item<2> id) { AccA[id] += ModValue; });
+      },
+      NodeA);
+
+  // Read & write B
+  auto NodeModB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        auto AccB = BufferB.get_access(CGH);
+        CGH.parallel_for(range<2>(Size, Size),
+                         [=](item<2> id) { AccB[id] += ModValue; });
+      },
+      NodeA);
+
+  // memcpy from A to B
+  auto NodeC = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        auto AccA = BufferA.get_access(CGH);
+        auto AccB = BufferB.get_access(CGH);
+        CGH.copy(AccA, AccB);
+      },
+      NodeModB);
+
+  // Read and write B
+  auto NodeD = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        auto AccB = BufferB.get_access(CGH);
+        CGH.parallel_for(range<2>(Size, Size),
+                         [=](item<2> id) { AccB[id] += ModValue; });
+      },
+      NodeC);
+
+  // Copy from B to C
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        auto AccB = BufferB.get_access(CGH);
+        auto AccC = BufferC.get_access(CGH);
+        CGH.copy(AccB, AccC);
+      },
+      NodeD);
+
+  auto GraphExec = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+  Queue.wait_and_throw();
+
+  host_accessor HostAccA(BufferA);
+  host_accessor HostAccB(BufferB);
+  host_accessor HostAccC(BufferC);
+
+  for (size_t i = 0; i < Size; i++) {
+    for (size_t j = 0; j < Size; j++) {
+      assert(ReferenceA[i * Size + j] == HostAccA[i][j]);
+      assert(ReferenceB[i * Size + j] == HostAccB[i][j]);
+      assert(ReferenceC[i * Size + j] == HostAccC[i][j]);
+    }
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target.cpp
new file mode 100644
index 0000000000000..8a25673ea6645
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target.cpp
@@ -0,0 +1,40 @@
+// Tests adding buffer copy -- Host to Target (write path) --  nodes
+// and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size);
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 1000);
+
+  std::vector<T> ReferenceA(Size);
+  for (size_t i = 0; i < Size; i++) {
+    ReferenceA[i] = DataB[i];
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  buffer<T, 1> BufferA(DataA.data(), range<1>(Size));
+  BufferA.set_write_back(false);
+
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access(CGH);
+    CGH.copy(DataB.data(), AccA);
+  });
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait();
+
+  host_accessor HostAccA(BufferA);
+
+  for (size_t i = 0; i < Size; i++) {
+    assert(ReferenceA[i] == HostAccA[i]);
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_2d.cpp
new file mode 100644
index 0000000000000..9d581a9dbe76e
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_2d.cpp
@@ -0,0 +1,43 @@
+// Tests adding 2d buffer copy -- Host to Target (write path) --  nodes
+// and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size * Size), DataB(Size * Size);
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA);
+  for (size_t i = 0; i < Size * Size; i++) {
+    ReferenceA[i] = DataB[i];
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  // Make the buffers 2D so we can test the rect write path
+  buffer BufferA{DataA.data(), range<2>(Size, Size)};
+  BufferA.set_write_back(false);
+
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access<access::mode::write>(CGH);
+    CGH.copy(DataB.data(), AccA);
+  });
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait();
+
+  host_accessor HostAccA(BufferA);
+
+  for (size_t i = 0; i < Size; i++) {
+    for (size_t j = 0; j < Size; j++) {
+      assert(ReferenceA[i * Size + j] == HostAccA[i][j]);
+    }
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_offset.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_offset.cpp
new file mode 100644
index 0000000000000..4bde5d8a2fa55
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_offset.cpp
@@ -0,0 +1,44 @@
+// Tests adding buffer copy offset -- Host to Target (write path) --  nodes
+// and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size + Offset), DataB(Size);
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 1000);
+
+  std::vector<T> ReferenceA(Size + Offset);
+  for (size_t i = 0; i < Size + Offset; i++) {
+    if (i < Offset)
+      ReferenceA[i] = DataA[i];
+    else
+      ReferenceA[i] = DataB[i - Offset];
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  buffer<T, 1> BufferA(DataA.data(), range<1>(Size + Offset));
+  BufferA.set_write_back(false);
+
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access<access::mode::write>(CGH, range<1>(Size),
+                                                        id<1>(Offset));
+    CGH.copy(DataB.data(), AccA);
+  });
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait();
+
+  host_accessor HostAccA(BufferA);
+
+  for (size_t i = 0; i < Size + Offset; i++) {
+    assert(ReferenceA[i] == HostAccA[i]);
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_offsets.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_offsets.cpp
new file mode 100644
index 0000000000000..a52aae9220617
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_offsets.cpp
@@ -0,0 +1,56 @@
+// Tests adding buffer copy nodes with offsets
+// and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  size_t OffsetSrc = 2 * size_t(Size / 4);
+  size_t OffsetDst = size_t(Size / 4);
+  std::vector<T> DataA(Size), DataB(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10000);
+
+  // Create reference data for output
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB);
+  for (size_t j = 0; j < Size; j++) {
+    ReferenceA[j] = DataA[j];
+    ReferenceB[j] = DataB[j];
+  }
+  for (size_t j = OffsetDst; j < Size - (OffsetSrc - OffsetDst); j++) {
+    ReferenceB[j] = DataA[(j - OffsetDst) + OffsetSrc];
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  buffer BufferA{DataA};
+  BufferA.set_write_back(false);
+  buffer BufferB{DataB};
+  BufferB.set_write_back(false);
+
+  // Copy from A to B
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access<access::mode::read_write>(
+        CGH, range<1>(Size - OffsetSrc), id<1>(OffsetSrc));
+    auto AccB = BufferB.get_access<access::mode::read_write>(
+        CGH, range<1>(Size - OffsetDst), id<1>(OffsetDst));
+    CGH.copy(AccA, AccB);
+  });
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait();
+
+  host_accessor HostAccA(BufferA);
+  host_accessor HostAccB(BufferB);
+
+  for (size_t i = 0; i < Size; i++) {
+    assert(ReferenceA[i] == HostAccA[i]);
+    assert(ReferenceB[i] == HostAccB[i]);
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host.cpp
new file mode 100644
index 0000000000000..460ecd4ee945f
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host.cpp
@@ -0,0 +1,40 @@
+// Tests adding buffer copy -- Target to Host (read path) --  nodes
+// and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size);
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 1000);
+
+  std::vector<T> ReferenceA(Size), ReferenceB(Size);
+  for (size_t i = 0; i < Size; i++) {
+    ReferenceA[i] = DataA[i];
+    ReferenceB[i] = DataA[i];
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  buffer<T, 1> BufferA(DataA.data(), range<1>(Size));
+  BufferA.set_write_back(false);
+
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access<access::mode::read>(CGH);
+    CGH.copy(AccA, DataB.data());
+  });
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait();
+
+  for (size_t i = 0; i < Size; i++) {
+    assert(ReferenceA[i] == DataA[i]);
+    assert(ReferenceB[i] == DataB[i]);
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_2d.cpp
new file mode 100644
index 0000000000000..69050d2a8a1c6
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_2d.cpp
@@ -0,0 +1,43 @@
+// Tests adding 2d buffer copy -- Target to Host (rect read path) --  nodes
+// and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size * Size), DataB(Size * Size);
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB);
+  for (size_t i = 0; i < Size * Size; i++) {
+    ReferenceA[i] = DataA[i];
+    ReferenceB[i] = DataA[i];
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  // Make the buffers 2D so we can test the rect read path
+  buffer BufferA{DataA.data(), range<2>(Size, Size)};
+  BufferA.set_write_back(false);
+
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access<access::mode::read>(CGH);
+    CGH.copy(AccA, DataB.data());
+  });
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait();
+
+  host_accessor HostAccA(BufferA);
+
+  for (size_t i = 0; i < Size * Size; i++) {
+    assert(ReferenceA[i] == DataA[i]);
+    assert(ReferenceB[i] == DataB[i]);
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_offset.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_offset.cpp
new file mode 100644
index 0000000000000..dcb6d290b1205
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_offset.cpp
@@ -0,0 +1,44 @@
+// Tests adding buffer copy with offset -- Target to Host (read path) --  nodes
+// and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size);
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 1000);
+
+  std::vector<T> ReferenceA(Size), ReferenceB(Size);
+  for (size_t i = 0; i < Size; i++) {
+    ReferenceA[i] = DataA[i];
+    if (i < (Size - Offset))
+      ReferenceB[i] = DataA[i + Offset];
+    else
+      ReferenceB[i] = DataB[i];
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  buffer<T, 1> BufferA(DataA.data(), range<1>(Size));
+  BufferA.set_write_back(false);
+
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access<access::mode::read>(
+        CGH, range<1>(Size - Offset), id<1>(Offset));
+    CGH.copy(AccA, DataB.data());
+  });
+
+  auto GraphExec = Graph.finalize();
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait();
+
+  for (size_t i = 0; i < Size; i++) {
+    assert(ReferenceA[i] == DataA[i]);
+    assert(ReferenceB[i] == DataB[i]);
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp b/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp
new file mode 100644
index 0000000000000..79305c69db52c
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp
@@ -0,0 +1,97 @@
+// Tests that buffer accessors exhibit the correct behaviour when:
+// * A node is added to the graph between two queue submissions which
+//   use the same buffer, but are not added to the graph.
+//
+// * A queue submission using the same buffer is made after finalization
+//   of the graph, but before graph execution.
+//
+// * The graph is submitted for execution twice separated by a queue
+//   submission using the same buffer, this should respect dependencies and
+//   create the correct ordering.
+
+#include "../graph_common.hpp"
+
+int main() {
+
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  std::vector<float> Arr(N, 0.0f);
+
+  buffer<float> Buf{N};
+  Buf.set_write_back(false);
+
+  // Buffer elements set to 0.5
+  Queue.submit([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Acc[i] = 0.5f;
+    });
+  });
+
+  add_node(Graph, Queue, [&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Acc[i] += 0.25f;
+    });
+  });
+
+  for (size_t i = 0; i < N; i++) {
+    assert(Arr[i] == 0.0f);
+  }
+
+  // Buffer elements set to 1.5
+  Queue.submit([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Acc[i] += 1.0f;
+    });
+  });
+
+  auto ExecGraph = Graph.finalize();
+
+  for (size_t i = 0; i < N; i++) {
+    assert(Arr[i] == 0.0f);
+  }
+
+  // Buffer elements set to 3.0
+  Queue.submit([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Acc[i] *= 2.0f;
+    });
+  });
+
+  // Buffer elements set to 3.25
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  // Buffer elements set to 6.5
+  Queue.submit([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Acc[i] *= 2.0f;
+    });
+  });
+
+  // Buffer elements set to 6.75
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  Queue.submit([&](handler &CGH) {
+    auto Acc = Buf.get_access(CGH);
+    CGH.copy(Acc, Arr.data());
+  });
+  Queue.wait();
+
+  for (size_t i = 0; i < N; i++) {
+    assert(Arr[i] == 6.75f);
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/dotp_buffer_reduction.cpp b/sycl/test-e2e/Graph/Inputs/dotp_buffer_reduction.cpp
new file mode 100644
index 0000000000000..9e64a0bdae5d8
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/dotp_buffer_reduction.cpp
@@ -0,0 +1,84 @@
+// Tests creating a dotp operation which uses a sycl reduction with buffers.
+
+#include "../graph_common.hpp"
+
+int main() {
+
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  float DotpData = 0.f;
+
+  const size_t N = 10;
+  std::vector<float> XData(N);
+  std::vector<float> YData(N);
+  std::vector<float> ZData(N);
+
+  {
+    buffer DotpBuf(&DotpData, range<1>(1));
+    DotpBuf.set_write_back(false);
+
+    buffer XBuf(XData);
+    XBuf.set_write_back(false);
+    buffer YBuf(YData);
+    YBuf.set_write_back(false);
+    buffer ZBuf(ZData);
+    ZBuf.set_write_back(false);
+
+    auto NodeI = add_node(Graph, Queue, [&](handler &CGH) {
+      auto X = XBuf.get_access(CGH);
+      auto Y = YBuf.get_access(CGH);
+      auto Z = ZBuf.get_access(CGH);
+      CGH.parallel_for(N, [=](id<1> it) {
+        X[it] = 1.0f;
+        Y[it] = 2.0f;
+        Z[it] = 3.0f;
+      });
+    });
+
+    auto NodeA = add_node(
+        Graph, Queue,
+        [&](handler &CGH) {
+          auto X = XBuf.get_access(CGH);
+          auto Y = YBuf.get_access(CGH);
+          CGH.parallel_for(range<1>{N}, [=](id<1> it) {
+            X[it] = Alpha * X[it] + Beta * Y[it];
+          });
+        },
+        NodeI);
+
+    auto NodeB = add_node(
+        Graph, Queue,
+        [&](handler &CGH) {
+          auto Y = YBuf.get_access(CGH);
+          auto Z = ZBuf.get_access(CGH);
+          CGH.parallel_for(range<1>{N}, [=](id<1> it) {
+            Z[it] = Gamma * Z[it] + Beta * Y[it];
+          });
+        },
+        NodeI);
+
+    auto NodeC = add_node(
+        Graph, Queue,
+        [&](handler &CGH) {
+          auto Dotp = DotpBuf.get_access(CGH);
+          auto X = XBuf.get_access(CGH);
+          auto Z = ZBuf.get_access(CGH);
+          CGH.parallel_for(range<1>{N},
+                           reduction(DotpBuf, CGH, 0.0f, std::plus()),
+                           [=](id<1> it, auto &Sum) { Sum += X[it] * Z[it]; });
+        },
+        NodeA, NodeB);
+
+    auto ExecGraph = Graph.finalize();
+
+    // Using shortcut for executing a graph of commands
+    Queue.ext_oneapi_graph(ExecGraph).wait();
+
+    host_accessor HostAcc(DotpBuf);
+    assert(HostAcc[0] == dotp_reference_result(N));
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/dotp_usm_reduction.cpp b/sycl/test-e2e/Graph/Inputs/dotp_usm_reduction.cpp
new file mode 100644
index 0000000000000..50918b0fe9e8f
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/dotp_usm_reduction.cpp
@@ -0,0 +1,71 @@
+// Tests constructing a graph using the explicit API to perform a dotp
+// operation which uses a sycl reduction with USM memory.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  float *Dotp = malloc_device<float>(1, Queue);
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+  float *Y = malloc_device<float>(N, Queue);
+  float *Z = malloc_device<float>(N, Queue);
+
+  auto NodeI = add_node(Graph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      X[it] = 1.0f;
+      Y[it] = 2.0f;
+      Z[it] = 3.0f;
+    });
+  });
+
+  auto NodeA = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeI);
+        CGH.parallel_for(range<1>{N}, [=](id<1> it) {
+          X[it] = Alpha * X[it] + Beta * Y[it];
+        });
+      },
+      NodeI);
+
+  auto NodeB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeI);
+        CGH.parallel_for(range<1>{N}, [=](id<1> it) {
+          Z[it] = Gamma * Z[it] + Beta * Y[it];
+        });
+      },
+      NodeI);
+
+  auto NodeC = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, {NodeA, NodeB});
+        CGH.parallel_for(range<1>{N}, reduction(Dotp, 0.0f, std::plus()),
+                         [=](id<1> it, auto &Sum) { Sum += X[it] * Z[it]; });
+      },
+      NodeA, NodeB);
+
+  auto ExecGraph = Graph.finalize();
+
+  // Using shortcut for executing a graph of commands
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+
+  float Output;
+  Queue.memcpy(&Output, Dotp, sizeof(float)).wait();
+
+  assert(Output == dotp_reference_result(N));
+
+  sycl::free(Dotp, Queue);
+  sycl::free(X, Queue);
+  sycl::free(Y, Queue);
+  sycl::free(Z, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/double_buffer.cpp b/sycl/test-e2e/Graph/Inputs/double_buffer.cpp
new file mode 100644
index 0000000000000..69ae653d1ab3d
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/double_buffer.cpp
@@ -0,0 +1,102 @@
+// Tests executable graph update by creating a double buffering scenario, where
+// a single graph is repeatedly executed then updated to swap between two sets
+// of buffers.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+  std::vector<T> DataA2(Size), DataB2(Size), DataC2(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::iota(DataA2.begin(), DataA2.end(), 3);
+  std::iota(DataB2.begin(), DataB2.end(), 13);
+  std::iota(DataC2.begin(), DataC2.end(), 1333);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  std::vector<T> ReferenceA2(DataA2), ReferenceB2(DataB2), ReferenceC2(DataC2);
+
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+  calculate_reference_data(Iterations, Size, ReferenceA2, ReferenceB2,
+                           ReferenceC2);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+
+  T *PtrA2 = malloc_device<T>(Size, Queue);
+  T *PtrB2 = malloc_device<T>(Size, Queue);
+  T *PtrC2 = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+
+  Queue.copy(DataA2.data(), PtrA, Size);
+  Queue.copy(DataB2.data(), PtrB, Size);
+  Queue.copy(DataC2.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC);
+
+  auto ExecGraph = Graph.finalize();
+
+  // Create second graph using other buffer set
+  exp_ext::command_graph GraphUpdate{Queue.get_context(), Queue.get_device()};
+  add_nodes(GraphUpdate, Queue, Size, PtrA, PtrB, PtrC);
+
+  event Event;
+  for (unsigned i = 0; i < Iterations; i++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(ExecGraph);
+    });
+    // Update to second set of buffers
+    ExecGraph.update(GraphUpdate);
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(ExecGraph);
+    });
+    // Reset back to original buffers
+    ExecGraph.update(Graph);
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+
+  Queue.copy(PtrA2, DataA2.data(), Size);
+  Queue.copy(PtrB2, DataB2.data(), Size);
+  Queue.copy(PtrC2, DataC2.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  free(PtrA2, Queue);
+  free(PtrB2, Queue);
+  free(PtrC2, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  assert(ReferenceA2 == DataA2);
+  assert(ReferenceB2 == DataB2);
+  assert(ReferenceC2 == DataC2);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/empty_node.cpp b/sycl/test-e2e/Graph/Inputs/empty_node.cpp
new file mode 100644
index 0000000000000..c0b1dd7440fa7
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/empty_node.cpp
@@ -0,0 +1,59 @@
+// Tests the interface for adding empty nodes and creating dependencies on those
+// empty nodes.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  auto MyProperties = property_list{exp_ext::property::graph::no_cycle_check()};
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device(),
+                               MyProperties};
+
+  const size_t N = 10;
+  float *Arr = malloc_device<float>(N, Queue);
+
+  auto Start = add_empty_node(Graph, Queue);
+
+  auto Init = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, Start);
+        CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+          size_t i = idx;
+          Arr[i] = 0;
+        });
+      },
+      Start);
+
+  auto Empty = add_empty_node(Graph, Queue, Init);
+  auto Empty2 = add_empty_node(Graph, Queue, Empty);
+  auto Empty3 = add_node(
+      Graph, Queue, [&](handler &CGH) { depends_on_helper(CGH, Empty2); },
+      Empty2);
+
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, Empty2);
+        CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+          size_t i = idx;
+          Arr[i] = 1;
+        });
+      },
+      Empty2);
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();
+
+  std::vector<float> HostData(N);
+  Queue.memcpy(HostData.data(), Arr, N * sizeof(float)).wait();
+
+  for (int i = 0; i < N; i++)
+    assert(HostData[i] == 1.f);
+
+  sycl::free(Arr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/event_status_querying.cpp b/sycl/test-e2e/Graph/Inputs/event_status_querying.cpp
new file mode 100644
index 0000000000000..04b5820a895b7
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/event_status_querying.cpp
@@ -0,0 +1,140 @@
+// This test checks the querying of the state of an event
+// returned from graph submission
+// with event::get_info<info::event::command_execution_status>()
+// An event should pass from the submitted state to the complete state.
+// The running state seems to not be implemented by the level_zero backend.
+// This test should display (in most execution environment):
+// -----
+// submitted
+// complete
+// -----
+// However, the execution support may be fast enough to complete
+// the computation before we reach the state monitoring query.
+// In this case, the displayed output can be:
+// -----
+// complete
+// complete
+// -----
+// We therefore only check that the complete state of the event
+// in this test.
+
+#include "../graph_common.hpp"
+
+std::string event_status_name(sycl::info::event_command_status status) {
+  switch (status) {
+  case sycl::info::event_command_status::submitted:
+    return "submitted";
+  case sycl::info::event_command_status::running:
+    return "running";
+  case sycl::info::event_command_status::complete:
+    return "complete";
+  default:
+    return "unknown (" + std::to_string(int(status)) + ")";
+  }
+}
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  const T ModValue = 7;
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  // Create reference data for output
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  for (size_t j = 0; j < Size; j++) {
+    ReferenceA[j] = ReferenceB[j];
+    ReferenceA[j] += ModValue;
+    ReferenceB[j] = ReferenceA[j];
+    ReferenceB[j] += ModValue;
+    ReferenceC[j] = ReferenceB[j];
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  buffer BufferA{DataA};
+  BufferA.set_write_back(false);
+  buffer BufferB{DataB};
+  BufferB.set_write_back(false);
+  buffer BufferC{DataC};
+  BufferC.set_write_back(false);
+
+  // Copy from B to A
+  auto Init = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access(CGH);
+    auto AccB = BufferB.get_access(CGH);
+    CGH.copy(AccB, AccA);
+  });
+
+  // Read & write A
+  auto Node1 = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access(CGH);
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+      auto LinID = id.get_linear_id();
+      AccA[LinID] += ModValue;
+    });
+  });
+
+  // Read & write B
+  auto Node2 = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccB = BufferB.get_access(CGH);
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+      auto LinID = id.get_linear_id();
+      AccB[LinID] += ModValue;
+    });
+  });
+
+  // memcpy from A to B
+  auto Node3 = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccA = BufferA.get_access(CGH);
+    auto AccB = BufferB.get_access(CGH);
+    CGH.copy(AccA, AccB);
+  });
+
+  // Read and write B
+  auto Node4 = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccB = BufferB.get_access(CGH);
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+      auto LinID = id.get_linear_id();
+      AccB[LinID] += ModValue;
+    });
+  });
+
+  // Copy from B to C
+  auto Node5 = add_node(Graph, Queue, [&](handler &CGH) {
+    auto AccB = BufferB.get_access(CGH);
+    auto AccC = BufferC.get_access(CGH);
+    CGH.copy(AccB, AccC);
+  });
+
+  auto GraphExec = Graph.finalize();
+
+  sycl::event Event =
+      Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+  auto Info = Event.get_info<info::event::command_execution_status>();
+  std::cout << event_status_name(Info) << std::endl;
+  while (
+      (Info = Event.get_info<sycl::info::event::command_execution_status>()) !=
+      sycl::info::event_command_status::complete) {
+  }
+  std::cout << event_status_name(Info) << std::endl;
+
+  Queue.wait_and_throw();
+
+  host_accessor HostAccA(BufferA);
+  host_accessor HostAccB(BufferB);
+  host_accessor HostAccC(BufferC);
+
+  for (size_t i = 0; i < Size; i++) {
+    assert(ReferenceA[i] == HostAccA[i]);
+    assert(ReferenceB[i] == HostAccB[i]);
+    assert(ReferenceC[i] == HostAccC[i]);
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/executable_graph_update.cpp b/sycl/test-e2e/Graph/Inputs/executable_graph_update.cpp
new file mode 100644
index 0000000000000..afd6106a2d6a7
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/executable_graph_update.cpp
@@ -0,0 +1,101 @@
+// Tests executable graph update by creating two graphs with USM ptrs and
+// attempting to update one from the other.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  auto DataA2 = DataA;
+  auto DataB2 = DataB;
+  auto DataC2 = DataC;
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // Add commands to first graph
+  add_nodes(GraphA, Queue, Size, PtrA, PtrB, PtrC);
+  auto GraphExec = GraphA.finalize();
+
+  exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA2 = malloc_device<T>(Size, Queue);
+  T *PtrB2 = malloc_device<T>(Size, Queue);
+  T *PtrC2 = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA2.data(), PtrA2, Size);
+  Queue.copy(DataB2.data(), PtrB2, Size);
+  Queue.copy(DataC2.data(), PtrC2, Size);
+  Queue.wait_and_throw();
+
+  // Add commands to second graph
+  add_nodes(GraphB, Queue, Size, PtrA2, PtrB2, PtrC2);
+
+  // Execute several Iterations of the graph for 1st set of buffers
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  GraphExec.update(GraphB);
+
+  // Execute several Iterations of the graph for 2nd set of buffers
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+
+  Queue.copy(PtrA2, DataA2.data(), Size);
+  Queue.copy(PtrB2, DataB2.data(), Size);
+  Queue.copy(PtrC2, DataC2.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  free(PtrA2, Queue);
+  free(PtrB2, Queue);
+  free(PtrC2, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  assert(ReferenceA == DataA2);
+  assert(ReferenceB == DataB2);
+  assert(ReferenceC == DataC2);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/executable_graph_update_ordering.cpp b/sycl/test-e2e/Graph/Inputs/executable_graph_update_ordering.cpp
new file mode 100644
index 0000000000000..a358bd0359cb4
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/executable_graph_update_ordering.cpp
@@ -0,0 +1,139 @@
+// Tests executable graph update by introducing a delay in to the update
+// transactions dependencies to check correctness of behaviour.
+
+#include "../graph_common.hpp"
+#include <thread>
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) {
+    return 0;
+  }
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+  std::vector<T> HostTaskOutput(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  auto DataA2 = DataA;
+  auto DataB2 = DataB;
+  auto DataC2 = DataC;
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_shared<T>(Size, Queue);
+  T *PtrB = malloc_shared<T>(Size, Queue);
+  T *PtrC = malloc_shared<T>(Size, Queue);
+  T *PtrOut = malloc_shared<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // Add commands to first graph
+  auto NodeA = add_nodes(GraphA, Queue, Size, PtrA, PtrB, PtrC);
+
+  // host task to induce a wait for dependencies
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        CGH.host_task([=]() {
+          for (size_t i = 0; i < Size; i++) {
+            PtrOut[i] = PtrC[i];
+          }
+          std::this_thread::sleep_for(std::chrono::milliseconds(500));
+        });
+      },
+      NodeA);
+
+  auto GraphExec = GraphA.finalize();
+
+  exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA2 = malloc_shared<T>(Size, Queue);
+  T *PtrB2 = malloc_shared<T>(Size, Queue);
+  T *PtrC2 = malloc_shared<T>(Size, Queue);
+
+  Queue.copy(DataA2.data(), PtrA2, Size);
+  Queue.copy(DataB2.data(), PtrB2, Size);
+  Queue.copy(DataC2.data(), PtrC2, Size);
+  Queue.wait_and_throw();
+
+  // Adds commands to second graph
+  auto NodeB = add_nodes(GraphB, Queue, Size, PtrA2, PtrB2, PtrC2);
+
+  // host task to match the graph topology, but we don't need to sleep this
+  // time because there is no following update.
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        // This should be access::target::host_task but it has not been
+        // implemented yet.
+        CGH.host_task([=]() {
+          for (size_t i = 0; i < Size; i++) {
+            PtrOut[i] = PtrC2[i];
+          }
+        });
+      },
+      NodeB);
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  GraphExec.update(GraphB);
+
+  // Execute several Iterations of the graph for 2nd set of buffers
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.copy(PtrOut, HostTaskOutput.data(), Size);
+
+  Queue.copy(PtrA2, DataA.data(), Size);
+  Queue.copy(PtrB2, DataB.data(), Size);
+  Queue.copy(PtrC2, DataC.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+  free(PtrOut, Queue);
+
+  free(PtrA2, Queue);
+  free(PtrB2, Queue);
+  free(PtrC2, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+  assert(ReferenceC == HostTaskOutput);
+
+  assert(ReferenceA == DataA2);
+  assert(ReferenceB == DataB2);
+  assert(ReferenceC == DataC2);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/host_task.cpp b/sycl/test-e2e/Graph/Inputs/host_task.cpp
new file mode 100644
index 0000000000000..46917ad0ac241
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/host_task.cpp
@@ -0,0 +1,88 @@
+// This test uses a host_task when adding a command_graph node.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) {
+    return 0;
+  }
+
+  const T ModValue = T{7};
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> Reference(DataC);
+  for (unsigned n = 0; n < Iterations; n++) {
+    for (size_t i = 0; i < Size; i++) {
+      Reference[i] += (DataA[i] + DataB[i]) + ModValue + 1;
+    }
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_shared<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // Vector add to output
+  auto NodeA = add_node(Graph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size),
+                     [=](item<1> id) { PtrC[id] += PtrA[id] + PtrB[id]; });
+  });
+
+  // Modify the output values in a host_task
+  auto NodeB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeA);
+        CGH.host_task([=]() {
+          for (size_t i = 0; i < Size; i++) {
+            PtrC[i] += ModValue;
+          }
+        });
+      },
+      NodeA);
+
+  // Modify temp buffer and write to output buffer
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeB);
+        CGH.parallel_for(range<1>(Size), [=](item<1> id) { PtrC[id] += 1; });
+      },
+      NodeB);
+
+  auto GraphExec = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  assert(Reference == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp
new file mode 100644
index 0000000000000..8bdf3caa1415d
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp
@@ -0,0 +1,62 @@
+// This test attempts creating multiple executable graphs from one modifiable
+// graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // event Event = add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC);
+  // Queue.wait_and_throw();
+
+  add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC);
+
+  // Finalize and execute several iterations of the graph
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    auto GraphExec = Graph.finalize();
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp b/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp
new file mode 100644
index 0000000000000..cda06fa461049
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp
@@ -0,0 +1,65 @@
+// Tests queue shortcuts for executing a graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  buffer<T> BufferA{DataA.data(), range<1>{DataA.size()}};
+  buffer<T> BufferB{DataB.data(), range<1>{DataB.size()}};
+  buffer<T> BufferC{DataC.data(), range<1>{DataC.size()}};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC);
+
+  auto GraphExec = Graph.finalize();
+
+  // Execute several iterations of the graph using the different shortcuts
+  event Event = Queue.ext_oneapi_graph(GraphExec);
+
+  assert(Iterations > 2);
+  const size_t LoopIterations = Iterations - 2;
+  std::vector<event> Events(LoopIterations);
+  for (unsigned n = 0; n < LoopIterations; n++) {
+    Events[n] = Queue.ext_oneapi_graph(GraphExec, Event);
+  }
+
+  Queue.ext_oneapi_graph(GraphExec, Events).wait();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/repeated_exec.cpp b/sycl/test-e2e/Graph/Inputs/repeated_exec.cpp
new file mode 100644
index 0000000000000..8a6b1cab23bc3
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/repeated_exec.cpp
@@ -0,0 +1,49 @@
+// Test executing a graph multiple times.
+
+#include "../graph_common.hpp"
+
+int main() {
+
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *Arr = malloc_device<float>(N, Queue);
+  float ZeroPattern = 0.0f;
+  Queue.fill(Arr, ZeroPattern, N).wait();
+
+  add_node(Graph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Arr[i] += 1;
+    });
+  });
+
+  std::vector<float> Output(N);
+  Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait();
+  for (int i = 0; i < N; i++)
+    assert(Output[i] == 0);
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait();
+  for (int i = 0; i < N; i++)
+    assert(Output[i] == 0);
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();
+
+  Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait();
+  for (int i = 0; i < N; i++)
+    assert(Output[i] == 1);
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();
+
+  Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait();
+  for (int i = 0; i < N; i++)
+    assert(Output[i] == 2);
+
+  sycl::free(Arr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/stream.cpp b/sycl/test-e2e/Graph/Inputs/stream.cpp
new file mode 100644
index 0000000000000..fb3f347a71598
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/stream.cpp
@@ -0,0 +1,39 @@
+// This test checks that we can use a stream when explicitly adding a
+// command_graph node.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  size_t WorkItems = 16;
+  std::vector<T> DataIn(WorkItems);
+
+  std::iota(DataIn.begin(), DataIn.end(), 1);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrIn = malloc_device<T>(WorkItems, Queue);
+  Queue.copy(DataIn.data(), PtrIn, WorkItems);
+
+  add_node(Graph, Queue, [&](handler &CGH) {
+    sycl::stream Out(WorkItems * 16, 16, CGH);
+    CGH.parallel_for(range<1>(WorkItems), [=](item<1> id) {
+      Out << "Val: " << PtrIn[id.get_linear_id()] << sycl::endl;
+    });
+  });
+
+  auto GraphExec = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrIn, DataIn.data(), Size);
+
+  free(PtrIn, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph.cpp
new file mode 100644
index 0000000000000..21638b443b368
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/sub_graph.cpp
@@ -0,0 +1,125 @@
+// This test creates a graph, finalizes it, then submits that as a subgraph of
+// another graph, and executes that second graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = short;
+
+  // Values used to modify data inside kernels.
+  const int ModValue = 7;
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size), DataOut(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+  std::iota(DataOut.begin(), DataOut.end(), 1000);
+
+  // Create reference data for output
+  std::vector<T> ReferenceA(DataA);
+  std::vector<T> ReferenceB(DataB);
+  std::vector<T> ReferenceC(DataC);
+  std::vector<T> ReferenceOut(DataOut);
+  for (unsigned n = 0; n < Iterations; n++) {
+    for (size_t i = 0; i < Size; i++) {
+      ReferenceA[i] += ModValue;
+      ReferenceB[i] += ModValue;
+      ReferenceC[i] = (ReferenceA[i] + ReferenceB[i]);
+      ReferenceC[i] -= ModValue;
+      ReferenceOut[i] = ReferenceC[i] + ModValue;
+    }
+  }
+
+  exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+  T *PtrOut = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.copy(DataOut.data(), PtrOut, Size);
+  Queue.wait_and_throw();
+
+  // Add some operations to a graph which will later be submitted as part
+  // of another graph.
+
+  // Vector add two values
+  auto NodeSubA = add_node(SubGraph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size),
+                     [=](item<1> id) { PtrC[id] = PtrA[id] + PtrB[id]; });
+  });
+
+  // Modify the output value with some other value
+  add_node(
+      SubGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeSubA);
+        CGH.parallel_for(range<1>(Size),
+                         [=](item<1> id) { PtrC[id] -= ModValue; });
+      },
+      NodeSubA);
+
+  auto SubGraphExec = SubGraph.finalize();
+
+  exp_ext::command_graph MainGraph{Queue.get_context(), Queue.get_device()};
+
+  // Modify the input values.
+  auto NodeMainA = add_node(MainGraph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+      PtrA[id] += ModValue;
+      PtrB[id] += ModValue;
+    });
+  });
+
+  auto NodeMainB = add_node(
+      MainGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeMainA);
+        CGH.ext_oneapi_graph(SubGraphExec);
+      },
+      NodeMainA);
+
+  // Copy to another output buffer.
+  add_node(
+      MainGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeMainB);
+        CGH.parallel_for(range<1>(Size),
+                         [=](item<1> id) { PtrOut[id] = PtrC[id] + ModValue; });
+      },
+      NodeMainB);
+
+  // Finalize a graph with the additional kernel for writing out to
+  auto MainGraphExec = MainGraph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(MainGraphExec);
+    });
+  }
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.copy(PtrOut, DataOut.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+  free(PtrOut, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+  assert(ReferenceOut == DataOut);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_execute_without_parent.cpp
new file mode 100644
index 0000000000000..5a7dbbabfd9e1
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/sub_graph_execute_without_parent.cpp
@@ -0,0 +1,76 @@
+// Tests creating a parent graph which contains a subgraph while also executing
+// the subgraph by itself.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+  exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+
+  auto S1 = add_node(SubGraph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] *= 3.14f; });
+  });
+
+  add_node(
+      SubGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, S1);
+        CGH.parallel_for(N, [=](id<1> it) { X[it] += 0.5f; });
+      },
+      S1);
+
+  auto ExecSubGraph = SubGraph.finalize();
+
+  auto G1 = add_node(Graph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2.0f; });
+  });
+
+  auto G2 = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, G1);
+        CGH.ext_oneapi_graph(ExecSubGraph);
+      },
+      G1);
+
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, G2);
+        CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] *= -1.0f; });
+      },
+      G2);
+
+  auto ExecGraph = Graph.finalize();
+
+  auto Event1 = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] = 1.f; });
+  });
+
+  auto Event2 = Queue.submit([&](handler &CGH) {
+    CGH.depends_on(Event1);
+    CGH.ext_oneapi_graph(ExecSubGraph);
+  });
+
+  auto Event3 = Queue.submit([&](handler &CGH) {
+    CGH.depends_on(Event2);
+    CGH.ext_oneapi_graph(ExecGraph);
+  });
+
+  std::vector<float> Output(N);
+  Queue.memcpy(Output.data(), X, N * sizeof(float), Event3).wait();
+
+  const float ref = ((1.f * 3.14f + 0.5f) * 2.0f * 3.14f + 0.5f) * -1.f;
+  for (size_t i = 0; i < N; i++) {
+    assert(Output[i] == ref);
+  }
+
+  sycl::free(X, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_multiple_submission.cpp
new file mode 100644
index 0000000000000..81ad495da5268
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/sub_graph_multiple_submission.cpp
@@ -0,0 +1,71 @@
+// Tests creating a parent graph with multiple submissions of the same subgraph
+// in it.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+  exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+
+  auto S1 = add_node(SubGraph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2.0f; });
+  });
+
+  add_node(
+      SubGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, S1);
+        CGH.parallel_for(N, [=](id<1> it) { X[it] += 0.5f; });
+      },
+      S1);
+
+  auto ExecSubGraph = SubGraph.finalize();
+
+  auto P1 = add_node(Graph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] = 1.0f; });
+  });
+
+  auto P2 = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, P1);
+        CGH.ext_oneapi_graph(ExecSubGraph);
+      },
+      P1);
+
+  auto P3 = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, P2);
+        CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] *= -1.0f; });
+      },
+      P2);
+
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, P3);
+        CGH.ext_oneapi_graph(ExecSubGraph);
+      },
+      P3);
+
+  auto ExecGraph = Graph.finalize();
+
+  auto E = Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  std::vector<float> Output(N);
+  Queue.memcpy(Output.data(), X, N * sizeof(float), E).wait();
+
+  for (size_t i = 0; i < N; i++) {
+    assert(Output[i] == -6.25f);
+  }
+
+  sycl::free(X, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_nested.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_nested.cpp
new file mode 100644
index 0000000000000..886d0c9fc866c
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/sub_graph_nested.cpp
@@ -0,0 +1,143 @@
+// This tests nesting sub-graphs two deep inside a parent graph.
+
+#include "../graph_common.hpp"
+
+namespace {
+// Calculates reference result at index i
+float reference(size_t i) {
+  float x = static_cast<float>(i);
+  float y = static_cast<float>(i);
+  float z = static_cast<float>(i);
+
+  x = x * 2.0f + 0.5f;  // XSubSubGraph
+  y = y * 3.0f + 0.14f; // YSubSubGraph
+
+  // SubGraph
+  x = -x;
+  y = -y;
+
+  // Graph
+  z = z * x - y;
+
+  return z;
+}
+} // namespace
+
+int main() {
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+  exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()};
+  exp_ext::command_graph XSubSubGraph{Queue.get_context(), Queue.get_device()};
+  exp_ext::command_graph YSubSubGraph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+  float *Y = malloc_device<float>(N, Queue);
+  float *Z = malloc_device<float>(N, Queue);
+
+  // XSubSubGraph is a multiply-add operation on USM allocation X
+  auto XSS1 = add_node(XSubSubGraph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2.0f; });
+  });
+
+  add_node(
+      XSubSubGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, XSS1);
+        CGH.parallel_for(N, [=](id<1> it) { X[it] += 0.5f; });
+      },
+      XSS1);
+
+  auto XExecSubSubGraph = XSubSubGraph.finalize();
+
+  // YSubSubGraph is a multiply-add operation on USM allocation Y
+  auto YSS1 = add_node(YSubSubGraph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { Y[it] *= 3.0f; });
+  });
+
+  add_node(
+      YSubSubGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, YSS1);
+        CGH.parallel_for(N, [=](id<1> it) { Y[it] += 0.14f; });
+      },
+      YSS1);
+
+  auto YExecSubSubGraph = YSubSubGraph.finalize();
+
+  // SubGraph initializes X & Y inputs, adds both subgraphs, then negates
+  // the results
+  auto S1 = add_node(SubGraph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      X[it] = static_cast<float>(it);
+      Y[it] = static_cast<float>(it);
+    });
+  });
+
+  auto S2 = add_node(
+      SubGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, S1);
+        CGH.ext_oneapi_graph(XExecSubSubGraph);
+      },
+      S1);
+
+  auto S3 = add_node(
+      SubGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, S1);
+        CGH.ext_oneapi_graph(YExecSubSubGraph);
+      },
+      S1);
+
+  add_node(
+      SubGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, {S2, S3});
+        CGH.parallel_for(N, [=](id<1> it) {
+          X[it] = -X[it];
+          Y[it] = -Y[it];
+        });
+      },
+      S2, S3);
+
+  auto ExecSubGraph = SubGraph.finalize();
+
+  // Parent Graph initializes Z allocation, adds the sub-graph,then
+  // does a multiply add with X & Y allocation results.
+  auto G1 = add_node(Graph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> it) { Z[it] = static_cast<float>(it); });
+  });
+
+  auto G2 = add_node(Graph, Queue,
+                     [&](handler &CGH) { CGH.ext_oneapi_graph(ExecSubGraph); });
+
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, {G1, G2});
+        CGH.parallel_for(range<1>{N},
+                         [=](id<1> it) { Z[it] = Z[it] * X[it] - Y[it]; });
+      },
+      G1, G2);
+
+  auto ExecGraph = Graph.finalize();
+
+  auto E = Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  std::vector<float> Output(N);
+  Queue.memcpy(Output.data(), Z, N * sizeof(float), E).wait();
+
+  for (size_t i = 0; i < N; i++) {
+    float ref = reference(i);
+    assert(Output[i] == ref);
+  }
+
+  sycl::free(X, Queue);
+  sycl::free(Y, Queue);
+  sycl::free(Z, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_reduction.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_reduction.cpp
new file mode 100644
index 0000000000000..ccb168e28f805
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/sub_graph_reduction.cpp
@@ -0,0 +1,71 @@
+// Modified version of the dotp example which submits which contains a sycl
+// reduction as well as a sub-graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+  exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()};
+
+  float *Dotp = malloc_device<float>(1, Queue);
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+  float *Y = malloc_device<float>(N, Queue);
+  float *Z = malloc_device<float>(N, Queue);
+
+  auto NodeI = add_node(Graph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      X[it] = 1.0f;
+      Y[it] = 2.0f;
+      Z[it] = 3.0f;
+    });
+  });
+
+  auto NodeA = add_node(SubGraph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> it) { X[it] = Alpha * X[it] + Beta * Y[it]; });
+  });
+
+  auto NodeB = add_node(SubGraph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> it) { Z[it] = Gamma * Z[it] + Beta * Y[it]; });
+  });
+
+  auto SubGraphExec = SubGraph.finalize();
+
+  auto NodeSub = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeI);
+        CGH.ext_oneapi_graph(SubGraphExec);
+      },
+      NodeI);
+
+  auto NodeC = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeSub);
+        CGH.parallel_for(range<1>{N}, reduction(Dotp, 0.0f, std::plus()),
+                         [=](id<1> it, auto &Sum) { Sum += X[it] * Z[it]; });
+      },
+      NodeSub);
+
+  auto ExecGraph = Graph.finalize();
+
+  // Using shortcut for executing a graph of commands
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+
+  float Output;
+  Queue.memcpy(&Output, Dotp, sizeof(float)).wait();
+  assert(Output == dotp_reference_result(N));
+
+  sycl::free(Dotp, Queue);
+  sycl::free(X, Queue);
+  sycl::free(Y, Queue);
+  sycl::free(Z, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_two_parent_graphs.cpp
new file mode 100644
index 0000000000000..5c9732f4cc509
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/sub_graph_two_parent_graphs.cpp
@@ -0,0 +1,102 @@
+// Tests adding an executable graph object as a sub-graph of two different
+// parent graphs.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};
+  exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()};
+  exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+
+  auto S1 = add_node(SubGraph, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2.0f; });
+  });
+
+  add_node(
+      SubGraph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, S1);
+        CGH.parallel_for(N, [=](id<1> it) { X[it] += 0.5f; });
+      },
+      S1);
+
+  auto ExecSubGraph = SubGraph.finalize();
+
+  auto A1 = add_node(GraphA, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] = 1.0f; });
+  });
+
+  auto A2 = add_node(
+      GraphA, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, A1);
+        CGH.ext_oneapi_graph(ExecSubGraph);
+      },
+      A1);
+
+  add_node(
+      GraphA, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, A2);
+        CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] *= -1.0f; });
+      },
+      A2);
+
+  auto ExecGraphA = GraphA.finalize();
+
+  auto B1 = add_node(GraphB, Queue, [&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] = static_cast<float>(i); });
+  });
+
+  auto B2 = add_node(
+      GraphB, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, B1);
+        CGH.ext_oneapi_graph(ExecSubGraph);
+      },
+      B1);
+
+  add_node(
+      GraphB, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, B2);
+        CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] *= X[it]; });
+      },
+      B2);
+
+  auto ExecGraphB = GraphB.finalize();
+
+  auto EventA1 =
+      Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraphA); });
+  std::vector<float> OutputA(N);
+  auto EventA2 = Queue.memcpy(OutputA.data(), X, N * sizeof(float), EventA1);
+
+  auto EventB1 = Queue.submit([&](handler &CGH) {
+    CGH.depends_on(EventA2);
+    CGH.ext_oneapi_graph(ExecGraphB);
+  });
+  std::vector<float> OutputB(N);
+  Queue.memcpy(OutputB.data(), X, N * sizeof(float), EventB1);
+  Queue.wait();
+
+  auto refB = [](size_t i) {
+    float result = static_cast<float>(i);
+    result = result * 2.0f + 0.5f;
+    result *= result;
+    return result;
+  };
+
+  for (size_t i = 0; i < N; i++) {
+    assert(OutputA[i] == -2.5f);
+    assert(OutputB[i] == refB(i));
+  }
+
+  sycl::free(X, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp
new file mode 100644
index 0000000000000..97098dd22e191
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp
@@ -0,0 +1,55 @@
+// This test creates a temporary buffer (which is reinterpreted from the main
+// application buffers) which is used in kernels but destroyed before
+// finalization and execution of the graph. The original buffers lifetime
+// extends until after execution of the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  {
+    exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+    buffer<T> BufferA{DataA.data(), range<1>{DataA.size()}};
+    buffer<T> BufferB{DataB.data(), range<1>{DataB.size()}};
+    buffer<T> BufferC{DataC.data(), range<1>{DataC.size()}};
+
+    {
+      // Create some temporary buffers only for adding nodes
+      auto BufferA2 = BufferA.reinterpret<T, 1>(BufferA.get_range());
+      auto BufferB2 = BufferB.reinterpret<T, 1>(BufferB.get_range());
+      auto BufferC2 = BufferC.reinterpret<T, 1>(BufferC.get_range());
+
+      add_nodes(Graph, Queue, Size, BufferA2, BufferB2, BufferC2);
+    }
+    auto GraphExec = Graph.finalize();
+
+    event Event;
+    for (unsigned n = 0; n < Iterations; n++) {
+      Event = Queue.submit([&](handler &CGH) {
+        CGH.depends_on(Event);
+        CGH.ext_oneapi_graph(GraphExec);
+      });
+    }
+    // Perform a wait on all graph submissions.
+    Queue.wait_and_throw();
+  }
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/usm_copy.cpp b/sycl/test-e2e/Graph/Inputs/usm_copy.cpp
new file mode 100644
index 0000000000000..26e737682e892
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/usm_copy.cpp
@@ -0,0 +1,122 @@
+// Tests adding a usm memcpy node and submitting the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  const T ModValue = 7;
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  // Create reference data for output
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  for (unsigned i = 0; i < Iterations; i++) {
+    for (size_t j = 0; j < Size; j++) {
+      ReferenceA[j] = ReferenceB[j];
+      ReferenceA[j] += ModValue;
+      ReferenceB[j] = ReferenceA[j];
+      ReferenceB[j] += ModValue;
+      ReferenceC[j] = ReferenceB[j];
+    }
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // Copy from B to A
+  auto NodeA =
+      add_node(Graph, Queue, [&](handler &CGH) { CGH.copy(PtrB, PtrA, Size); });
+
+  // Read & write A
+  auto NodeB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeA);
+        CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+          auto LinID = id.get_linear_id();
+          PtrA[LinID] += ModValue;
+        });
+      },
+      NodeA);
+
+  // Read & write B
+  auto NodeModB = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeA);
+        CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+          auto LinID = id.get_linear_id();
+          PtrB[LinID] += ModValue;
+        });
+      },
+      NodeA);
+
+  // memcpy from A to B
+  auto NodeC = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, {NodeB, NodeModB});
+        CGH.memcpy(PtrB, PtrA, Size * sizeof(T));
+      },
+      NodeB, NodeModB);
+
+  // Read and write B
+  auto NodeD = add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeC);
+        CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+          auto LinID = id.get_linear_id();
+          PtrB[LinID] += ModValue;
+        });
+      },
+      NodeC);
+
+  // Copy from B to C
+  add_node(
+      Graph, Queue,
+      [&](handler &CGH) {
+        depends_on_helper(CGH, NodeD);
+        CGH.copy(PtrB, PtrC, Size);
+      },
+      NodeD);
+
+  auto GraphExec = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+
+  Queue.copy(PtrA, DataA.data(), Size, Event);
+  Queue.copy(PtrB, DataB.data(), Size, Event);
+  Queue.copy(PtrC, DataC.data(), Size, Event);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/usm_fill.cpp b/sycl/test-e2e/Graph/Inputs/usm_fill.cpp
new file mode 100644
index 0000000000000..3b173ee245109
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/usm_fill.cpp
@@ -0,0 +1,30 @@
+// Tests adding a USM fill operation as a graph node.
+
+#include "../graph_common.hpp"
+
+int main() {
+
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *Arr = malloc_device<float>(N, Queue);
+
+  float Pattern = 3.14f;
+  auto NodeA =
+      add_node(Graph, Queue, [&](handler &CGH) { CGH.fill(Arr, Pattern, N); });
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();
+
+  std::vector<float> Output(N);
+  Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait();
+  for (int i = 0; i < N; i++)
+    assert(Output[i] == Pattern);
+
+  sycl::free(Arr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/usm_fill_host.cpp b/sycl/test-e2e/Graph/Inputs/usm_fill_host.cpp
new file mode 100644
index 0000000000000..b1408bf0d6106
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/usm_fill_host.cpp
@@ -0,0 +1,31 @@
+// Tests adding a host USM fill operation as a graph node.
+
+#include "../graph_common.hpp"
+
+int main() {
+
+  queue Queue;
+  if (!Queue.get_device().has(sycl::aspect::usm_host_allocations)) {
+    return 0;
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *Arr = malloc_host<float>(N, Queue);
+
+  float Pattern = 3.14f;
+  auto NodeA =
+      add_node(Graph, Queue, [&](handler &CGH) { CGH.fill(Arr, Pattern, N); });
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();
+
+  for (int i = 0; i < N; i++)
+    assert(Arr[i] == Pattern);
+
+  sycl::free(Arr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Inputs/usm_fill_shared.cpp b/sycl/test-e2e/Graph/Inputs/usm_fill_shared.cpp
new file mode 100644
index 0000000000000..cf42e35685ffb
--- /dev/null
+++ b/sycl/test-e2e/Graph/Inputs/usm_fill_shared.cpp
@@ -0,0 +1,32 @@
+// Tests adding a shared USM fill operation as a graph node.
+
+#include "../graph_common.hpp"
+
+int main() {
+
+  queue Queue;
+
+  if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) {
+    return 0;
+  }
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *Arr = malloc_shared<float>(N, Queue);
+
+  float Pattern = 3.14f;
+  auto NodeA =
+      add_node(Graph, Queue, [&](handler &CGH) { CGH.fill(Arr, Pattern, N); });
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();
+
+  for (int i = 0; i < N; i++)
+    assert(Arr[i] == Pattern);
+
+  sycl::free(Arr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
new file mode 100644
index 0000000000000..be0bcef2c8934
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+//
+// Temporarily disabled until failure is addressed.
+// UNSUPPORTED: windows
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/add_nodes_after_finalize.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp
new file mode 100644
index 0000000000000..0add9af73c95a
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp
@@ -0,0 +1,74 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// This test attempts recording a set of kernels after they have already been
+// executed once before.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
+                           ReferenceC);
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  // run commands first
+  event Event = run_kernels_usm(Queue, Size, PtrA, PtrB, PtrC);
+  Queue.wait_and_throw();
+
+  Graph.begin_recording(Queue);
+  run_kernels_usm(Queue, Size, PtrA, PtrB, PtrC);
+  Graph.end_recording();
+
+  auto GraphExec = Graph.finalize();
+
+  // Execute several iterations of the graph (first iteration has already run
+  // before graph recording)
+  for (unsigned n = 1; n < Iterations; n++) {
+    Event = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(Event);
+      CGH.ext_oneapi_graph(GraphExec);
+    });
+  }
+  Queue.wait_and_throw();
+
+  Queue.copy(PtrA, DataA.data(), Size);
+  Queue.copy(PtrB, DataB.data(), Size);
+  Queue.copy(PtrC, DataC.data(), Size);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  assert(ReferenceA == DataA);
+  assert(ReferenceB == DataB);
+  assert(ReferenceC == DataC);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp
new file mode 100644
index 0000000000000..7d0c7c81d780f
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/basic_buffer.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
new file mode 100644
index 0000000000000..65b7a146fbf95
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/basic_usm.cpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
new file mode 100644
index 0000000000000..c3492b6d26722
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/basic_usm_host.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
new file mode 100644
index 0000000000000..b4b7f26ceebbf
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/basic_usm_mixed.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
new file mode 100644
index 0000000000000..c3a140d64eae4
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/basic_usm_shared.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp
new file mode 100644
index 0000000000000..e731b586885ac
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/basic_usm_system.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy.cpp
new file mode 100644
index 0000000000000..77270b1e9bebe
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/buffer_copy.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_2d.cpp
new file mode 100644
index 0000000000000..d00aa10368368
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_2d.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/buffer_copy_2d.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target.cpp
new file mode 100644
index 0000000000000..7364dea5c7779
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/buffer_copy_host2target.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_2d.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_2d.cpp
new file mode 100644
index 0000000000000..b650bc67faeb7
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_2d.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/buffer_copy_host2target_2d.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_offset.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_offset.cpp
new file mode 100644
index 0000000000000..9f2cb1b787902
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_offset.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/buffer_copy_host2target_offset.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_offsets.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_offsets.cpp
new file mode 100644
index 0000000000000..05922690d99f4
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_offsets.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/buffer_copy_offsets.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host.cpp
new file mode 100644
index 0000000000000..1954e2c5bfef8
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/buffer_copy_target2host.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_2d.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_2d.cpp
new file mode 100644
index 0000000000000..2c3eaa28e7ad2
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_2d.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/buffer_copy_target2host_2d.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_offset.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_offset.cpp
new file mode 100644
index 0000000000000..22f8934482d5e
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_offset.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/buffer_copy_target2host_offset.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp
new file mode 100644
index 0000000000000..9910cc82d6e6b
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/buffer_ordering.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp b/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp
new file mode 100644
index 0000000000000..a25b0ff4c5a11
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp
@@ -0,0 +1,33 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests attempting to begin recording to a graph when recording is
+// already in progress on another graph throws an error.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  bool Success = false;
+
+  exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};
+  GraphA.begin_recording(Queue);
+
+  try {
+    exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()};
+    GraphB.begin_recording(Queue);
+  } catch (sycl::exception &E) {
+    Success = E.code() == errc::invalid;
+  }
+
+  GraphA.end_recording();
+
+  assert(Success);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_buffer_reduction.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_buffer_reduction.cpp
new file mode 100644
index 0000000000000..af577686832cd
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_buffer_reduction.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/dotp_buffer_reduction.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp
new file mode 100644
index 0000000000000..756fccdc99611
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp
@@ -0,0 +1,71 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests a dotp operation using device USM and an in-order queue.
+
+#include "../graph_common.hpp"
+
+int main() {
+  property_list properties{property::queue::in_order()};
+  queue Queue{properties};
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  float *Dotp = malloc_device<float>(1, Queue);
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+  float *Y = malloc_device<float>(N, Queue);
+  float *Z = malloc_device<float>(N, Queue);
+
+  Graph.begin_recording(Queue);
+
+  auto InitEvent = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      X[it] = 1.0f;
+      Y[it] = 2.0f;
+      Z[it] = 3.0f;
+    });
+  });
+
+  auto EventA = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> it) { X[it] = Alpha * X[it] + Beta * Y[it]; });
+  });
+
+  auto EventB = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> it) { Z[it] = Gamma * Z[it] + Beta * Y[it]; });
+  });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t j = 0; j < N; j++) {
+        Dotp[0] += X[j] * Z[j];
+      }
+    });
+  });
+
+  Graph.end_recording();
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  float Output;
+  Queue.memcpy(&Output, Dotp, sizeof(float)).wait();
+
+  assert(Output == dotp_reference_result(N));
+
+  sycl::free(Dotp, Queue);
+  sycl::free(X, Queue);
+  sycl::free(Y, Queue);
+  sycl::free(Z, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp
new file mode 100644
index 0000000000000..2e41e5a85f5b9
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp
@@ -0,0 +1,77 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests a dotp operation using device USM and an in-order queue with empty
+// nodes. The second run is to check that there are no leaks reported with the
+// embedded ZE_DEBUG=4 testing capability.
+
+#include "../graph_common.hpp"
+
+int main() {
+  property_list Properties{property::queue::in_order()};
+  queue Queue{Properties};
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  float *Dotp = malloc_device<float>(1, Queue);
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+  float *Y = malloc_device<float>(N, Queue);
+  float *Z = malloc_device<float>(N, Queue);
+
+  Graph.begin_recording(Queue);
+
+  auto InitEvent = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      X[it] = 1.0f;
+      Y[it] = 2.0f;
+      Z[it] = 3.0f;
+    });
+  });
+
+  auto Empty1 = Queue.submit([&](handler &) {});
+
+  auto EventA = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> it) { X[it] = Alpha * X[it] + Beta * Y[it]; });
+  });
+
+  auto EventB = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> it) { Z[it] = Gamma * Z[it] + Beta * Y[it]; });
+  });
+
+  auto Empty2 = Queue.submit([&](handler &) {});
+
+  Queue.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t j = 0; j < N; j++) {
+        Dotp[0] += X[j] * Z[j];
+      }
+    });
+  });
+
+  Graph.end_recording();
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  float Output;
+  Queue.memcpy(&Output, Dotp, sizeof(float)).wait();
+
+  assert(Output == dotp_reference_result(N));
+
+  sycl::free(Dotp, Queue);
+  sycl::free(X, Queue);
+  sycl::free(Y, Queue);
+  sycl::free(Z, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp
new file mode 100644
index 0000000000000..b0d988ca6deda
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp
@@ -0,0 +1,75 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests a dotp operation split between 2 in-order queues using device USM.
+
+#include "../graph_common.hpp"
+
+int main() {
+
+  property_list properties{property::queue::in_order()};
+  queue QueueA{properties};
+  queue QueueB{QueueA.get_context(), QueueA.get_device(), properties};
+
+  exp_ext::command_graph Graph{QueueA.get_context(), QueueA.get_device()};
+
+  float *Dotp = malloc_device<float>(1, QueueA);
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, QueueA);
+  float *Y = malloc_device<float>(N, QueueA);
+  float *Z = malloc_device<float>(N, QueueA);
+
+  Graph.begin_recording(QueueA);
+  Graph.begin_recording(QueueB);
+
+  QueueA.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      X[it] = 1.0f;
+      Y[it] = 2.0f;
+      Z[it] = 3.0f;
+    });
+  });
+
+  auto Event = QueueA.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> it) { X[it] = Alpha * X[it] + Beta * Y[it]; });
+  });
+
+  QueueB.submit([&](handler &CGH) {
+    CGH.depends_on(Event); // needed for cross queue dependency
+    CGH.parallel_for(range<1>{N},
+                     [=](id<1> it) { Z[it] = Gamma * Z[it] + Beta * Y[it]; });
+  });
+
+  QueueB.submit([&](handler &CGH) {
+    CGH.single_task([=]() {
+      for (size_t j = 0; j < N; j++) {
+        Dotp[0] += X[j] * Z[j];
+      }
+    });
+  });
+
+  Graph.end_recording();
+
+  auto ExecGraph = Graph.finalize();
+
+  QueueA.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  float Output;
+  QueueA.memcpy(&Output, Dotp, sizeof(float)).wait();
+
+  assert(Output == dotp_reference_result(N));
+
+  sycl::free(Dotp, QueueA);
+  sycl::free(X, QueueA);
+  sycl::free(Y, QueueA);
+  sycl::free(Z, QueueA);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_usm_reduction.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_usm_reduction.cpp
new file mode 100644
index 0000000000000..dab4b34eec79d
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_usm_reduction.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/dotp_usm_reduction.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/double_buffer.cpp b/sycl/test-e2e/Graph/RecordReplay/double_buffer.cpp
new file mode 100644
index 0000000000000..edfaab45bb417
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/double_buffer.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as executable graph update not yet implemented
+// XFAIL: *
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/double_buffer.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
new file mode 100644
index 0000000000000..967cfbaaf58e8
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/empty_node.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/event_status_querying.cpp b/sycl/test-e2e/Graph/RecordReplay/event_status_querying.cpp
new file mode 100644
index 0000000000000..f1a9ae3e49d1a
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/event_status_querying.cpp
@@ -0,0 +1,9 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out 2>&1 | FileCheck %s
+//
+// CHECK: complete
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/event_status_querying.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_contexts.cpp b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_contexts.cpp
new file mode 100644
index 0000000000000..e1d912129951d
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_contexts.cpp
@@ -0,0 +1,28 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+//
+
+// This test checks that an expection is thrown when we try to
+// record a graph whose context differs from the queue context.
+// We ensure that the exception code matches the expected code.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  context InOrderContext;
+
+  exp_ext::command_graph Graph{InOrderContext, Queue.get_device()};
+
+  std::error_code ExceptionCode = make_error_code(sycl::errc::success);
+  try {
+    Graph.begin_recording(Queue);
+  } catch (exception &Exception) {
+    ExceptionCode = Exception.code();
+  }
+  assert(ExceptionCode == sycl::errc::invalid);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp
new file mode 100644
index 0000000000000..cd8fd46a72b6d
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp
@@ -0,0 +1,46 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run-unfiltered-devices} %t.out
+//
+
+// This test checks that an expection is thrown when we try to
+// record a graph whose device differs from the queue device.
+// We ensure that the exception code matches the expected code.
+
+#include "../graph_common.hpp"
+
+int GetLZeroBackend(const sycl::device &Dev) {
+  // Return 1 if the device backend is "Level_zero" or 0 else.
+  // 0 does not prevent another device to be picked as a second choice
+  return Dev.get_backend() == backend::ext_oneapi_level_zero;
+}
+
+int GetOtherBackend(const sycl::device &Dev) {
+  // Return 1 if the device backend is not "Level_zero" or 0 else.
+  // 0 does not prevent another device to be picked as a second choice
+  return Dev.get_backend() != backend::ext_oneapi_level_zero;
+}
+
+int main() {
+  sycl::device Dev0{GetLZeroBackend};
+  sycl::device Dev1{GetOtherBackend};
+
+  if (Dev0 == Dev1) {
+    // Skip if we don't have two different devices
+    std::cout << "Test skipped: the devices are the same" << std::endl;
+    return 0;
+  }
+
+  queue Queue{Dev1};
+
+  exp_ext::command_graph Graph{Queue.get_context(), Dev0};
+
+  std::error_code ExceptionCode = make_error_code(sycl::errc::success);
+  try {
+    Graph.begin_recording(Queue);
+  } catch (exception &Exception) {
+    ExceptionCode = Exception.code();
+  }
+  assert(ExceptionCode == sycl::errc::invalid);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/executable_graph_update.cpp b/sycl/test-e2e/Graph/RecordReplay/executable_graph_update.cpp
new file mode 100644
index 0000000000000..84272c20dcd1d
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/executable_graph_update.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as executable graph update not implemented yet
+// XFAIL: *
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/executable_graph_update.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/executable_graph_update_ordering.cpp b/sycl/test-e2e/Graph/RecordReplay/executable_graph_update_ordering.cpp
new file mode 100644
index 0000000000000..89d724d106301
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/executable_graph_update_ordering.cpp
@@ -0,0 +1,15 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as executable graph update and host tasks both aren't
+// implemented.
+// XFAIL: *
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/executable_graph_update_ordering"
diff --git a/sycl/test-e2e/Graph/RecordReplay/finalize_while_recording.cpp b/sycl/test-e2e/Graph/RecordReplay/finalize_while_recording.cpp
new file mode 100644
index 0000000000000..906abbd7277b0
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/finalize_while_recording.cpp
@@ -0,0 +1,28 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests the ability to finalize a command graph while it is currently being
+// recorded to.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+  Graph.begin_recording(Queue);
+
+  try {
+    Graph.finalize();
+  } catch (sycl::exception &E) {
+    assert(false && "Exception thrown on finalize.\n");
+  }
+
+  Graph.end_recording();
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp
new file mode 100644
index 0000000000000..ee88c065efccf
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as host tasks are not implemented yet
+// XFAIL: *
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/host_task.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
new file mode 100644
index 0000000000000..8a59f12d316b4
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/multiple_exec_graphs.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp
new file mode 100644
index 0000000000000..4bc6ed9af6976
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/queue_shortcuts.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
new file mode 100644
index 0000000000000..3a702d025b3d3
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/repeated_exec.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/return_values.cpp b/sycl/test-e2e/Graph/RecordReplay/return_values.cpp
new file mode 100644
index 0000000000000..db733d87c7e54
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/return_values.cpp
@@ -0,0 +1,34 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests the return values from queue graph functions which change the
+// internal queue state.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  bool ChangedState = Graph.end_recording();
+  assert(ChangedState == false);
+
+  ChangedState = Graph.begin_recording(Queue);
+  assert(ChangedState == true);
+
+  ChangedState = Graph.begin_recording(Queue);
+  assert(ChangedState == false);
+
+  ChangedState = Graph.end_recording();
+  assert(ChangedState == true);
+
+  ChangedState = Graph.end_recording();
+  assert(ChangedState == false);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/stream.cpp b/sycl/test-e2e/Graph/RecordReplay/stream.cpp
new file mode 100644
index 0000000000000..0cf1a4da36712
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/stream.cpp
@@ -0,0 +1,30 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out %GPU_CHECK_PLACEHOLDER
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out %GPU_CHECK_PLACEHOLDER 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Expected fail as sycl::stream is not implemented yet
+// XFAIL: *
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/stream.cpp"
+
+// CHECK-DAG: Val: 1
+// CHECK-DAG: Val: 2
+// CHECK-DAG: Val: 3
+// CHECK-DAG: Val: 4
+// CHECK-DAG: Val: 5
+// CHECK-DAG: Val: 6
+// CHECK-DAG: Val: 7
+// CHECK-DAG: Val: 8
+// CHECK-DAG: Val: 9
+// CHECK-DAG: Val: 10
+// CHECK-DAG: Val: 11
+// CHECK-DAG: Val: 12
+// CHECK-DAG: Val: 13
+// CHECK-DAG: Val: 14
+// CHECK-DAG: Val: 15
+// CHECK-DAG: Val: 16
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
new file mode 100644
index 0000000000000..62b1d074e4bb9
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/sub_graph.cpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
new file mode 100644
index 0000000000000..4921e51cc98aa
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/sub_graph_execute_without_parent.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp
new file mode 100644
index 0000000000000..0f21e3b6fefd9
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp
@@ -0,0 +1,63 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests adding a sub-graph to an in-order queue.
+
+#include "../graph_common.hpp"
+
+int main() {
+  property_list properties{property::queue::in_order()};
+  queue Queue{properties};
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+  exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+
+  SubGraph.begin_recording(Queue);
+
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2.0f; });
+  });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] += 0.5f; });
+  });
+
+  SubGraph.end_recording(Queue);
+
+  auto ExecSubGraph = SubGraph.finalize();
+
+  Graph.begin_recording(Queue);
+
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) { X[it] = 1.0f; });
+  });
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecSubGraph); });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] += 3.0f; });
+  });
+
+  Graph.end_recording();
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  float Output;
+  Queue.memcpy(&Output, X, sizeof(float)).wait();
+
+  assert(Output == 5.5f);
+
+  sycl::free(X, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
new file mode 100644
index 0000000000000..b8f4bfa3b3b83
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// XFAIL:*
+// Submit a graph as a subgraph more than once doesn't yet work.
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/sub_graph_multiple_submission.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
new file mode 100644
index 0000000000000..6fc2b39efade3
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/sub_graph_nested.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_reduction.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_reduction.cpp
new file mode 100644
index 0000000000000..f3e58b1ef99ff
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_reduction.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/sub_graph_reduction.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
new file mode 100644
index 0000000000000..2d2eb17cd7078
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
@@ -0,0 +1,14 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// XFAIL: *
+// Subgraph doesn't work properly in second parent graph
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/sub_graph_two_parent_graphs.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/temp_buffer.cpp b/sycl/test-e2e/Graph/RecordReplay/temp_buffer.cpp
new file mode 100644
index 0000000000000..d0303a90b9bd8
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/temp_buffer.cpp
@@ -0,0 +1,87 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Fail that needs investigation
+// XFAIL: *
+
+// This test creates a temporary buffer which is used in kernels, but
+// destroyed before finalization and execution of the graph.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  using T = int;
+
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  std::vector<T> ReferenceC(DataC);
+  for (unsigned n = 0; n < Iterations; n++) {
+    for (size_t i = 0; i < Size; i++) {
+      ReferenceC[i] += (DataA[i] + DataB[i]) + 1;
+    }
+  }
+
+  {
+    exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+    buffer<T> BufferA{DataA.data(), range<1>{DataA.size()}};
+    BufferA.set_write_back(false);
+    buffer<T> BufferB{DataB.data(), range<1>{DataB.size()}};
+    BufferB.set_write_back(false);
+    buffer<T> BufferC{DataC.data(), range<1>{DataC.size()}};
+    BufferC.set_write_back(false);
+
+    Graph.begin_recording(Queue);
+
+    // Create a temporary output buffer to use between kernels.
+    {
+      buffer<T> BufferTemp{range<1>{DataA.size()}};
+      BufferTemp.set_write_back(false);
+
+      // Vector add to temporary output buffer
+      Queue.submit([&](handler &CGH) {
+        auto PtrA = BufferA.get_access<access::mode::read>(CGH);
+        auto PtrB = BufferB.get_access<access::mode::read>(CGH);
+        auto PtrOut = BufferTemp.get_access<access::mode::write>(CGH);
+        CGH.parallel_for(range<1>(Size),
+                         [=](item<1> id) { PtrOut[id] = PtrA[id] + PtrB[id]; });
+      });
+
+      // Modify temp buffer and write to output buffer
+      Queue.submit([&](handler &CGH) {
+        auto PtrTemp = BufferTemp.get_access<access::mode::read>(CGH);
+        auto PtrOut = BufferC.get_access<access::mode::write>(CGH);
+        CGH.parallel_for(range<1>(Size),
+                         [=](item<1> id) { PtrOut[id] += PtrTemp[id] + 1; });
+      });
+      Graph.end_recording();
+    }
+    auto GraphExec = Graph.finalize();
+
+    event Event;
+    for (unsigned n = 0; n < Iterations; n++) {
+      Event = Queue.submit([&](handler &CGH) {
+        CGH.depends_on(Event);
+        CGH.ext_oneapi_graph(GraphExec);
+      });
+    }
+    Queue.wait_and_throw();
+
+    host_accessor HostAccC(BufferC);
+    for (size_t i = 0; i < Size; i++) {
+      assert(ReferenceC[i] == HostAccC[i]);
+    }
+  }
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp
new file mode 100644
index 0000000000000..a51bcc967b2ee
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/temp_buffer_reinterpret.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp b/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp
new file mode 100644
index 0000000000000..6cc286bc2c96b
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp
@@ -0,0 +1,53 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests that a command-group function can capture variables by reference
+// and still work correctly as a graph node.
+
+#include "../graph_common.hpp"
+
+const size_t N = 10;
+const float ExpectedValue = 42.0f;
+
+void run_some_kernel(queue Queue, float *Data) {
+  // 'Data' is captured by ref here but will have gone out of scope when the
+  // CGF is later run when the graph is executed.
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> idx) {
+      size_t i = idx;
+      Data[i] = ExpectedValue;
+    });
+  });
+}
+
+int main() {
+
+  queue Queue{default_selector_v};
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  float *Arr = malloc_device<float>(N, Queue);
+
+  Graph.begin_recording(Queue);
+  run_some_kernel(Queue, Arr);
+  Graph.end_recording(Queue);
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait();
+
+  std::vector<float> Output(N);
+  Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(Output[i] == ExpectedValue);
+  }
+
+  sycl::free(Arr, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp
new file mode 100644
index 0000000000000..b24dc65614f1e
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/usm_copy.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_copy_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_copy_in_order.cpp
new file mode 100644
index 0000000000000..c2b7c786232e4
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_copy_in_order.cpp
@@ -0,0 +1,83 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests memcpy operation using device USM and an in-order queue.
+
+#include "../graph_common.hpp"
+
+int main() {
+  property_list properties{property::queue::in_order()};
+  queue Queue{properties};
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  const size_t N = 10;
+  float *X = malloc_device<float>(N, Queue);
+  float *Y = malloc_device<float>(N, Queue);
+  float *Z = malloc_device<float>(N, Queue);
+
+  // Shouldn't be captured in graph as a dependency
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      X[it] = 0.0f;
+      Y[it] = 0.0f;
+      Z[it] = 0.0f;
+    });
+  });
+
+  Graph.begin_recording(Queue);
+
+  auto InitEvent = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      X[it] = 1.0f;
+      Y[it] = 2.0f;
+      Z[it] = 3.0f;
+    });
+  });
+  Graph.end_recording(Queue);
+
+  // Shouldn't be captured in graph as a dependency
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(N, [=](id<1> it) {
+      X[it] += 0.5f;
+      Y[it] += 0.5f;
+      Z[it] += 0.5f;
+    });
+  });
+
+  Graph.begin_recording(Queue);
+  // memcpy 1 values from X to Y
+  Queue.submit([&](handler &CGH) { CGH.memcpy(Y, X, N * sizeof(float)); });
+
+  // Double Y to 2.0
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{N}, [=](id<1> it) { Y[it] *= 2.0f; });
+  });
+
+  // memcpy from 2.0 Y values to Z
+  Queue.submit([&](handler &CGH) { CGH.memcpy(Z, Y, N * sizeof(float)); });
+
+  Graph.end_recording();
+
+  auto ExecGraph = Graph.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
+
+  std::vector<float> Output(N);
+  Queue.memcpy(Output.data(), Z, N * sizeof(float)).wait();
+
+  for (size_t i = 0; i < N; i++) {
+    assert(Output[i] == 2.0f);
+  }
+
+  sycl::free(X, Queue);
+  sycl::free(Y, Queue);
+  sycl::free(Z, Queue);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp
new file mode 100644
index 0000000000000..db225c3efa0e3
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/usm_fill.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp
new file mode 100644
index 0000000000000..ccd130320a528
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/usm_fill_host.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp
new file mode 100644
index 0000000000000..ca6e79542f356
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp
@@ -0,0 +1,11 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+#define GRAPH_E2E_RECORD_REPLAY
+
+#include "../Inputs/usm_fill_shared.cpp"
diff --git a/sycl/test-e2e/Graph/RecordReplay/valid_no_end.cpp b/sycl/test-e2e/Graph/RecordReplay/valid_no_end.cpp
new file mode 100644
index 0000000000000..cd0f5df8946a9
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/valid_no_end.cpp
@@ -0,0 +1,31 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using ZE_DEBUG
+// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
+//
+// CHECK-NOT: LEAK
+
+// Tests obtaining a finalized, executable graph from a graph which is
+// currently being recorded to without end_recording() being called.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+  {
+    queue MyQueue(Queue.get_context(), Queue.get_device());
+    Graph.begin_recording(MyQueue);
+  }
+
+  try {
+    auto GraphExec = Graph.finalize();
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+  } catch (sycl::exception &E) {
+    assert(false && "Exception thrown on finalize or submission.\n");
+  }
+  Queue.wait();
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/device_query.cpp b/sycl/test-e2e/Graph/device_query.cpp
new file mode 100644
index 0000000000000..fe778fadd7519
--- /dev/null
+++ b/sycl/test-e2e/Graph/device_query.cpp
@@ -0,0 +1,23 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests the using device query for graphs support, and that the return value
+// matches expectations.
+
+#include "graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  auto Device = Queue.get_device();
+
+  exp_ext::info::graph_support_level SupportsGraphs =
+      Device.get_info<exp_ext::info::device::graph_support>();
+  auto Backend = Device.get_backend();
+
+  if (Backend == backend::ext_oneapi_level_zero) {
+    assert(SupportsGraphs == exp_ext::info::graph_support_level::native);
+  } else {
+    assert(SupportsGraphs == exp_ext::info::graph_support_level::unsupported);
+  }
+}
diff --git a/sycl/test-e2e/Graph/empty_graph.cpp b/sycl/test-e2e/Graph/empty_graph.cpp
new file mode 100644
index 0000000000000..90eb43986275a
--- /dev/null
+++ b/sycl/test-e2e/Graph/empty_graph.cpp
@@ -0,0 +1,27 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests the ability to finalize and submit a command graph which doesn't
+// contain any nodes.
+
+#include "graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+
+  std::error_code ErrorCode = make_error_code(sycl::errc::success);
+  try {
+    auto GraphExec = Graph.finalize();
+    Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+    Queue.wait_and_throw();
+  } catch (const sycl::exception &e) {
+    ErrorCode = e.code();
+  }
+  assert(ErrorCode == sycl::errc::success);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/finalize_twice.cpp b/sycl/test-e2e/Graph/finalize_twice.cpp
new file mode 100644
index 0000000000000..31c178e8eeae5
--- /dev/null
+++ b/sycl/test-e2e/Graph/finalize_twice.cpp
@@ -0,0 +1,25 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests calling finalize() more than once on the same command_graph.
+
+#include "graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+  auto GraphExec = Graph.finalize();
+
+  std::error_code ErrorCode = make_error_code(sycl::errc::success);
+  try {
+    auto GraphExec2 = Graph.finalize();
+  } catch (const sycl::exception &e) {
+    ErrorCode = e.code();
+  }
+  assert(ErrorCode == sycl::errc::success);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/graph_common.hpp b/sycl/test-e2e/Graph/graph_common.hpp
new file mode 100644
index 0000000000000..c5ccc1ba6d271
--- /dev/null
+++ b/sycl/test-e2e/Graph/graph_common.hpp
@@ -0,0 +1,409 @@
+#include <sycl/sycl.hpp>
+
+#include <sycl/ext/oneapi/experimental/graph.hpp>
+
+#include <numeric>
+
+// Test constants.
+constexpr size_t Size = 1024;      // Number of data elements in a buffer.
+constexpr unsigned Iterations = 5; // Iterations of graph to execute.
+constexpr size_t Offset = 100; // Number of offset elements for Buffer accessors
+
+// Namespace alias to use in test code.
+namespace exp_ext = sycl::ext::oneapi::experimental;
+// Make tests less verbose by using sycl namespace.
+using namespace sycl;
+
+// Helper functions for wrapping depends_on calls when add_node is used so they
+// are not used in the explicit API
+template <typename T> inline void depends_on_helper(sycl::handler &CGH, T Dep) {
+#ifdef GRAPH_E2E_RECORD_REPLAY
+  CGH.depends_on(Dep);
+#endif
+  (void)CGH;
+  (void)Dep;
+}
+
+template <typename T>
+inline void depends_on_helper(sycl::handler &CGH,
+                              std::initializer_list<T> DepList) {
+#ifdef GRAPH_E2E_RECORD_REPLAY
+  CGH.depends_on(DepList);
+#endif
+  (void)CGH;
+  (void)DepList;
+}
+
+// We have 4 versions of the same kernel sequence for testing a combination
+// of graph construction API against memory model. Each submits the same pattern
+/// of 4 kernels with a diamond dependency.
+//
+//                 |    Buffers    |      USM          |
+// ----------------|---------------|-------------------|
+// Record & Replay | run_kernels() | run_kernels_usm() |
+// ----------------|---------------|-------------------|
+// Explicit API    | add_kernels() | add_kernels_usm() |
+
+/// Calculates reference data on the host for a given number of executions
+/// @param[in] Iterations Number of iterations of kernel sequence to run.
+/// @param[in] Size Number of elements in vectors
+/// @param[in,out] ReferenceA First input/output.
+/// @param[in,out] ReferenceB Second input/output.
+/// @param[in,out] ReferenceC Third input/output.
+template <typename T>
+void calculate_reference_data(size_t Iterations, size_t Size,
+                              std::vector<T> &ReferenceA,
+                              std::vector<T> &ReferenceB,
+                              std::vector<T> &ReferenceC) {
+  for (size_t n = 0; n < Iterations; n++) {
+    for (size_t i = 0; i < Size; i++) {
+      ReferenceA[i]++;
+      ReferenceB[i] += ReferenceA[i];
+      ReferenceC[i] -= ReferenceA[i];
+      ReferenceB[i]--;
+      ReferenceC[i]--;
+    }
+  }
+}
+
+/// Test Record and Replay graph construction with buffers.
+///
+/// @param Q Queue to submit nodes to.
+/// @param Size Number of elements in the buffers.
+/// @param BufferA First input/output to use in kernels.
+/// @param BufferB Second input/output to use in kernels.
+/// @param BufferC Third input/output to use in kernels.
+///
+/// @return An event corresponding to the exit node of the submissions sequence.
+template <typename T>
+event run_kernels(queue Q, const size_t Size, buffer<T> BufferA,
+                  buffer<T> BufferB, buffer<T> BufferC) {
+  // Read & write Buffer A.
+  Q.submit([&](handler &CGH) {
+    auto DataA = BufferA.template get_access<access::mode::read_write>(CGH);
+    CGH.parallel_for(range<1>(Size), [=](item<1> Id) { DataA[Id]++; });
+  });
+
+  // Reads Buffer A.
+  // Read & Write Buffer B.
+  Q.submit([&](handler &CGH) {
+    auto DataA = BufferA.template get_access<access::mode::read>(CGH);
+    auto DataB = BufferB.template get_access<access::mode::read_write>(CGH);
+    CGH.parallel_for(range<1>(Size),
+                     [=](item<1> Id) { DataB[Id] += DataA[Id]; });
+  });
+
+  // Reads Buffer A.
+  // Read & writes Buffer C
+  Q.submit([&](handler &CGH) {
+    auto DataA = BufferA.template get_access<access::mode::read>(CGH);
+    auto DataC = BufferC.template get_access<access::mode::read_write>(CGH);
+    CGH.parallel_for(range<1>(Size),
+                     [=](item<1> Id) { DataC[Id] -= DataA[Id]; });
+  });
+
+  // Read & write Buffers B and C.
+  auto ExitEvent = Q.submit([&](handler &CGH) {
+    auto DataB = BufferB.template get_access<access::mode::read_write>(CGH);
+    auto DataC = BufferC.template get_access<access::mode::read_write>(CGH);
+    CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
+      DataB[Id]--;
+      DataC[Id]--;
+    });
+  });
+
+  return ExitEvent;
+}
+
+/// Test Explicit API graph construction with buffers.
+///
+/// @param Graph Modifiable graph to add commands to.
+/// @param Size Number of elements in the buffers.
+/// @param BufferA First input/output to use in kernels.
+/// @param BufferB Second input/output to use in kernels.
+/// @param BufferC Third input/output to use in kernels.
+///
+/// @return Exit node of the submission sequence.
+template <typename T>
+exp_ext::node
+add_kernels(exp_ext::command_graph<exp_ext::graph_state::modifiable> Graph,
+            const size_t Size, buffer<T> BufferA, buffer<T> BufferB,
+            buffer<T> BufferC) {
+  // Read & write Buffer A
+  Graph.add([&](handler &CGH) {
+    auto DataA = BufferA.template get_access<access::mode::read_write>(CGH);
+    CGH.parallel_for(range<1>(Size), [=](item<1> Id) { DataA[Id]++; });
+  });
+
+  // Reads Buffer A
+  // Read & Write Buffer B
+  Graph.add([&](handler &CGH) {
+    auto DataA = BufferA.template get_access<access::mode::read>(CGH);
+    auto DataB = BufferB.template get_access<access::mode::read_write>(CGH);
+    CGH.parallel_for(range<1>(Size),
+                     [=](item<1> Id) { DataB[Id] += DataA[Id]; });
+  });
+
+  // Reads Buffer A
+  // Read & writes Buffer C
+  Graph.add([&](handler &CGH) {
+    auto DataA = BufferA.template get_access<access::mode::read>(CGH);
+    auto DataC = BufferC.template get_access<access::mode::read_write>(CGH);
+    CGH.parallel_for(range<1>(Size),
+                     [=](item<1> Id) { DataC[Id] -= DataA[Id]; });
+  });
+
+  // Read & write Buffers B and C
+  auto ExitNode = Graph.add([&](handler &CGH) {
+    auto DataB = BufferB.template get_access<access::mode::read_write>(CGH);
+    auto DataC = BufferC.template get_access<access::mode::read_write>(CGH);
+    CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
+      DataB[Id]--;
+      DataC[Id]--;
+    });
+  });
+  return ExitNode;
+}
+
+//// Test Explicit API graph construction with USM.
+///
+/// @param Q Command-queue to make kernel submissions to.
+/// @param Size Number of elements in the buffers.
+/// @param DataA Pointer to first USM allocation to use in kernels.
+/// @param DataB Pointer to second USM allocation to use in kernels.
+/// @param DataC Pointer to third USM allocation to use in kernels.
+///
+/// @return Event corresponding to the exit node of the submission sequence.
+template <typename T>
+event run_kernels_usm(queue Q, const size_t Size, T *DataA, T *DataB,
+                      T *DataC) {
+  // Read & write Buffer A
+  auto EventA = Q.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
+      auto LinID = Id.get_linear_id();
+      DataA[LinID]++;
+    });
+  });
+
+  // Reads Buffer A
+  // Read & Write Buffer B
+  auto EventB = Q.submit([&](handler &CGH) {
+    CGH.depends_on(EventA);
+    CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
+      auto LinID = Id.get_linear_id();
+      DataB[LinID] += DataA[LinID];
+    });
+  });
+
+  // Reads Buffer A
+  // Read & writes Buffer C
+  auto EventC = Q.submit([&](handler &CGH) {
+    CGH.depends_on(EventA);
+    CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
+      auto LinID = Id.get_linear_id();
+      DataC[LinID] -= DataA[LinID];
+    });
+  });
+
+  // Read & write Buffers B and C
+  auto ExitEvent = Q.submit([&](handler &CGH) {
+    CGH.depends_on({EventB, EventC});
+    CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
+      auto LinID = Id.get_linear_id();
+      DataB[LinID]--;
+      DataC[LinID]--;
+    });
+  });
+  return ExitEvent;
+}
+
+/// Test Explicit API graph construction with USM.
+///
+/// @param Graph Modifiable graph to add commands to.
+/// @param Size Number of elements in the buffers.
+/// @param DataA Pointer to first USM allocation to use in kernels.
+/// @param DataB Pointer to second USM allocation to use in kernels.
+/// @param DataC Pointer to third USM allocation to use in kernels.
+///
+/// @return Exit node of the submission sequence.
+template <typename T>
+exp_ext::node
+add_kernels_usm(exp_ext::command_graph<exp_ext::graph_state::modifiable> Graph,
+                const size_t Size, T *DataA, T *DataB, T *DataC) {
+  // Read & write Buffer A
+  auto NodeA = Graph.add([&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
+      auto LinID = Id.get_linear_id();
+      DataA[LinID]++;
+    });
+  });
+
+  // Reads Buffer A
+  // Read & Write Buffer B
+  auto NodeB = Graph.add(
+      [&](handler &CGH) {
+        CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
+          auto LinID = Id.get_linear_id();
+          DataB[LinID] += DataA[LinID];
+        });
+      },
+      {exp_ext::property::node::depends_on(NodeA)});
+
+  // Reads Buffer A
+  // Read & writes Buffer C
+  auto NodeC = Graph.add(
+      [&](handler &CGH) {
+        CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
+          auto LinID = Id.get_linear_id();
+          DataC[LinID] -= DataA[LinID];
+        });
+      },
+      {exp_ext::property::node::depends_on(NodeA)});
+
+  // Read & write data B and C
+  auto ExitNode = Graph.add(
+      [&](handler &CGH) {
+        CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
+          auto LinID = Id.get_linear_id();
+          DataB[LinID]--;
+          DataC[LinID]--;
+        });
+      },
+      {exp_ext::property::node::depends_on(NodeB, NodeC)});
+
+  return ExitNode;
+}
+
+/// Adds a common series of nodes to a graph forming a diamond dependency
+/// structure using USM pointers for the inputs. Can be used for either Explicit
+/// or Record and Replay with the choice being dictated by defining one of
+/// GRAPH_E2E_EXPLICIT or GRAPH_E2E_RECORD_REPLAY.
+///
+/// @param Graph Modifiable graph to add commands to.
+/// @param Queue Queue to be used for record and replay.
+/// @param Size Number of elements in the buffers.
+/// @param DataA Pointer to first USM allocation to use in kernels.
+/// @param DataB Pointer to second USM allocation to use in kernels.
+/// @param DataC Pointer to third USM allocation to use in kernels.
+///
+/// @return If using Explicit API this will be the last node added, if Record
+/// and Replay this will be an event corresponding to the last submission.
+template <typename T>
+auto add_nodes(exp_ext::command_graph<exp_ext::graph_state::modifiable> Graph,
+               queue Queue, const size_t Size, T *DataA, T *DataB, T *DataC) {
+#if defined(GRAPH_E2E_EXPLICIT)
+  return add_kernels_usm(Graph, Size, DataA, DataB, DataC);
+#elif defined(GRAPH_E2E_RECORD_REPLAY)
+  Graph.begin_recording(Queue);
+  auto ev = run_kernels_usm(Queue, Size, DataA, DataB, DataC);
+  Graph.end_recording(Queue);
+  return ev;
+#else
+  assert(0 && "Error: Cannot use add_nodes without selecting an API");
+#endif
+}
+
+/// Adds a common series of nodes to a graph forming a diamond dependency
+/// structure using Buffers for the inputs. Can be used for either Explicit
+/// or Record and Replay with the choice being dictated by defining one of
+/// GRAPH_E2E_EXPLICIT or GRAPH_E2E_RECORD_REPLAY.
+///
+/// @param Graph Modifiable graph to add commands to.
+/// @param Queue Queue to be used for record and replay.
+/// @param Size Number of elements in the buffers.
+/// @param BufferA First input/output to use in kernels.
+/// @param BufferB Second input/output to use in kernels.
+/// @param BufferC Third input/output to use in kernels.
+///
+/// @return If using Explicit API this will be the last node added, if Record
+/// and Replay API this will be an event corresponding to the last submission.
+template <typename T>
+auto add_nodes(exp_ext::command_graph<exp_ext::graph_state::modifiable> Graph,
+               queue Queue, const size_t Size, buffer<T> BufferA,
+               buffer<T> BufferB, buffer<T> BufferC) {
+#if defined(GRAPH_E2E_EXPLICIT)
+  return add_kernels(Graph, Size, BufferA, BufferB, BufferC);
+#elif defined(GRAPH_E2E_RECORD_REPLAY)
+  Graph.begin_recording(Queue);
+  auto ev = run_kernels(Queue, Size, BufferA, BufferB, BufferC);
+  Graph.end_recording(Queue);
+  return ev;
+#else
+  assert(0 && "Error: Cannot use add_nodes without selecting an API");
+#endif
+}
+
+/// Adds a single node to the graph in an API agnostic way. Can be used for
+/// either Explicit or Record and Replay with the choice being dictated by
+/// defining one of GRAPH_E2E_EXPLICIT or GRAPH_E2E_RECORD_REPLAY.
+///
+/// @tparam CGFunc Type of the command group function.
+/// @tparam DepT Type of all the dependencies.
+/// @param Graph Modifiable graph to add commands to.
+/// @param Queue Queue to be used for record and replay.
+/// @param CGF The command group function representing the node
+/// @param Deps Parameter pack of dependencies, if they are Nodes we pass them
+/// to explicit API add, otherwise they are ignored.
+/// @return If using the Explicit API this will be the node that was added, if
+/// Record and Replay this will be an event representing the submission.
+template <typename CGFunc, typename... DepT>
+auto add_node(exp_ext::command_graph<exp_ext::graph_state::modifiable> Graph,
+              queue Queue, CGFunc CGF, DepT... Deps) {
+#if defined(GRAPH_E2E_EXPLICIT)
+  if constexpr ((std::is_same_v<exp_ext::node, DepT> && ...)) {
+    return Graph.add(CGF, {exp_ext::property::node::depends_on(Deps...)});
+  } else {
+    return Graph.add(CGF);
+  }
+#elif defined(GRAPH_E2E_RECORD_REPLAY)
+  Graph.begin_recording(Queue);
+  auto ev = Queue.submit(CGF);
+  Graph.end_recording(Queue);
+  return ev;
+#else
+  assert(0 && "Error: Cannot use add_node without selecting an API");
+#endif
+}
+
+/// Adds an empty node to the graph in an API agnostic way. Can be used for
+/// either Explicit or Record and Replay with the choice being dictated by
+/// defining one of GRAPH_E2E_EXPLICIT or GRAPH_E2E_RECORD_REPLAY.
+///
+/// @tparam DepT Type of all the dependencies.
+/// @param Graph Modifiable graph to add commands to.
+/// @param Queue Queue to be used for record and replay.
+/// @param Deps Parameter pack of dependencies, if they are Nodes we pass them
+/// to explicit API add, otherwise they are passed as events to queue::submit()
+/// for the empty node submission.
+/// @return If using the Explicit API this will be the node that was added, if
+/// Record and Replay this will be an event representing the submission.
+template <typename... DepT>
+auto add_empty_node(
+    exp_ext::command_graph<exp_ext::graph_state::modifiable> Graph, queue Queue,
+    DepT... Deps) {
+#if defined(GRAPH_E2E_EXPLICIT)
+  if constexpr ((std::is_same_v<exp_ext::node, DepT> && ...)) {
+    return Graph.add({exp_ext::property::node::depends_on(Deps...)});
+  } else {
+    return Graph.add();
+  }
+#elif defined(GRAPH_E2E_RECORD_REPLAY)
+  Graph.begin_recording(Queue);
+  auto ev = Queue.submit(
+      [&](sycl::handler &CGH) { CGH.depends_on(std::vector<event>{Deps...}); });
+  Graph.end_recording(Queue);
+  return ev;
+#else
+  assert(0 && "Error: Cannot use add_empty_node without selecting an API");
+#endif
+}
+
+// Values for dotp tests
+constexpr float Alpha = 1.0f;
+constexpr float Beta = 2.0f;
+constexpr float Gamma = 3.0f;
+
+// Reference function for dotp
+float dotp_reference_result(size_t N) {
+  return N * (Alpha * 1.0f + Beta * 2.0f) * (Gamma * 3.0f + Beta * 2.0f);
+}
diff --git a/sycl/test-e2e/Graph/invalid_depends_on.cpp b/sycl/test-e2e/Graph/invalid_depends_on.cpp
new file mode 100644
index 0000000000000..cd9ee51303c12
--- /dev/null
+++ b/sycl/test-e2e/Graph/invalid_depends_on.cpp
@@ -0,0 +1,79 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests that calling handler::depends_on() for events not part of the graph
+// throws.
+
+#include "graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+  ext::oneapi::experimental::command_graph Graph2{Queue.get_context(),
+                                                  Queue.get_device()};
+
+  auto NormalEvent = Queue.submit(
+      [&](handler &CGH) { CGH.single_task<class TestKernel1>([=]() {}); });
+
+  Graph2.begin_recording(Queue);
+
+  auto OtherGraphEvent = Queue.submit(
+      [&](handler &CGH) { CGH.single_task<class TestKernel2>([=]() {}); });
+
+  Graph2.end_recording(Queue);
+
+  Graph.begin_recording(Queue);
+
+  // Test that depends_on in explicit and record and replay throws from an event
+  // outside any graph.
+
+  std::error_code ErrorCode = make_error_code(sycl::errc::success);
+  try {
+    auto GraphEvent = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(NormalEvent);
+      CGH.single_task<class TestKernel3>([=]() {});
+    });
+  } catch (const sycl::exception &e) {
+    ErrorCode = e.code();
+  }
+  assert(ErrorCode == sycl::errc::invalid);
+
+  ErrorCode = make_error_code(sycl::errc::success);
+  try {
+    Graph.add([&](handler &CGH) {
+      CGH.depends_on(NormalEvent);
+      CGH.single_task<class TestKernel4>([=]() {});
+    });
+  } catch (const sycl::exception &e) {
+    ErrorCode = e.code();
+  }
+  assert(ErrorCode == sycl::errc::invalid);
+
+  // Test that depends_on throws from an event from another graph.
+  ErrorCode = make_error_code(sycl::errc::success);
+  try {
+    auto GraphEvent = Queue.submit([&](handler &CGH) {
+      CGH.depends_on(OtherGraphEvent);
+      CGH.single_task<class TestKernel5>([=]() {});
+    });
+  } catch (const sycl::exception &e) {
+    ErrorCode = e.code();
+  }
+  assert(ErrorCode == sycl::errc::invalid);
+
+  ErrorCode = make_error_code(sycl::errc::success);
+  try {
+    Graph.add([&](handler &CGH) {
+      CGH.depends_on(OtherGraphEvent);
+      CGH.single_task<class TestKernel6>([=]() {});
+    });
+  } catch (const sycl::exception &e) {
+    ErrorCode = e.code();
+  }
+  assert(ErrorCode == sycl::errc::invalid);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/invalid_event_wait.cpp b/sycl/test-e2e/Graph/invalid_event_wait.cpp
new file mode 100644
index 0000000000000..bb7f4ad6bedda
--- /dev/null
+++ b/sycl/test-e2e/Graph/invalid_event_wait.cpp
@@ -0,0 +1,31 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests that waiting on an event returned from a Record and Replay submission
+// throws.
+
+#include "graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+  Graph.begin_recording(Queue);
+
+  auto GraphEvent = Queue.submit(
+      [&](handler &CGH) { CGH.single_task<class TestKernel>([=]() {}); });
+
+  Graph.end_recording(Queue);
+
+  std::error_code ErrorCode = make_error_code(sycl::errc::success);
+  try {
+    GraphEvent.wait();
+  } catch (const sycl::exception &e) {
+    ErrorCode = e.code();
+  }
+  assert(ErrorCode == sycl::errc::invalid);
+
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/invalid_queue_wait.cpp b/sycl/test-e2e/Graph/invalid_queue_wait.cpp
new file mode 100644
index 0000000000000..8ba8c7d1c2125
--- /dev/null
+++ b/sycl/test-e2e/Graph/invalid_queue_wait.cpp
@@ -0,0 +1,26 @@
+// REQUIRES: level_zero, gpu
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Tests that waiting on a Queue in recording mode throws.
+
+#include "graph_common.hpp"
+
+int main() {
+  queue Queue;
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+  Graph.begin_recording(Queue);
+
+  std::error_code ErrorCode = make_error_code(sycl::errc::success);
+
+  try {
+    Queue.wait();
+  } catch (const sycl::exception &e) {
+    ErrorCode = e.code();
+  }
+  assert(ErrorCode == sycl::errc::invalid);
+
+  return 0;
+}
diff --git a/sycl/test/extensions/macro_graph.cpp b/sycl/test/extensions/macro_graph.cpp
new file mode 100644
index 0000000000000..bca987f3ef243
--- /dev/null
+++ b/sycl/test/extensions/macro_graph.cpp
@@ -0,0 +1,16 @@
+// This test checks presence of macros for available extensions.
+// RUN: %clangxx -fsycl -fsyntax-only %s
+
+#include <sycl/sycl.hpp>
+
+#if SYCL_EXT_ONEAPI_GRAPH == 1
+constexpr bool macro_defined = true;
+#else
+constexpr bool macro_defined = false;
+#endif
+
+int main() {
+  static_assert(macro_defined);
+
+  return 0;
+}
diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp
index 1193439f196cc..590d86b8e0019 100644
--- a/sycl/unittests/Extensions/CommandGraph.cpp
+++ b/sycl/unittests/Extensions/CommandGraph.cpp
@@ -37,6 +37,20 @@ class CommandGraphTest : public ::testing::Test {
   experimental::command_graph<experimental::graph_state::modifiable> Graph;
 };
 
+TEST_F(CommandGraphTest, QueueState) {
+  experimental::queue_state State = Queue.ext_oneapi_get_state();
+  ASSERT_EQ(State, experimental::queue_state::executing);
+
+  experimental::command_graph Graph{Queue.get_context(), Queue.get_device()};
+  Graph.begin_recording(Queue);
+  State = Queue.ext_oneapi_get_state();
+  ASSERT_EQ(State, experimental::queue_state::recording);
+
+  Graph.end_recording();
+  State = Queue.ext_oneapi_get_state();
+  ASSERT_EQ(State, experimental::queue_state::executing);
+}
+
 TEST_F(CommandGraphTest, AddNode) {
   auto GraphImpl = sycl::detail::getSyclObjImpl(Graph);