diff --git a/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp b/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp new file mode 100644 index 0000000000000..2f5376a3a536c --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp @@ -0,0 +1,37 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected Fail as exception not implemented yet +// XFAIL: * + +// Tests attempting to add a node to a command_graph while it is being +// recorded to by a queue is an error. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + bool Success = false; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + Graph.begin_recording(Queue); + + try { + Graph.add([&](handler &CGH) {}); + } catch (sycl::exception &E) { + auto StdErrc = E.code().value(); + if (StdErrc == static_cast(errc::invalid)) { + Success = true; + } + } + + Graph.end_recording(); + assert(Success); + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp new file mode 100644 index 0000000000000..8ad6a413aea03 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/add_nodes_after_finalize.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp b/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp new file mode 100644 index 0000000000000..6191a875bbe41 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/basic_buffer.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp new file mode 100644 index 0000000000000..c7adb7f282da4 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/basic_usm.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp new file mode 100644 index 0000000000000..79e53ff4ba9d9 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/basic_usm_host.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp new file mode 100644 index 0000000000000..fa5a2a1f018e6 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/basic_usm_mixed.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp new file mode 100644 index 0000000000000..1b7447940e1fe --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/basic_usm_shared.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp new file mode 100644 index 0000000000000..26e5473bded66 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/basic_usm_system.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy.cpp new file mode 100644 index 0000000000000..3c291d4d44393 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_copy.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_2d.cpp new file mode 100644 index 0000000000000..446d75316e6e2 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_2d.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_copy_2d.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target.cpp new file mode 100644 index 0000000000000..8c233ec8de66e --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_copy_host2target.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_2d.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_2d.cpp new file mode 100644 index 0000000000000..9c33e885ce8a5 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_2d.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_copy_host2target_2d.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_offset.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_offset.cpp new file mode 100644 index 0000000000000..2c26c24744f0e --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_host2target_offset.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_copy_host2target_offset.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_offsets.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_offsets.cpp new file mode 100644 index 0000000000000..746b41f4e0a76 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_offsets.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_copy_offsets.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host.cpp new file mode 100644 index 0000000000000..e3a9ceb3160a2 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_copy_target2host.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_2d.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_2d.cpp new file mode 100644 index 0000000000000..f9945ebf3ee58 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_2d.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_copy_target2host_2d.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_offset.cpp b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_offset.cpp new file mode 100644 index 0000000000000..c51b9e445137c --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_copy_target2host_offset.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_copy_target2host_offset.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp b/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp new file mode 100644 index 0000000000000..2c2edd374febc --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/buffer_ordering.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/depends_on.cpp b/sycl/test-e2e/Graph/Explicit/depends_on.cpp new file mode 100644 index 0000000000000..6454a0eeeaa0e --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/depends_on.cpp @@ -0,0 +1,49 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests that an event returned from adding a graph node using the queue +// recording API can be passed to `handler::depends_on` inside a node +// added using the explicit API. This should create a graph edge. + +#include "../graph_common.hpp" + +int main() { + + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *Arr = malloc_device(N, Queue); + + Graph.begin_recording(Queue); + // `Event` corresponds to a graph node + event Event = Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { Arr[idx] = 42.0f; }); + }); + Graph.end_recording(Queue); + + Graph.add([&](handler &CGH) { + CGH.depends_on(Event); // creates edge to recorded graph node + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { Arr[idx] *= 2.0f; }); + }); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + constexpr float ref = 42.0f * 2.0f; + std::vector Output(N); + Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == ref); + + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/dotp_buffer_reduction.cpp b/sycl/test-e2e/Graph/Explicit/dotp_buffer_reduction.cpp new file mode 100644 index 0000000000000..da9bdc71ba466 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/dotp_buffer_reduction.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as reduction support is not complete. +// XFAIL: * + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/dotp_buffer_reduction.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/dotp_usm_reduction.cpp b/sycl/test-e2e/Graph/Explicit/dotp_usm_reduction.cpp new file mode 100644 index 0000000000000..c54477e61dac6 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/dotp_usm_reduction.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as reduction support is not complete. +// XFAIL: * + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/dotp_usm_reduction.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/double_buffer.cpp b/sycl/test-e2e/Graph/Explicit/double_buffer.cpp new file mode 100644 index 0000000000000..94e09289577ff --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/double_buffer.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as executable graph update isn't implemented yet +// XFAIL: * + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/double_buffer.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/empty_node.cpp b/sycl/test-e2e/Graph/Explicit/empty_node.cpp new file mode 100644 index 0000000000000..687a25b923d78 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/empty_node.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/empty_node.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp new file mode 100644 index 0000000000000..6948090873181 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp @@ -0,0 +1,89 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Test submitting the same graph twice with another command in between, this +// intermediate command depends on the first submission of the graph, and +// is a dependency of the second submission of the graph. + +#include "../graph_common.hpp" +int main() { + + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *Arr = malloc_shared(N, Queue); + + // Buffer elements set to 0.5 + auto E1 = Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Arr[i] = 0.5f; + }); + }); + + Graph.add([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Arr[i] += 0.25f; + }); + }); + + // Buffer elements set to 1.5 + auto E2 = Queue.submit([&](handler &CGH) { + CGH.depends_on(E1); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Arr[i] += 1.0f; + }); + }); + + auto ExecGraph = Graph.finalize(); + + // Buffer elements set to 3.0 + auto E3 = Queue.submit([&](handler &CGH) { + CGH.depends_on(E2); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Arr[i] *= 2.0f; + }); + }); + + // Buffer elements set to 3.25 + auto E4 = Queue.submit([&](handler &CGH) { + CGH.depends_on(E3); + CGH.ext_oneapi_graph(ExecGraph); + }); + + // Buffer elements set to 6.5 + auto E5 = Queue.submit([&](handler &CGH) { + CGH.depends_on(E4); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Arr[i] *= 2.0f; + }); + }); + + // Buffer elements set to 6.75 + Queue.submit([&](handler &CGH) { + CGH.depends_on(E5); + CGH.ext_oneapi_graph(ExecGraph); + }); + + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + assert(Arr[i] == 6.75f); + } + + // Free the allocated memory + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/event_status_querying.cpp b/sycl/test-e2e/Graph/Explicit/event_status_querying.cpp new file mode 100644 index 0000000000000..4d5831d494aa1 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/event_status_querying.cpp @@ -0,0 +1,9 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out 2>&1 | FileCheck %s +// +// CHECK: complete + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/event_status_querying.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/executable_graph_update.cpp b/sycl/test-e2e/Graph/Explicit/executable_graph_update.cpp new file mode 100644 index 0000000000000..bcb9eeed01071 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/executable_graph_update.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as executable graph update not implemented yet +// XFAIL: * + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/executable_graph_update.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/executable_graph_update_ordering.cpp b/sycl/test-e2e/Graph/Explicit/executable_graph_update_ordering.cpp new file mode 100644 index 0000000000000..dacd28e730139 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/executable_graph_update_ordering.cpp @@ -0,0 +1,15 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as executable graph update and host tasks both aren't +// implemented. +// XFAIL: * + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/executable_graph_update_ordering" diff --git a/sycl/test-e2e/Graph/Explicit/host_task.cpp b/sycl/test-e2e/Graph/Explicit/host_task.cpp new file mode 100644 index 0000000000000..1f6b7d931f194 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/host_task.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as host tasks aren't implemented yet. +// XFAIL: * + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/host_task.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp new file mode 100644 index 0000000000000..a414e3f4b8d6c --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/multiple_exec_graphs.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/node_ordering.cpp b/sycl/test-e2e/Graph/Explicit/node_ordering.cpp new file mode 100644 index 0000000000000..233013ba5cf8f --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/node_ordering.cpp @@ -0,0 +1,51 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests that graph.make_edge() correctly defines the dependency between two +// nodes. + +#include "../graph_common.hpp" + +int main() { + + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *X = malloc_device(N, Queue); + + auto Init = Graph.add([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { X[idx] = 2.0f; }); + }); + + auto Add = Graph.add([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { X[idx] += 2.0f; }); + }); + + auto Mult = Graph.add([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { X[idx] *= 3.0f; }); + }); + + Graph.make_edge(Init, Mult); + Graph.make_edge(Mult, Add); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + std::vector Output(N); + Queue.memcpy(Output.data(), X, N * sizeof(float)).wait(); + + for (int i = 0; i < N; i++) + assert(Output[i] == 8.0f); + + sycl::free(X, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp new file mode 100644 index 0000000000000..68e89093c15b1 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/queue_shortcuts.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp new file mode 100644 index 0000000000000..305831a6abb9b --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/repeated_exec.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/single_node.cpp b/sycl/test-e2e/Graph/Explicit/single_node.cpp new file mode 100644 index 0000000000000..766f6f08de281 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/single_node.cpp @@ -0,0 +1,52 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests adding a node to the graph with explicit API works as expected. + +#include "../graph_common.hpp" + +int main() { + + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *Arr = malloc_device(N, Queue); + + float ZeroPattern = 0.0f; + Queue.fill(Arr, ZeroPattern, N).wait(); + + Graph.add([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Arr[i] = 3.14f; + }); + }); + + std::vector Output(N); + Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == 0); + + auto ExecGraph = Graph.finalize(); + + Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == 0); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == 3.14f); + + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Explicit/stream.cpp b/sycl/test-e2e/Graph/Explicit/stream.cpp new file mode 100644 index 0000000000000..5c3ba8764ff51 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/stream.cpp @@ -0,0 +1,30 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out %GPU_CHECK_PLACEHOLDER +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out %GPU_CHECK_PLACEHOLDER 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as sycl streams aren't implemented yet +// XFAIL: * + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/stream.cpp" + +// CHECK-DAG: Val: 1 +// CHECK-DAG: Val: 2 +// CHECK-DAG: Val: 3 +// CHECK-DAG: Val: 4 +// CHECK-DAG: Val: 5 +// CHECK-DAG: Val: 6 +// CHECK-DAG: Val: 7 +// CHECK-DAG: Val: 8 +// CHECK-DAG: Val: 9 +// CHECK-DAG: Val: 10 +// CHECK-DAG: Val: 11 +// CHECK-DAG: Val: 12 +// CHECK-DAG: Val: 13 +// CHECK-DAG: Val: 14 +// CHECK-DAG: Val: 15 +// CHECK-DAG: Val: 16 diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp new file mode 100644 index 0000000000000..154ea4e3470e3 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/sub_graph.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp new file mode 100644 index 0000000000000..edce73a46ad73 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/sub_graph_execute_without_parent.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp new file mode 100644 index 0000000000000..14e447c04104f --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// XFAIL:* +// Submit a graph as a subgraph more than once doesn't yet work. + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/sub_graph_multiple_submission.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp new file mode 100644 index 0000000000000..fe906bb7aba14 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/sub_graph_nested.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_reduction.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_reduction.cpp new file mode 100644 index 0000000000000..73c1ddc5d520c --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_reduction.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as reduction support is not complete. +// XFAIL: * + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/sub_graph_reduction.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp new file mode 100644 index 0000000000000..cbf768203ec5a --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// XFAIL: * +// Subgraph doesn't work properly in second parent graph + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/sub_graph_two_parent_graphs.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp new file mode 100644 index 0000000000000..d5b3ff7412b61 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/temp_buffer_reinterpret.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/usm_copy.cpp b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp new file mode 100644 index 0000000000000..e0771a3e6d082 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/usm_copy.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp new file mode 100644 index 0000000000000..463cc77b09871 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/usm_fill.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp new file mode 100644 index 0000000000000..bf53345453a4b --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/usm_fill_host.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/usm_fill_host.cpp" diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp new file mode 100644 index 0000000000000..134dda9aaff6c --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/usm_fill_shared.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp new file mode 100644 index 0000000000000..527beb526a177 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/add_nodes_after_finalize.cpp @@ -0,0 +1,90 @@ +// This test adds a new node to an already finalized +// modifiable graph, before finalizing and executing the graph for a second +// time. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = unsigned int; + + std::vector DataA(Size), DataB(Size), DataC(Size), DataOut(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + std::iota(DataOut.begin(), DataOut.end(), 1000); + + std::vector ReferenceC(DataC); + std::vector ReferenceOut(DataOut); + for (unsigned n = 0; n < Iterations * 2; n++) { + for (size_t i = 0; i < Size; i++) { + ReferenceC[i] += (DataA[i] + DataB[i]); + if (n >= Iterations) + ReferenceOut[i] += ReferenceC[i] + 1; + } + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + T *PtrOut = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.copy(DataOut.data(), PtrOut, Size); + Queue.wait_and_throw(); + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(range<1>(Size), + [=](item<1> id) { PtrC[id] += PtrA[id] + PtrB[id]; }); + }); + + auto GraphExec = Graph.finalize(); + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeA); + CGH.parallel_for(range<1>(Size), + [=](item<1> id) { PtrOut[id] += PtrC[id] + 1; }); + }, + NodeA); + + auto GraphExecAdditional = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExecAdditional); + }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrC, DataC.data(), Size); + Queue.copy(PtrOut, DataOut.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + free(PtrOut, Queue); + + assert(ReferenceC == DataC); + assert(ReferenceOut == DataOut); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp b/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp new file mode 100644 index 0000000000000..0d34e3f51a822 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/basic_buffer.cpp @@ -0,0 +1,56 @@ +// Tests adding nodes to a graph and submitting the graph +// using buffers accessors for inputs and outputs. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = unsigned short; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + { + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + buffer BufferA{DataA.data(), range<1>{DataA.size()}}; + BufferA.set_write_back(false); + buffer BufferB{DataB.data(), range<1>{DataB.size()}}; + BufferB.set_write_back(false); + buffer BufferC{DataC.data(), range<1>{DataC.size()}}; + BufferC.set_write_back(false); + + // Add commands to graph + add_nodes(Graph, Queue, Size, BufferA, BufferB, BufferC); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + host_accessor HostAccA(BufferA); + host_accessor HostAccB(BufferB); + host_accessor HostAccC(BufferC); + + for (size_t i = 0; i < Size; i++) { + assert(ReferenceA[i] == HostAccA[i]); + assert(ReferenceB[i] == HostAccB[i]); + assert(ReferenceC[i] == HostAccC[i]); + } + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm.cpp new file mode 100644 index 0000000000000..5a8c9291ff0ef --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/basic_usm.cpp @@ -0,0 +1,61 @@ +// Tests basic adding of nodes with USM pointers, +// and submission of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp new file mode 100644 index 0000000000000..57f9ae372f627 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_host.cpp @@ -0,0 +1,65 @@ +// Tests basic adding of nodes with USM host pointers, +// and submission of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + if (!Queue.get_device().has(sycl::aspect::usm_host_allocations)) { + return 0; + } + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_host(Size, Queue); + T *PtrB = malloc_host(Size, Queue); + T *PtrC = malloc_host(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp new file mode 100644 index 0000000000000..6ee9c7f2030e3 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_mixed.cpp @@ -0,0 +1,68 @@ +// Tests basic adding of nodes with mixed USM pointers, +// and submission of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + if (!Queue.get_device().has(sycl::aspect::usm_host_allocations)) { + return 0; + } + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_shared(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_host(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp new file mode 100644 index 0000000000000..71aa234752bb4 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_shared.cpp @@ -0,0 +1,65 @@ +// Tests basic adding of nodes with USM shared pointers, +// and submission of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_shared(Size, Queue); + T *PtrB = malloc_shared(Size, Queue); + T *PtrC = malloc_shared(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/basic_usm_system.cpp b/sycl/test-e2e/Graph/Inputs/basic_usm_system.cpp new file mode 100644 index 0000000000000..f29c0d3afe69c --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/basic_usm_system.cpp @@ -0,0 +1,65 @@ +// Tests basic adding of nodes with USM system allocations, +// and submission of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + if (!Queue.get_device().has(sycl::aspect::usm_system_allocations)) { + return 0; + } + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = new T[Size]; + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // Add commands to graph + add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + delete[] PtrC; + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp new file mode 100644 index 0000000000000..56623b53b2d36 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy.cpp @@ -0,0 +1,123 @@ +// Tests adding a buffer copy node and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + const T ModValue = 7; + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + for (unsigned i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceA[j] = ReferenceB[j]; + ReferenceA[j] += ModValue; + ReferenceB[j] = ReferenceA[j]; + ReferenceB[j] += ModValue; + ReferenceC[j] = ReferenceB[j]; + } + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + buffer BufferA{DataA}; + BufferA.set_write_back(false); + buffer BufferB{DataB}; + BufferB.set_write_back(false); + buffer BufferC{DataC}; + BufferC.set_write_back(false); + + // Copy from B to A + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccB, AccA); + }); + + // Read & write A + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccA[LinID] += ModValue; + }); + }, + NodeA); + + // Read & write B + auto NodeModB = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }, + NodeA); + + // memcpy from A to B + auto NodeC = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccA, AccB); + }, + NodeB, NodeModB); + + // Read and write B + auto NodeD = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }, + NodeC); + + // Copy from B to C + add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + auto AccC = BufferC.get_access(CGH); + CGH.copy(AccB, AccC); + }, + NodeD); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + host_accessor HostAccA(BufferA); + host_accessor HostAccB(BufferB); + host_accessor HostAccC(BufferC); + + for (size_t i = 0; i < Size; i++) { + assert(ReferenceA[i] == HostAccA[i]); + assert(ReferenceB[i] == HostAccB[i]); + assert(ReferenceC[i] == HostAccC[i]); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp new file mode 100644 index 0000000000000..574cf9c84981c --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_2d.cpp @@ -0,0 +1,121 @@ +// Tests adding buffer 2d copy nodes and submitting +// the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + const T ModValue = 7; + std::vector DataA(Size * Size), DataB(Size * Size), DataC(Size * Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + for (unsigned i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size * Size; j++) { + ReferenceA[j] = ReferenceB[j]; + ReferenceA[j] += ModValue; + ReferenceB[j] = ReferenceA[j]; + ReferenceB[j] += ModValue; + ReferenceC[j] = ReferenceB[j]; + } + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + // Make the buffers 2D so we can test the rect copy path + buffer BufferA{DataA.data(), range<2>(Size, Size)}; + BufferA.set_write_back(false); + buffer BufferB{DataB.data(), range<2>(Size, Size)}; + BufferB.set_write_back(false); + buffer BufferC{DataC.data(), range<2>(Size, Size)}; + BufferC.set_write_back(false); + + // Copy from B to A + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccB, AccA); + }); + + // Read & write A + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccA[id] += ModValue; }); + }, + NodeA); + + // Read & write B + auto NodeModB = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccB[id] += ModValue; }); + }, + NodeA); + + // memcpy from A to B + auto NodeC = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccA, AccB); + }, + NodeModB); + + // Read and write B + auto NodeD = add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<2>(Size, Size), + [=](item<2> id) { AccB[id] += ModValue; }); + }, + NodeC); + + // Copy from B to C + add_node( + Graph, Queue, + [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + auto AccC = BufferC.get_access(CGH); + CGH.copy(AccB, AccC); + }, + NodeD); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + host_accessor HostAccA(BufferA); + host_accessor HostAccB(BufferB); + host_accessor HostAccC(BufferC); + + for (size_t i = 0; i < Size; i++) { + for (size_t j = 0; j < Size; j++) { + assert(ReferenceA[i * Size + j] == HostAccA[i][j]); + assert(ReferenceB[i * Size + j] == HostAccB[i][j]); + assert(ReferenceC[i * Size + j] == HostAccC[i][j]); + } + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target.cpp new file mode 100644 index 0000000000000..8a25673ea6645 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target.cpp @@ -0,0 +1,40 @@ +// Tests adding buffer copy -- Host to Target (write path) -- nodes +// and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size); + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 1000); + + std::vector ReferenceA(Size); + for (size_t i = 0; i < Size; i++) { + ReferenceA[i] = DataB[i]; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + buffer BufferA(DataA.data(), range<1>(Size)); + BufferA.set_write_back(false); + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.copy(DataB.data(), AccA); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + + host_accessor HostAccA(BufferA); + + for (size_t i = 0; i < Size; i++) { + assert(ReferenceA[i] == HostAccA[i]); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_2d.cpp new file mode 100644 index 0000000000000..9d581a9dbe76e --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_2d.cpp @@ -0,0 +1,43 @@ +// Tests adding 2d buffer copy -- Host to Target (write path) -- nodes +// and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size * Size), DataB(Size * Size); + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 1000); + + std::vector ReferenceA(DataA); + for (size_t i = 0; i < Size * Size; i++) { + ReferenceA[i] = DataB[i]; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + // Make the buffers 2D so we can test the rect write path + buffer BufferA{DataA.data(), range<2>(Size, Size)}; + BufferA.set_write_back(false); + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.copy(DataB.data(), AccA); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + + host_accessor HostAccA(BufferA); + + for (size_t i = 0; i < Size; i++) { + for (size_t j = 0; j < Size; j++) { + assert(ReferenceA[i * Size + j] == HostAccA[i][j]); + } + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_offset.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_offset.cpp new file mode 100644 index 0000000000000..4bde5d8a2fa55 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_host2target_offset.cpp @@ -0,0 +1,44 @@ +// Tests adding buffer copy offset -- Host to Target (write path) -- nodes +// and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size + Offset), DataB(Size); + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 1000); + + std::vector ReferenceA(Size + Offset); + for (size_t i = 0; i < Size + Offset; i++) { + if (i < Offset) + ReferenceA[i] = DataA[i]; + else + ReferenceA[i] = DataB[i - Offset]; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + buffer BufferA(DataA.data(), range<1>(Size + Offset)); + BufferA.set_write_back(false); + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH, range<1>(Size), + id<1>(Offset)); + CGH.copy(DataB.data(), AccA); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + + host_accessor HostAccA(BufferA); + + for (size_t i = 0; i < Size + Offset; i++) { + assert(ReferenceA[i] == HostAccA[i]); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_offsets.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_offsets.cpp new file mode 100644 index 0000000000000..a52aae9220617 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_offsets.cpp @@ -0,0 +1,56 @@ +// Tests adding buffer copy nodes with offsets +// and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + size_t OffsetSrc = 2 * size_t(Size / 4); + size_t OffsetDst = size_t(Size / 4); + std::vector DataA(Size), DataB(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB); + for (size_t j = 0; j < Size; j++) { + ReferenceA[j] = DataA[j]; + ReferenceB[j] = DataB[j]; + } + for (size_t j = OffsetDst; j < Size - (OffsetSrc - OffsetDst); j++) { + ReferenceB[j] = DataA[(j - OffsetDst) + OffsetSrc]; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + buffer BufferA{DataA}; + BufferA.set_write_back(false); + buffer BufferB{DataB}; + BufferB.set_write_back(false); + + // Copy from A to B + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access( + CGH, range<1>(Size - OffsetSrc), id<1>(OffsetSrc)); + auto AccB = BufferB.get_access( + CGH, range<1>(Size - OffsetDst), id<1>(OffsetDst)); + CGH.copy(AccA, AccB); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + + host_accessor HostAccA(BufferA); + host_accessor HostAccB(BufferB); + + for (size_t i = 0; i < Size; i++) { + assert(ReferenceA[i] == HostAccA[i]); + assert(ReferenceB[i] == HostAccB[i]); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host.cpp new file mode 100644 index 0000000000000..460ecd4ee945f --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host.cpp @@ -0,0 +1,40 @@ +// Tests adding buffer copy -- Target to Host (read path) -- nodes +// and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size); + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 1000); + + std::vector ReferenceA(Size), ReferenceB(Size); + for (size_t i = 0; i < Size; i++) { + ReferenceA[i] = DataA[i]; + ReferenceB[i] = DataA[i]; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + buffer BufferA(DataA.data(), range<1>(Size)); + BufferA.set_write_back(false); + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.copy(AccA, DataB.data()); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + + for (size_t i = 0; i < Size; i++) { + assert(ReferenceA[i] == DataA[i]); + assert(ReferenceB[i] == DataB[i]); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_2d.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_2d.cpp new file mode 100644 index 0000000000000..69050d2a8a1c6 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_2d.cpp @@ -0,0 +1,43 @@ +// Tests adding 2d buffer copy -- Target to Host (rect read path) -- nodes +// and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size * Size), DataB(Size * Size); + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB); + for (size_t i = 0; i < Size * Size; i++) { + ReferenceA[i] = DataA[i]; + ReferenceB[i] = DataA[i]; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + // Make the buffers 2D so we can test the rect read path + buffer BufferA{DataA.data(), range<2>(Size, Size)}; + BufferA.set_write_back(false); + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.copy(AccA, DataB.data()); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + + host_accessor HostAccA(BufferA); + + for (size_t i = 0; i < Size * Size; i++) { + assert(ReferenceA[i] == DataA[i]); + assert(ReferenceB[i] == DataB[i]); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_offset.cpp b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_offset.cpp new file mode 100644 index 0000000000000..dcb6d290b1205 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_copy_target2host_offset.cpp @@ -0,0 +1,44 @@ +// Tests adding buffer copy with offset -- Target to Host (read path) -- nodes +// and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size); + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 1000); + + std::vector ReferenceA(Size), ReferenceB(Size); + for (size_t i = 0; i < Size; i++) { + ReferenceA[i] = DataA[i]; + if (i < (Size - Offset)) + ReferenceB[i] = DataA[i + Offset]; + else + ReferenceB[i] = DataB[i]; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + buffer BufferA(DataA.data(), range<1>(Size)); + BufferA.set_write_back(false); + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access( + CGH, range<1>(Size - Offset), id<1>(Offset)); + CGH.copy(AccA, DataB.data()); + }); + + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }).wait(); + + for (size_t i = 0; i < Size; i++) { + assert(ReferenceA[i] == DataA[i]); + assert(ReferenceB[i] == DataB[i]); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp b/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp new file mode 100644 index 0000000000000..79305c69db52c --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/buffer_ordering.cpp @@ -0,0 +1,97 @@ +// Tests that buffer accessors exhibit the correct behaviour when: +// * A node is added to the graph between two queue submissions which +// use the same buffer, but are not added to the graph. +// +// * A queue submission using the same buffer is made after finalization +// of the graph, but before graph execution. +// +// * The graph is submitted for execution twice separated by a queue +// submission using the same buffer, this should respect dependencies and +// create the correct ordering. + +#include "../graph_common.hpp" + +int main() { + + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + std::vector Arr(N, 0.0f); + + buffer Buf{N}; + Buf.set_write_back(false); + + // Buffer elements set to 0.5 + Queue.submit([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Acc[i] = 0.5f; + }); + }); + + add_node(Graph, Queue, [&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Acc[i] += 0.25f; + }); + }); + + for (size_t i = 0; i < N; i++) { + assert(Arr[i] == 0.0f); + } + + // Buffer elements set to 1.5 + Queue.submit([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Acc[i] += 1.0f; + }); + }); + + auto ExecGraph = Graph.finalize(); + + for (size_t i = 0; i < N; i++) { + assert(Arr[i] == 0.0f); + } + + // Buffer elements set to 3.0 + Queue.submit([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Acc[i] *= 2.0f; + }); + }); + + // Buffer elements set to 3.25 + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); + + // Buffer elements set to 6.5 + Queue.submit([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Acc[i] *= 2.0f; + }); + }); + + // Buffer elements set to 6.75 + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); + + Queue.submit([&](handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.copy(Acc, Arr.data()); + }); + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + assert(Arr[i] == 6.75f); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/dotp_buffer_reduction.cpp b/sycl/test-e2e/Graph/Inputs/dotp_buffer_reduction.cpp new file mode 100644 index 0000000000000..9e64a0bdae5d8 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/dotp_buffer_reduction.cpp @@ -0,0 +1,84 @@ +// Tests creating a dotp operation which uses a sycl reduction with buffers. + +#include "../graph_common.hpp" + +int main() { + + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + float DotpData = 0.f; + + const size_t N = 10; + std::vector XData(N); + std::vector YData(N); + std::vector ZData(N); + + { + buffer DotpBuf(&DotpData, range<1>(1)); + DotpBuf.set_write_back(false); + + buffer XBuf(XData); + XBuf.set_write_back(false); + buffer YBuf(YData); + YBuf.set_write_back(false); + buffer ZBuf(ZData); + ZBuf.set_write_back(false); + + auto NodeI = add_node(Graph, Queue, [&](handler &CGH) { + auto X = XBuf.get_access(CGH); + auto Y = YBuf.get_access(CGH); + auto Z = ZBuf.get_access(CGH); + CGH.parallel_for(N, [=](id<1> it) { + X[it] = 1.0f; + Y[it] = 2.0f; + Z[it] = 3.0f; + }); + }); + + auto NodeA = add_node( + Graph, Queue, + [&](handler &CGH) { + auto X = XBuf.get_access(CGH); + auto Y = YBuf.get_access(CGH); + CGH.parallel_for(range<1>{N}, [=](id<1> it) { + X[it] = Alpha * X[it] + Beta * Y[it]; + }); + }, + NodeI); + + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + auto Y = YBuf.get_access(CGH); + auto Z = ZBuf.get_access(CGH); + CGH.parallel_for(range<1>{N}, [=](id<1> it) { + Z[it] = Gamma * Z[it] + Beta * Y[it]; + }); + }, + NodeI); + + auto NodeC = add_node( + Graph, Queue, + [&](handler &CGH) { + auto Dotp = DotpBuf.get_access(CGH); + auto X = XBuf.get_access(CGH); + auto Z = ZBuf.get_access(CGH); + CGH.parallel_for(range<1>{N}, + reduction(DotpBuf, CGH, 0.0f, std::plus()), + [=](id<1> it, auto &Sum) { Sum += X[it] * Z[it]; }); + }, + NodeA, NodeB); + + auto ExecGraph = Graph.finalize(); + + // Using shortcut for executing a graph of commands + Queue.ext_oneapi_graph(ExecGraph).wait(); + + host_accessor HostAcc(DotpBuf); + assert(HostAcc[0] == dotp_reference_result(N)); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/dotp_usm_reduction.cpp b/sycl/test-e2e/Graph/Inputs/dotp_usm_reduction.cpp new file mode 100644 index 0000000000000..50918b0fe9e8f --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/dotp_usm_reduction.cpp @@ -0,0 +1,71 @@ +// Tests constructing a graph using the explicit API to perform a dotp +// operation which uses a sycl reduction with USM memory. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + float *Dotp = malloc_device(1, Queue); + + const size_t N = 10; + float *X = malloc_device(N, Queue); + float *Y = malloc_device(N, Queue); + float *Z = malloc_device(N, Queue); + + auto NodeI = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { + X[it] = 1.0f; + Y[it] = 2.0f; + Z[it] = 3.0f; + }); + }); + + auto NodeA = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeI); + CGH.parallel_for(range<1>{N}, [=](id<1> it) { + X[it] = Alpha * X[it] + Beta * Y[it]; + }); + }, + NodeI); + + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeI); + CGH.parallel_for(range<1>{N}, [=](id<1> it) { + Z[it] = Gamma * Z[it] + Beta * Y[it]; + }); + }, + NodeI); + + auto NodeC = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {NodeA, NodeB}); + CGH.parallel_for(range<1>{N}, reduction(Dotp, 0.0f, std::plus()), + [=](id<1> it, auto &Sum) { Sum += X[it] * Z[it]; }); + }, + NodeA, NodeB); + + auto ExecGraph = Graph.finalize(); + + // Using shortcut for executing a graph of commands + Queue.ext_oneapi_graph(ExecGraph).wait(); + + float Output; + Queue.memcpy(&Output, Dotp, sizeof(float)).wait(); + + assert(Output == dotp_reference_result(N)); + + sycl::free(Dotp, Queue); + sycl::free(X, Queue); + sycl::free(Y, Queue); + sycl::free(Z, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/double_buffer.cpp b/sycl/test-e2e/Graph/Inputs/double_buffer.cpp new file mode 100644 index 0000000000000..69ae653d1ab3d --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/double_buffer.cpp @@ -0,0 +1,102 @@ +// Tests executable graph update by creating a double buffering scenario, where +// a single graph is repeatedly executed then updated to swap between two sets +// of buffers. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + std::vector DataA2(Size), DataB2(Size), DataC2(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::iota(DataA2.begin(), DataA2.end(), 3); + std::iota(DataB2.begin(), DataB2.end(), 13); + std::iota(DataC2.begin(), DataC2.end(), 1333); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + std::vector ReferenceA2(DataA2), ReferenceB2(DataB2), ReferenceC2(DataC2); + + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + calculate_reference_data(Iterations, Size, ReferenceA2, ReferenceB2, + ReferenceC2); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + + T *PtrA2 = malloc_device(Size, Queue); + T *PtrB2 = malloc_device(Size, Queue); + T *PtrC2 = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + + Queue.copy(DataA2.data(), PtrA, Size); + Queue.copy(DataB2.data(), PtrB, Size); + Queue.copy(DataC2.data(), PtrC, Size); + Queue.wait_and_throw(); + + add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + + auto ExecGraph = Graph.finalize(); + + // Create second graph using other buffer set + exp_ext::command_graph GraphUpdate{Queue.get_context(), Queue.get_device()}; + add_nodes(GraphUpdate, Queue, Size, PtrA, PtrB, PtrC); + + event Event; + for (unsigned i = 0; i < Iterations; i++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(ExecGraph); + }); + // Update to second set of buffers + ExecGraph.update(GraphUpdate); + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(ExecGraph); + }); + // Reset back to original buffers + ExecGraph.update(Graph); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + + Queue.copy(PtrA2, DataA2.data(), Size); + Queue.copy(PtrB2, DataB2.data(), Size); + Queue.copy(PtrC2, DataC2.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + free(PtrA2, Queue); + free(PtrB2, Queue); + free(PtrC2, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + assert(ReferenceA2 == DataA2); + assert(ReferenceB2 == DataB2); + assert(ReferenceC2 == DataC2); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/empty_node.cpp b/sycl/test-e2e/Graph/Inputs/empty_node.cpp new file mode 100644 index 0000000000000..c0b1dd7440fa7 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/empty_node.cpp @@ -0,0 +1,59 @@ +// Tests the interface for adding empty nodes and creating dependencies on those +// empty nodes. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + auto MyProperties = property_list{exp_ext::property::graph::no_cycle_check()}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device(), + MyProperties}; + + const size_t N = 10; + float *Arr = malloc_device(N, Queue); + + auto Start = add_empty_node(Graph, Queue); + + auto Init = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, Start); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Arr[i] = 0; + }); + }, + Start); + + auto Empty = add_empty_node(Graph, Queue, Init); + auto Empty2 = add_empty_node(Graph, Queue, Empty); + auto Empty3 = add_node( + Graph, Queue, [&](handler &CGH) { depends_on_helper(CGH, Empty2); }, + Empty2); + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, Empty2); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Arr[i] = 1; + }); + }, + Empty2); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + std::vector HostData(N); + Queue.memcpy(HostData.data(), Arr, N * sizeof(float)).wait(); + + for (int i = 0; i < N; i++) + assert(HostData[i] == 1.f); + + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/event_status_querying.cpp b/sycl/test-e2e/Graph/Inputs/event_status_querying.cpp new file mode 100644 index 0000000000000..04b5820a895b7 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/event_status_querying.cpp @@ -0,0 +1,140 @@ +// This test checks the querying of the state of an event +// returned from graph submission +// with event::get_info() +// An event should pass from the submitted state to the complete state. +// The running state seems to not be implemented by the level_zero backend. +// This test should display (in most execution environment): +// ----- +// submitted +// complete +// ----- +// However, the execution support may be fast enough to complete +// the computation before we reach the state monitoring query. +// In this case, the displayed output can be: +// ----- +// complete +// complete +// ----- +// We therefore only check that the complete state of the event +// in this test. + +#include "../graph_common.hpp" + +std::string event_status_name(sycl::info::event_command_status status) { + switch (status) { + case sycl::info::event_command_status::submitted: + return "submitted"; + case sycl::info::event_command_status::running: + return "running"; + case sycl::info::event_command_status::complete: + return "complete"; + default: + return "unknown (" + std::to_string(int(status)) + ")"; + } +} + +int main() { + queue Queue; + + using T = int; + + const T ModValue = 7; + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + for (size_t j = 0; j < Size; j++) { + ReferenceA[j] = ReferenceB[j]; + ReferenceA[j] += ModValue; + ReferenceB[j] = ReferenceA[j]; + ReferenceB[j] += ModValue; + ReferenceC[j] = ReferenceB[j]; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + buffer BufferA{DataA}; + BufferA.set_write_back(false); + buffer BufferB{DataB}; + BufferB.set_write_back(false); + buffer BufferC{DataC}; + BufferC.set_write_back(false); + + // Copy from B to A + auto Init = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccB, AccA); + }); + + // Read & write A + auto Node1 = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccA[LinID] += ModValue; + }); + }); + + // Read & write B + auto Node2 = add_node(Graph, Queue, [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }); + + // memcpy from A to B + auto Node3 = add_node(Graph, Queue, [&](handler &CGH) { + auto AccA = BufferA.get_access(CGH); + auto AccB = BufferB.get_access(CGH); + CGH.copy(AccA, AccB); + }); + + // Read and write B + auto Node4 = add_node(Graph, Queue, [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + AccB[LinID] += ModValue; + }); + }); + + // Copy from B to C + auto Node5 = add_node(Graph, Queue, [&](handler &CGH) { + auto AccB = BufferB.get_access(CGH); + auto AccC = BufferC.get_access(CGH); + CGH.copy(AccB, AccC); + }); + + auto GraphExec = Graph.finalize(); + + sycl::event Event = + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + auto Info = Event.get_info(); + std::cout << event_status_name(Info) << std::endl; + while ( + (Info = Event.get_info()) != + sycl::info::event_command_status::complete) { + } + std::cout << event_status_name(Info) << std::endl; + + Queue.wait_and_throw(); + + host_accessor HostAccA(BufferA); + host_accessor HostAccB(BufferB); + host_accessor HostAccC(BufferC); + + for (size_t i = 0; i < Size; i++) { + assert(ReferenceA[i] == HostAccA[i]); + assert(ReferenceB[i] == HostAccB[i]); + assert(ReferenceC[i] == HostAccC[i]); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/executable_graph_update.cpp b/sycl/test-e2e/Graph/Inputs/executable_graph_update.cpp new file mode 100644 index 0000000000000..afd6106a2d6a7 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/executable_graph_update.cpp @@ -0,0 +1,101 @@ +// Tests executable graph update by creating two graphs with USM ptrs and +// attempting to update one from the other. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + auto DataA2 = DataA; + auto DataB2 = DataB; + auto DataC2 = DataC; + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // Add commands to first graph + add_nodes(GraphA, Queue, Size, PtrA, PtrB, PtrC); + auto GraphExec = GraphA.finalize(); + + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + + T *PtrA2 = malloc_device(Size, Queue); + T *PtrB2 = malloc_device(Size, Queue); + T *PtrC2 = malloc_device(Size, Queue); + + Queue.copy(DataA2.data(), PtrA2, Size); + Queue.copy(DataB2.data(), PtrB2, Size); + Queue.copy(DataC2.data(), PtrC2, Size); + Queue.wait_and_throw(); + + // Add commands to second graph + add_nodes(GraphB, Queue, Size, PtrA2, PtrB2, PtrC2); + + // Execute several Iterations of the graph for 1st set of buffers + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + GraphExec.update(GraphB); + + // Execute several Iterations of the graph for 2nd set of buffers + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + + Queue.copy(PtrA2, DataA2.data(), Size); + Queue.copy(PtrB2, DataB2.data(), Size); + Queue.copy(PtrC2, DataC2.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + free(PtrA2, Queue); + free(PtrB2, Queue); + free(PtrC2, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + assert(ReferenceA == DataA2); + assert(ReferenceB == DataB2); + assert(ReferenceC == DataC2); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/executable_graph_update_ordering.cpp b/sycl/test-e2e/Graph/Inputs/executable_graph_update_ordering.cpp new file mode 100644 index 0000000000000..a358bd0359cb4 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/executable_graph_update_ordering.cpp @@ -0,0 +1,139 @@ +// Tests executable graph update by introducing a delay in to the update +// transactions dependencies to check correctness of behaviour. + +#include "../graph_common.hpp" +#include + +int main() { + queue Queue; + + using T = int; + + if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + + std::vector DataA(Size), DataB(Size), DataC(Size); + std::vector HostTaskOutput(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + auto DataA2 = DataA; + auto DataB2 = DataB; + auto DataC2 = DataC; + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_shared(Size, Queue); + T *PtrB = malloc_shared(Size, Queue); + T *PtrC = malloc_shared(Size, Queue); + T *PtrOut = malloc_shared(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // Add commands to first graph + auto NodeA = add_nodes(GraphA, Queue, Size, PtrA, PtrB, PtrC); + + // host task to induce a wait for dependencies + add_node( + Graph, Queue, + [&](handler &CGH) { + CGH.host_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrOut[i] = PtrC[i]; + } + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + }); + }, + NodeA); + + auto GraphExec = GraphA.finalize(); + + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + + T *PtrA2 = malloc_shared(Size, Queue); + T *PtrB2 = malloc_shared(Size, Queue); + T *PtrC2 = malloc_shared(Size, Queue); + + Queue.copy(DataA2.data(), PtrA2, Size); + Queue.copy(DataB2.data(), PtrB2, Size); + Queue.copy(DataC2.data(), PtrC2, Size); + Queue.wait_and_throw(); + + // Adds commands to second graph + auto NodeB = add_nodes(GraphB, Queue, Size, PtrA2, PtrB2, PtrC2); + + // host task to match the graph topology, but we don't need to sleep this + // time because there is no following update. + add_node( + Graph, Queue, + [&](handler &CGH) { + // This should be access::target::host_task but it has not been + // implemented yet. + CGH.host_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrOut[i] = PtrC2[i]; + } + }); + }, + NodeB); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + GraphExec.update(GraphB); + + // Execute several Iterations of the graph for 2nd set of buffers + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.copy(PtrOut, HostTaskOutput.data(), Size); + + Queue.copy(PtrA2, DataA.data(), Size); + Queue.copy(PtrB2, DataB.data(), Size); + Queue.copy(PtrC2, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + free(PtrOut, Queue); + + free(PtrA2, Queue); + free(PtrB2, Queue); + free(PtrC2, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + assert(ReferenceC == HostTaskOutput); + + assert(ReferenceA == DataA2); + assert(ReferenceB == DataB2); + assert(ReferenceC == DataC2); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/host_task.cpp b/sycl/test-e2e/Graph/Inputs/host_task.cpp new file mode 100644 index 0000000000000..46917ad0ac241 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/host_task.cpp @@ -0,0 +1,88 @@ +// This test uses a host_task when adding a command_graph node. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + + const T ModValue = T{7}; + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector Reference(DataC); + for (unsigned n = 0; n < Iterations; n++) { + for (size_t i = 0; i < Size; i++) { + Reference[i] += (DataA[i] + DataB[i]) + ModValue + 1; + } + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_shared(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // Vector add to output + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(range<1>(Size), + [=](item<1> id) { PtrC[id] += PtrA[id] + PtrB[id]; }); + }); + + // Modify the output values in a host_task + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeA); + CGH.host_task([=]() { + for (size_t i = 0; i < Size; i++) { + PtrC[i] += ModValue; + } + }); + }, + NodeA); + + // Modify temp buffer and write to output buffer + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeB); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { PtrC[id] += 1; }); + }, + NodeB); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + assert(Reference == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp new file mode 100644 index 0000000000000..8bdf3caa1415d --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/multiple_exec_graphs.cpp @@ -0,0 +1,62 @@ +// This test attempts creating multiple executable graphs from one modifiable +// graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // event Event = add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + // Queue.wait_and_throw(); + + add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + + // Finalize and execute several iterations of the graph + event Event; + for (unsigned n = 0; n < Iterations; n++) { + auto GraphExec = Graph.finalize(); + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp b/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp new file mode 100644 index 0000000000000..cda06fa461049 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/queue_shortcuts.cpp @@ -0,0 +1,65 @@ +// Tests queue shortcuts for executing a graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + buffer BufferA{DataA.data(), range<1>{DataA.size()}}; + buffer BufferB{DataB.data(), range<1>{DataB.size()}}; + buffer BufferC{DataC.data(), range<1>{DataC.size()}}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + add_nodes(Graph, Queue, Size, PtrA, PtrB, PtrC); + + auto GraphExec = Graph.finalize(); + + // Execute several iterations of the graph using the different shortcuts + event Event = Queue.ext_oneapi_graph(GraphExec); + + assert(Iterations > 2); + const size_t LoopIterations = Iterations - 2; + std::vector Events(LoopIterations); + for (unsigned n = 0; n < LoopIterations; n++) { + Events[n] = Queue.ext_oneapi_graph(GraphExec, Event); + } + + Queue.ext_oneapi_graph(GraphExec, Events).wait(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/repeated_exec.cpp b/sycl/test-e2e/Graph/Inputs/repeated_exec.cpp new file mode 100644 index 0000000000000..8a6b1cab23bc3 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/repeated_exec.cpp @@ -0,0 +1,49 @@ +// Test executing a graph multiple times. + +#include "../graph_common.hpp" + +int main() { + + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *Arr = malloc_device(N, Queue); + float ZeroPattern = 0.0f; + Queue.fill(Arr, ZeroPattern, N).wait(); + + add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Arr[i] += 1; + }); + }); + + std::vector Output(N); + Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == 0); + + auto ExecGraph = Graph.finalize(); + + Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == 0); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == 1); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == 2); + + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/stream.cpp b/sycl/test-e2e/Graph/Inputs/stream.cpp new file mode 100644 index 0000000000000..fb3f347a71598 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/stream.cpp @@ -0,0 +1,39 @@ +// This test checks that we can use a stream when explicitly adding a +// command_graph node. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + size_t WorkItems = 16; + std::vector DataIn(WorkItems); + + std::iota(DataIn.begin(), DataIn.end(), 1); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrIn = malloc_device(WorkItems, Queue); + Queue.copy(DataIn.data(), PtrIn, WorkItems); + + add_node(Graph, Queue, [&](handler &CGH) { + sycl::stream Out(WorkItems * 16, 16, CGH); + CGH.parallel_for(range<1>(WorkItems), [=](item<1> id) { + Out << "Val: " << PtrIn[id.get_linear_id()] << sycl::endl; + }); + }); + + auto GraphExec = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + + Queue.wait_and_throw(); + + Queue.copy(PtrIn, DataIn.data(), Size); + + free(PtrIn, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph.cpp new file mode 100644 index 0000000000000..21638b443b368 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/sub_graph.cpp @@ -0,0 +1,125 @@ +// This test creates a graph, finalizes it, then submits that as a subgraph of +// another graph, and executes that second graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = short; + + // Values used to modify data inside kernels. + const int ModValue = 7; + std::vector DataA(Size), DataB(Size), DataC(Size), DataOut(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + std::iota(DataOut.begin(), DataOut.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA); + std::vector ReferenceB(DataB); + std::vector ReferenceC(DataC); + std::vector ReferenceOut(DataOut); + for (unsigned n = 0; n < Iterations; n++) { + for (size_t i = 0; i < Size; i++) { + ReferenceA[i] += ModValue; + ReferenceB[i] += ModValue; + ReferenceC[i] = (ReferenceA[i] + ReferenceB[i]); + ReferenceC[i] -= ModValue; + ReferenceOut[i] = ReferenceC[i] + ModValue; + } + } + + exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + T *PtrOut = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.copy(DataOut.data(), PtrOut, Size); + Queue.wait_and_throw(); + + // Add some operations to a graph which will later be submitted as part + // of another graph. + + // Vector add two values + auto NodeSubA = add_node(SubGraph, Queue, [&](handler &CGH) { + CGH.parallel_for(range<1>(Size), + [=](item<1> id) { PtrC[id] = PtrA[id] + PtrB[id]; }); + }); + + // Modify the output value with some other value + add_node( + SubGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeSubA); + CGH.parallel_for(range<1>(Size), + [=](item<1> id) { PtrC[id] -= ModValue; }); + }, + NodeSubA); + + auto SubGraphExec = SubGraph.finalize(); + + exp_ext::command_graph MainGraph{Queue.get_context(), Queue.get_device()}; + + // Modify the input values. + auto NodeMainA = add_node(MainGraph, Queue, [&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + PtrA[id] += ModValue; + PtrB[id] += ModValue; + }); + }); + + auto NodeMainB = add_node( + MainGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeMainA); + CGH.ext_oneapi_graph(SubGraphExec); + }, + NodeMainA); + + // Copy to another output buffer. + add_node( + MainGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeMainB); + CGH.parallel_for(range<1>(Size), + [=](item<1> id) { PtrOut[id] = PtrC[id] + ModValue; }); + }, + NodeMainB); + + // Finalize a graph with the additional kernel for writing out to + auto MainGraphExec = MainGraph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(MainGraphExec); + }); + } + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.copy(PtrOut, DataOut.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + free(PtrOut, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + assert(ReferenceOut == DataOut); + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_execute_without_parent.cpp new file mode 100644 index 0000000000000..5a7dbbabfd9e1 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/sub_graph_execute_without_parent.cpp @@ -0,0 +1,76 @@ +// Tests creating a parent graph which contains a subgraph while also executing +// the subgraph by itself. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *X = malloc_device(N, Queue); + + auto S1 = add_node(SubGraph, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] *= 3.14f; }); + }); + + add_node( + SubGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, S1); + CGH.parallel_for(N, [=](id<1> it) { X[it] += 0.5f; }); + }, + S1); + + auto ExecSubGraph = SubGraph.finalize(); + + auto G1 = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2.0f; }); + }); + + auto G2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, G1); + CGH.ext_oneapi_graph(ExecSubGraph); + }, + G1); + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, G2); + CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] *= -1.0f; }); + }, + G2); + + auto ExecGraph = Graph.finalize(); + + auto Event1 = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] = 1.f; }); + }); + + auto Event2 = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event1); + CGH.ext_oneapi_graph(ExecSubGraph); + }); + + auto Event3 = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event2); + CGH.ext_oneapi_graph(ExecGraph); + }); + + std::vector Output(N); + Queue.memcpy(Output.data(), X, N * sizeof(float), Event3).wait(); + + const float ref = ((1.f * 3.14f + 0.5f) * 2.0f * 3.14f + 0.5f) * -1.f; + for (size_t i = 0; i < N; i++) { + assert(Output[i] == ref); + } + + sycl::free(X, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_multiple_submission.cpp new file mode 100644 index 0000000000000..81ad495da5268 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/sub_graph_multiple_submission.cpp @@ -0,0 +1,71 @@ +// Tests creating a parent graph with multiple submissions of the same subgraph +// in it. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *X = malloc_device(N, Queue); + + auto S1 = add_node(SubGraph, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2.0f; }); + }); + + add_node( + SubGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, S1); + CGH.parallel_for(N, [=](id<1> it) { X[it] += 0.5f; }); + }, + S1); + + auto ExecSubGraph = SubGraph.finalize(); + + auto P1 = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] = 1.0f; }); + }); + + auto P2 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, P1); + CGH.ext_oneapi_graph(ExecSubGraph); + }, + P1); + + auto P3 = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, P2); + CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] *= -1.0f; }); + }, + P2); + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, P3); + CGH.ext_oneapi_graph(ExecSubGraph); + }, + P3); + + auto ExecGraph = Graph.finalize(); + + auto E = Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); + + std::vector Output(N); + Queue.memcpy(Output.data(), X, N * sizeof(float), E).wait(); + + for (size_t i = 0; i < N; i++) { + assert(Output[i] == -6.25f); + } + + sycl::free(X, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_nested.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_nested.cpp new file mode 100644 index 0000000000000..886d0c9fc866c --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/sub_graph_nested.cpp @@ -0,0 +1,143 @@ +// This tests nesting sub-graphs two deep inside a parent graph. + +#include "../graph_common.hpp" + +namespace { +// Calculates reference result at index i +float reference(size_t i) { + float x = static_cast(i); + float y = static_cast(i); + float z = static_cast(i); + + x = x * 2.0f + 0.5f; // XSubSubGraph + y = y * 3.0f + 0.14f; // YSubSubGraph + + // SubGraph + x = -x; + y = -y; + + // Graph + z = z * x - y; + + return z; +} +} // namespace + +int main() { + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph XSubSubGraph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph YSubSubGraph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *X = malloc_device(N, Queue); + float *Y = malloc_device(N, Queue); + float *Z = malloc_device(N, Queue); + + // XSubSubGraph is a multiply-add operation on USM allocation X + auto XSS1 = add_node(XSubSubGraph, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2.0f; }); + }); + + add_node( + XSubSubGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, XSS1); + CGH.parallel_for(N, [=](id<1> it) { X[it] += 0.5f; }); + }, + XSS1); + + auto XExecSubSubGraph = XSubSubGraph.finalize(); + + // YSubSubGraph is a multiply-add operation on USM allocation Y + auto YSS1 = add_node(YSubSubGraph, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { Y[it] *= 3.0f; }); + }); + + add_node( + YSubSubGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, YSS1); + CGH.parallel_for(N, [=](id<1> it) { Y[it] += 0.14f; }); + }, + YSS1); + + auto YExecSubSubGraph = YSubSubGraph.finalize(); + + // SubGraph initializes X & Y inputs, adds both subgraphs, then negates + // the results + auto S1 = add_node(SubGraph, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { + X[it] = static_cast(it); + Y[it] = static_cast(it); + }); + }); + + auto S2 = add_node( + SubGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, S1); + CGH.ext_oneapi_graph(XExecSubSubGraph); + }, + S1); + + auto S3 = add_node( + SubGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, S1); + CGH.ext_oneapi_graph(YExecSubSubGraph); + }, + S1); + + add_node( + SubGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {S2, S3}); + CGH.parallel_for(N, [=](id<1> it) { + X[it] = -X[it]; + Y[it] = -Y[it]; + }); + }, + S2, S3); + + auto ExecSubGraph = SubGraph.finalize(); + + // Parent Graph initializes Z allocation, adds the sub-graph,then + // does a multiply add with X & Y allocation results. + auto G1 = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> it) { Z[it] = static_cast(it); }); + }); + + auto G2 = add_node(Graph, Queue, + [&](handler &CGH) { CGH.ext_oneapi_graph(ExecSubGraph); }); + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {G1, G2}); + CGH.parallel_for(range<1>{N}, + [=](id<1> it) { Z[it] = Z[it] * X[it] - Y[it]; }); + }, + G1, G2); + + auto ExecGraph = Graph.finalize(); + + auto E = Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); + + std::vector Output(N); + Queue.memcpy(Output.data(), Z, N * sizeof(float), E).wait(); + + for (size_t i = 0; i < N; i++) { + float ref = reference(i); + assert(Output[i] == ref); + } + + sycl::free(X, Queue); + sycl::free(Y, Queue); + sycl::free(Z, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_reduction.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_reduction.cpp new file mode 100644 index 0000000000000..ccb168e28f805 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/sub_graph_reduction.cpp @@ -0,0 +1,71 @@ +// Modified version of the dotp example which submits which contains a sycl +// reduction as well as a sub-graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()}; + + float *Dotp = malloc_device(1, Queue); + + const size_t N = 10; + float *X = malloc_device(N, Queue); + float *Y = malloc_device(N, Queue); + float *Z = malloc_device(N, Queue); + + auto NodeI = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { + X[it] = 1.0f; + Y[it] = 2.0f; + Z[it] = 3.0f; + }); + }); + + auto NodeA = add_node(SubGraph, Queue, [&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> it) { X[it] = Alpha * X[it] + Beta * Y[it]; }); + }); + + auto NodeB = add_node(SubGraph, Queue, [&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> it) { Z[it] = Gamma * Z[it] + Beta * Y[it]; }); + }); + + auto SubGraphExec = SubGraph.finalize(); + + auto NodeSub = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeI); + CGH.ext_oneapi_graph(SubGraphExec); + }, + NodeI); + + auto NodeC = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeSub); + CGH.parallel_for(range<1>{N}, reduction(Dotp, 0.0f, std::plus()), + [=](id<1> it, auto &Sum) { Sum += X[it] * Z[it]; }); + }, + NodeSub); + + auto ExecGraph = Graph.finalize(); + + // Using shortcut for executing a graph of commands + Queue.ext_oneapi_graph(ExecGraph).wait(); + + float Output; + Queue.memcpy(&Output, Dotp, sizeof(float)).wait(); + assert(Output == dotp_reference_result(N)); + + sycl::free(Dotp, Queue); + sycl::free(X, Queue); + sycl::free(Y, Queue); + sycl::free(Z, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/Inputs/sub_graph_two_parent_graphs.cpp new file mode 100644 index 0000000000000..5c9732f4cc509 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/sub_graph_two_parent_graphs.cpp @@ -0,0 +1,102 @@ +// Tests adding an executable graph object as a sub-graph of two different +// parent graphs. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *X = malloc_device(N, Queue); + + auto S1 = add_node(SubGraph, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2.0f; }); + }); + + add_node( + SubGraph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, S1); + CGH.parallel_for(N, [=](id<1> it) { X[it] += 0.5f; }); + }, + S1); + + auto ExecSubGraph = SubGraph.finalize(); + + auto A1 = add_node(GraphA, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] = 1.0f; }); + }); + + auto A2 = add_node( + GraphA, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, A1); + CGH.ext_oneapi_graph(ExecSubGraph); + }, + A1); + + add_node( + GraphA, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, A2); + CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] *= -1.0f; }); + }, + A2); + + auto ExecGraphA = GraphA.finalize(); + + auto B1 = add_node(GraphB, Queue, [&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] = static_cast(i); }); + }); + + auto B2 = add_node( + GraphB, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, B1); + CGH.ext_oneapi_graph(ExecSubGraph); + }, + B1); + + add_node( + GraphB, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, B2); + CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] *= X[it]; }); + }, + B2); + + auto ExecGraphB = GraphB.finalize(); + + auto EventA1 = + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraphA); }); + std::vector OutputA(N); + auto EventA2 = Queue.memcpy(OutputA.data(), X, N * sizeof(float), EventA1); + + auto EventB1 = Queue.submit([&](handler &CGH) { + CGH.depends_on(EventA2); + CGH.ext_oneapi_graph(ExecGraphB); + }); + std::vector OutputB(N); + Queue.memcpy(OutputB.data(), X, N * sizeof(float), EventB1); + Queue.wait(); + + auto refB = [](size_t i) { + float result = static_cast(i); + result = result * 2.0f + 0.5f; + result *= result; + return result; + }; + + for (size_t i = 0; i < N; i++) { + assert(OutputA[i] == -2.5f); + assert(OutputB[i] == refB(i)); + } + + sycl::free(X, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp new file mode 100644 index 0000000000000..97098dd22e191 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/temp_buffer_reinterpret.cpp @@ -0,0 +1,55 @@ +// This test creates a temporary buffer (which is reinterpreted from the main +// application buffers) which is used in kernels but destroyed before +// finalization and execution of the graph. The original buffers lifetime +// extends until after execution of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + { + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + buffer BufferA{DataA.data(), range<1>{DataA.size()}}; + buffer BufferB{DataB.data(), range<1>{DataB.size()}}; + buffer BufferC{DataC.data(), range<1>{DataC.size()}}; + + { + // Create some temporary buffers only for adding nodes + auto BufferA2 = BufferA.reinterpret(BufferA.get_range()); + auto BufferB2 = BufferB.reinterpret(BufferB.get_range()); + auto BufferC2 = BufferC.reinterpret(BufferC.get_range()); + + add_nodes(Graph, Queue, Size, BufferA2, BufferB2, BufferC2); + } + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + // Perform a wait on all graph submissions. + Queue.wait_and_throw(); + } + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/usm_copy.cpp b/sycl/test-e2e/Graph/Inputs/usm_copy.cpp new file mode 100644 index 0000000000000..26e737682e892 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/usm_copy.cpp @@ -0,0 +1,122 @@ +// Tests adding a usm memcpy node and submitting the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + const T ModValue = 7; + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + for (unsigned i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceA[j] = ReferenceB[j]; + ReferenceA[j] += ModValue; + ReferenceB[j] = ReferenceA[j]; + ReferenceB[j] += ModValue; + ReferenceC[j] = ReferenceB[j]; + } + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // Copy from B to A + auto NodeA = + add_node(Graph, Queue, [&](handler &CGH) { CGH.copy(PtrB, PtrA, Size); }); + + // Read & write A + auto NodeB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeA); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + PtrA[LinID] += ModValue; + }); + }, + NodeA); + + // Read & write B + auto NodeModB = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeA); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + PtrB[LinID] += ModValue; + }); + }, + NodeA); + + // memcpy from A to B + auto NodeC = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, {NodeB, NodeModB}); + CGH.memcpy(PtrB, PtrA, Size * sizeof(T)); + }, + NodeB, NodeModB); + + // Read and write B + auto NodeD = add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeC); + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + PtrB[LinID] += ModValue; + }); + }, + NodeC); + + // Copy from B to C + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeD); + CGH.copy(PtrB, PtrC, Size); + }, + NodeD); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + + Queue.copy(PtrA, DataA.data(), Size, Event); + Queue.copy(PtrB, DataB.data(), Size, Event); + Queue.copy(PtrC, DataC.data(), Size, Event); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/usm_fill.cpp b/sycl/test-e2e/Graph/Inputs/usm_fill.cpp new file mode 100644 index 0000000000000..3b173ee245109 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/usm_fill.cpp @@ -0,0 +1,30 @@ +// Tests adding a USM fill operation as a graph node. + +#include "../graph_common.hpp" + +int main() { + + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *Arr = malloc_device(N, Queue); + + float Pattern = 3.14f; + auto NodeA = + add_node(Graph, Queue, [&](handler &CGH) { CGH.fill(Arr, Pattern, N); }); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + std::vector Output(N); + Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == Pattern); + + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/usm_fill_host.cpp b/sycl/test-e2e/Graph/Inputs/usm_fill_host.cpp new file mode 100644 index 0000000000000..b1408bf0d6106 --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/usm_fill_host.cpp @@ -0,0 +1,31 @@ +// Tests adding a host USM fill operation as a graph node. + +#include "../graph_common.hpp" + +int main() { + + queue Queue; + if (!Queue.get_device().has(sycl::aspect::usm_host_allocations)) { + return 0; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *Arr = malloc_host(N, Queue); + + float Pattern = 3.14f; + auto NodeA = + add_node(Graph, Queue, [&](handler &CGH) { CGH.fill(Arr, Pattern, N); }); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + for (int i = 0; i < N; i++) + assert(Arr[i] == Pattern); + + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/usm_fill_shared.cpp b/sycl/test-e2e/Graph/Inputs/usm_fill_shared.cpp new file mode 100644 index 0000000000000..cf42e35685ffb --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/usm_fill_shared.cpp @@ -0,0 +1,32 @@ +// Tests adding a shared USM fill operation as a graph node. + +#include "../graph_common.hpp" + +int main() { + + queue Queue; + + if (!Queue.get_device().has(sycl::aspect::usm_shared_allocations)) { + return 0; + } + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *Arr = malloc_shared(N, Queue); + + float Pattern = 3.14f; + auto NodeA = + add_node(Graph, Queue, [&](handler &CGH) { CGH.fill(Arr, Pattern, N); }); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + for (int i = 0; i < N; i++) + assert(Arr[i] == Pattern); + + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp new file mode 100644 index 0000000000000..be0bcef2c8934 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK +// +// Temporarily disabled until failure is addressed. +// UNSUPPORTED: windows + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/add_nodes_after_finalize.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp new file mode 100644 index 0000000000000..0add9af73c95a --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp @@ -0,0 +1,74 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// This test attempts recording a set of kernels after they have already been +// executed once before. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB, + ReferenceC); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + // run commands first + event Event = run_kernels_usm(Queue, Size, PtrA, PtrB, PtrC); + Queue.wait_and_throw(); + + Graph.begin_recording(Queue); + run_kernels_usm(Queue, Size, PtrA, PtrB, PtrC); + Graph.end_recording(); + + auto GraphExec = Graph.finalize(); + + // Execute several iterations of the graph (first iteration has already run + // before graph recording) + for (unsigned n = 1; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + Queue.copy(PtrA, DataA.data(), Size); + Queue.copy(PtrB, DataB.data(), Size); + Queue.copy(PtrC, DataC.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + assert(ReferenceA == DataA); + assert(ReferenceB == DataB); + assert(ReferenceC == DataC); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp new file mode 100644 index 0000000000000..7d0c7c81d780f --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/basic_buffer.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp new file mode 100644 index 0000000000000..65b7a146fbf95 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/basic_usm.cpp" \ No newline at end of file diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp new file mode 100644 index 0000000000000..c3492b6d26722 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/basic_usm_host.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp new file mode 100644 index 0000000000000..b4b7f26ceebbf --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/basic_usm_mixed.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp new file mode 100644 index 0000000000000..c3a140d64eae4 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/basic_usm_shared.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp new file mode 100644 index 0000000000000..e731b586885ac --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/basic_usm_system.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy.cpp new file mode 100644 index 0000000000000..77270b1e9bebe --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_copy.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_2d.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_2d.cpp new file mode 100644 index 0000000000000..d00aa10368368 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_2d.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_copy_2d.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target.cpp new file mode 100644 index 0000000000000..7364dea5c7779 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_copy_host2target.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_2d.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_2d.cpp new file mode 100644 index 0000000000000..b650bc67faeb7 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_2d.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_copy_host2target_2d.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_offset.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_offset.cpp new file mode 100644 index 0000000000000..9f2cb1b787902 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_host2target_offset.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_copy_host2target_offset.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_offsets.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_offsets.cpp new file mode 100644 index 0000000000000..05922690d99f4 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_offsets.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_copy_offsets.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host.cpp new file mode 100644 index 0000000000000..1954e2c5bfef8 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_copy_target2host.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_2d.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_2d.cpp new file mode 100644 index 0000000000000..2c3eaa28e7ad2 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_2d.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_copy_target2host_2d.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_offset.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_offset.cpp new file mode 100644 index 0000000000000..22f8934482d5e --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_copy_target2host_offset.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_copy_target2host_offset.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp new file mode 100644 index 0000000000000..9910cc82d6e6b --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/buffer_ordering.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp b/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp new file mode 100644 index 0000000000000..a25b0ff4c5a11 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp @@ -0,0 +1,33 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests attempting to begin recording to a graph when recording is +// already in progress on another graph throws an error. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + bool Success = false; + + exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()}; + GraphA.begin_recording(Queue); + + try { + exp_ext::command_graph GraphB{Queue.get_context(), Queue.get_device()}; + GraphB.begin_recording(Queue); + } catch (sycl::exception &E) { + Success = E.code() == errc::invalid; + } + + GraphA.end_recording(); + + assert(Success); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_buffer_reduction.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_buffer_reduction.cpp new file mode 100644 index 0000000000000..af577686832cd --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_buffer_reduction.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/dotp_buffer_reduction.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp new file mode 100644 index 0000000000000..756fccdc99611 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp @@ -0,0 +1,71 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests a dotp operation using device USM and an in-order queue. + +#include "../graph_common.hpp" + +int main() { + property_list properties{property::queue::in_order()}; + queue Queue{properties}; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + float *Dotp = malloc_device(1, Queue); + + const size_t N = 10; + float *X = malloc_device(N, Queue); + float *Y = malloc_device(N, Queue); + float *Z = malloc_device(N, Queue); + + Graph.begin_recording(Queue); + + auto InitEvent = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { + X[it] = 1.0f; + Y[it] = 2.0f; + Z[it] = 3.0f; + }); + }); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> it) { X[it] = Alpha * X[it] + Beta * Y[it]; }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> it) { Z[it] = Gamma * Z[it] + Beta * Y[it]; }); + }); + + Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t j = 0; j < N; j++) { + Dotp[0] += X[j] * Z[j]; + } + }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); + + float Output; + Queue.memcpy(&Output, Dotp, sizeof(float)).wait(); + + assert(Output == dotp_reference_result(N)); + + sycl::free(Dotp, Queue); + sycl::free(X, Queue); + sycl::free(Y, Queue); + sycl::free(Z, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp new file mode 100644 index 0000000000000..2e41e5a85f5b9 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp @@ -0,0 +1,77 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests a dotp operation using device USM and an in-order queue with empty +// nodes. The second run is to check that there are no leaks reported with the +// embedded ZE_DEBUG=4 testing capability. + +#include "../graph_common.hpp" + +int main() { + property_list Properties{property::queue::in_order()}; + queue Queue{Properties}; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + float *Dotp = malloc_device(1, Queue); + + const size_t N = 10; + float *X = malloc_device(N, Queue); + float *Y = malloc_device(N, Queue); + float *Z = malloc_device(N, Queue); + + Graph.begin_recording(Queue); + + auto InitEvent = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { + X[it] = 1.0f; + Y[it] = 2.0f; + Z[it] = 3.0f; + }); + }); + + auto Empty1 = Queue.submit([&](handler &) {}); + + auto EventA = Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> it) { X[it] = Alpha * X[it] + Beta * Y[it]; }); + }); + + auto EventB = Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> it) { Z[it] = Gamma * Z[it] + Beta * Y[it]; }); + }); + + auto Empty2 = Queue.submit([&](handler &) {}); + + Queue.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t j = 0; j < N; j++) { + Dotp[0] += X[j] * Z[j]; + } + }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); + + float Output; + Queue.memcpy(&Output, Dotp, sizeof(float)).wait(); + + assert(Output == dotp_reference_result(N)); + + sycl::free(Dotp, Queue); + sycl::free(X, Queue); + sycl::free(Y, Queue); + sycl::free(Z, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp new file mode 100644 index 0000000000000..b0d988ca6deda --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp @@ -0,0 +1,75 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests a dotp operation split between 2 in-order queues using device USM. + +#include "../graph_common.hpp" + +int main() { + + property_list properties{property::queue::in_order()}; + queue QueueA{properties}; + queue QueueB{QueueA.get_context(), QueueA.get_device(), properties}; + + exp_ext::command_graph Graph{QueueA.get_context(), QueueA.get_device()}; + + float *Dotp = malloc_device(1, QueueA); + + const size_t N = 10; + float *X = malloc_device(N, QueueA); + float *Y = malloc_device(N, QueueA); + float *Z = malloc_device(N, QueueA); + + Graph.begin_recording(QueueA); + Graph.begin_recording(QueueB); + + QueueA.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { + X[it] = 1.0f; + Y[it] = 2.0f; + Z[it] = 3.0f; + }); + }); + + auto Event = QueueA.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> it) { X[it] = Alpha * X[it] + Beta * Y[it]; }); + }); + + QueueB.submit([&](handler &CGH) { + CGH.depends_on(Event); // needed for cross queue dependency + CGH.parallel_for(range<1>{N}, + [=](id<1> it) { Z[it] = Gamma * Z[it] + Beta * Y[it]; }); + }); + + QueueB.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (size_t j = 0; j < N; j++) { + Dotp[0] += X[j] * Z[j]; + } + }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + + QueueA.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); + + float Output; + QueueA.memcpy(&Output, Dotp, sizeof(float)).wait(); + + assert(Output == dotp_reference_result(N)); + + sycl::free(Dotp, QueueA); + sycl::free(X, QueueA); + sycl::free(Y, QueueA); + sycl::free(Z, QueueA); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_usm_reduction.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_usm_reduction.cpp new file mode 100644 index 0000000000000..dab4b34eec79d --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_usm_reduction.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/dotp_usm_reduction.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/double_buffer.cpp b/sycl/test-e2e/Graph/RecordReplay/double_buffer.cpp new file mode 100644 index 0000000000000..edfaab45bb417 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/double_buffer.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as executable graph update not yet implemented +// XFAIL: * + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/double_buffer.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp new file mode 100644 index 0000000000000..967cfbaaf58e8 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/empty_node.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/event_status_querying.cpp b/sycl/test-e2e/Graph/RecordReplay/event_status_querying.cpp new file mode 100644 index 0000000000000..f1a9ae3e49d1a --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/event_status_querying.cpp @@ -0,0 +1,9 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out 2>&1 | FileCheck %s +// +// CHECK: complete + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/event_status_querying.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_contexts.cpp b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_contexts.cpp new file mode 100644 index 0000000000000..e1d912129951d --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_contexts.cpp @@ -0,0 +1,28 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// + +// This test checks that an expection is thrown when we try to +// record a graph whose context differs from the queue context. +// We ensure that the exception code matches the expected code. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + context InOrderContext; + + exp_ext::command_graph Graph{InOrderContext, Queue.get_device()}; + + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + Graph.begin_recording(Queue); + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + assert(ExceptionCode == sycl::errc::invalid); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp new file mode 100644 index 0000000000000..cd8fd46a72b6d --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/exception_inconsistent_devices.cpp @@ -0,0 +1,46 @@ +// RUN: %{build} -o %t.out +// RUN: %{run-unfiltered-devices} %t.out +// + +// This test checks that an expection is thrown when we try to +// record a graph whose device differs from the queue device. +// We ensure that the exception code matches the expected code. + +#include "../graph_common.hpp" + +int GetLZeroBackend(const sycl::device &Dev) { + // Return 1 if the device backend is "Level_zero" or 0 else. + // 0 does not prevent another device to be picked as a second choice + return Dev.get_backend() == backend::ext_oneapi_level_zero; +} + +int GetOtherBackend(const sycl::device &Dev) { + // Return 1 if the device backend is not "Level_zero" or 0 else. + // 0 does not prevent another device to be picked as a second choice + return Dev.get_backend() != backend::ext_oneapi_level_zero; +} + +int main() { + sycl::device Dev0{GetLZeroBackend}; + sycl::device Dev1{GetOtherBackend}; + + if (Dev0 == Dev1) { + // Skip if we don't have two different devices + std::cout << "Test skipped: the devices are the same" << std::endl; + return 0; + } + + queue Queue{Dev1}; + + exp_ext::command_graph Graph{Queue.get_context(), Dev0}; + + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + Graph.begin_recording(Queue); + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + assert(ExceptionCode == sycl::errc::invalid); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/executable_graph_update.cpp b/sycl/test-e2e/Graph/RecordReplay/executable_graph_update.cpp new file mode 100644 index 0000000000000..84272c20dcd1d --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/executable_graph_update.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as executable graph update not implemented yet +// XFAIL: * + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/executable_graph_update.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/executable_graph_update_ordering.cpp b/sycl/test-e2e/Graph/RecordReplay/executable_graph_update_ordering.cpp new file mode 100644 index 0000000000000..89d724d106301 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/executable_graph_update_ordering.cpp @@ -0,0 +1,15 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as executable graph update and host tasks both aren't +// implemented. +// XFAIL: * + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/executable_graph_update_ordering" diff --git a/sycl/test-e2e/Graph/RecordReplay/finalize_while_recording.cpp b/sycl/test-e2e/Graph/RecordReplay/finalize_while_recording.cpp new file mode 100644 index 0000000000000..906abbd7277b0 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/finalize_while_recording.cpp @@ -0,0 +1,28 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests the ability to finalize a command graph while it is currently being +// recorded to. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + Graph.begin_recording(Queue); + + try { + Graph.finalize(); + } catch (sycl::exception &E) { + assert(false && "Exception thrown on finalize.\n"); + } + + Graph.end_recording(); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp new file mode 100644 index 0000000000000..ee88c065efccf --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as host tasks are not implemented yet +// XFAIL: * + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/host_task.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp new file mode 100644 index 0000000000000..8a59f12d316b4 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/multiple_exec_graphs.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp new file mode 100644 index 0000000000000..4bc6ed9af6976 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/queue_shortcuts.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp new file mode 100644 index 0000000000000..3a702d025b3d3 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/repeated_exec.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/return_values.cpp b/sycl/test-e2e/Graph/RecordReplay/return_values.cpp new file mode 100644 index 0000000000000..db733d87c7e54 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/return_values.cpp @@ -0,0 +1,34 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests the return values from queue graph functions which change the +// internal queue state. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + bool ChangedState = Graph.end_recording(); + assert(ChangedState == false); + + ChangedState = Graph.begin_recording(Queue); + assert(ChangedState == true); + + ChangedState = Graph.begin_recording(Queue); + assert(ChangedState == false); + + ChangedState = Graph.end_recording(); + assert(ChangedState == true); + + ChangedState = Graph.end_recording(); + assert(ChangedState == false); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/stream.cpp b/sycl/test-e2e/Graph/RecordReplay/stream.cpp new file mode 100644 index 0000000000000..0cf1a4da36712 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/stream.cpp @@ -0,0 +1,30 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out %GPU_CHECK_PLACEHOLDER +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out %GPU_CHECK_PLACEHOLDER 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Expected fail as sycl::stream is not implemented yet +// XFAIL: * + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/stream.cpp" + +// CHECK-DAG: Val: 1 +// CHECK-DAG: Val: 2 +// CHECK-DAG: Val: 3 +// CHECK-DAG: Val: 4 +// CHECK-DAG: Val: 5 +// CHECK-DAG: Val: 6 +// CHECK-DAG: Val: 7 +// CHECK-DAG: Val: 8 +// CHECK-DAG: Val: 9 +// CHECK-DAG: Val: 10 +// CHECK-DAG: Val: 11 +// CHECK-DAG: Val: 12 +// CHECK-DAG: Val: 13 +// CHECK-DAG: Val: 14 +// CHECK-DAG: Val: 15 +// CHECK-DAG: Val: 16 diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp new file mode 100644 index 0000000000000..62b1d074e4bb9 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/sub_graph.cpp" \ No newline at end of file diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp new file mode 100644 index 0000000000000..4921e51cc98aa --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/sub_graph_execute_without_parent.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp new file mode 100644 index 0000000000000..0f21e3b6fefd9 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp @@ -0,0 +1,63 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests adding a sub-graph to an in-order queue. + +#include "../graph_common.hpp" + +int main() { + property_list properties{property::queue::in_order()}; + queue Queue{properties}; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph SubGraph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *X = malloc_device(N, Queue); + + SubGraph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2.0f; }); + }); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] += 0.5f; }); + }); + + SubGraph.end_recording(Queue); + + auto ExecSubGraph = SubGraph.finalize(); + + Graph.begin_recording(Queue); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] = 1.0f; }); + }); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecSubGraph); }); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] += 3.0f; }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); + + float Output; + Queue.memcpy(&Output, X, sizeof(float)).wait(); + + assert(Output == 5.5f); + + sycl::free(X, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp new file mode 100644 index 0000000000000..b8f4bfa3b3b83 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// XFAIL:* +// Submit a graph as a subgraph more than once doesn't yet work. + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/sub_graph_multiple_submission.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp new file mode 100644 index 0000000000000..6fc2b39efade3 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/sub_graph_nested.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_reduction.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_reduction.cpp new file mode 100644 index 0000000000000..f3e58b1ef99ff --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_reduction.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/sub_graph_reduction.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp new file mode 100644 index 0000000000000..2d2eb17cd7078 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp @@ -0,0 +1,14 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// XFAIL: * +// Subgraph doesn't work properly in second parent graph + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/sub_graph_two_parent_graphs.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/temp_buffer.cpp b/sycl/test-e2e/Graph/RecordReplay/temp_buffer.cpp new file mode 100644 index 0000000000000..d0303a90b9bd8 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/temp_buffer.cpp @@ -0,0 +1,87 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Fail that needs investigation +// XFAIL: * + +// This test creates a temporary buffer which is used in kernels, but +// destroyed before finalization and execution of the graph. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + using T = int; + + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + std::vector ReferenceC(DataC); + for (unsigned n = 0; n < Iterations; n++) { + for (size_t i = 0; i < Size; i++) { + ReferenceC[i] += (DataA[i] + DataB[i]) + 1; + } + } + + { + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + buffer BufferA{DataA.data(), range<1>{DataA.size()}}; + BufferA.set_write_back(false); + buffer BufferB{DataB.data(), range<1>{DataB.size()}}; + BufferB.set_write_back(false); + buffer BufferC{DataC.data(), range<1>{DataC.size()}}; + BufferC.set_write_back(false); + + Graph.begin_recording(Queue); + + // Create a temporary output buffer to use between kernels. + { + buffer BufferTemp{range<1>{DataA.size()}}; + BufferTemp.set_write_back(false); + + // Vector add to temporary output buffer + Queue.submit([&](handler &CGH) { + auto PtrA = BufferA.get_access(CGH); + auto PtrB = BufferB.get_access(CGH); + auto PtrOut = BufferTemp.get_access(CGH); + CGH.parallel_for(range<1>(Size), + [=](item<1> id) { PtrOut[id] = PtrA[id] + PtrB[id]; }); + }); + + // Modify temp buffer and write to output buffer + Queue.submit([&](handler &CGH) { + auto PtrTemp = BufferTemp.get_access(CGH); + auto PtrOut = BufferC.get_access(CGH); + CGH.parallel_for(range<1>(Size), + [=](item<1> id) { PtrOut[id] += PtrTemp[id] + 1; }); + }); + Graph.end_recording(); + } + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.ext_oneapi_graph(GraphExec); + }); + } + Queue.wait_and_throw(); + + host_accessor HostAccC(BufferC); + for (size_t i = 0; i < Size; i++) { + assert(ReferenceC[i] == HostAccC[i]); + } + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp new file mode 100644 index 0000000000000..a51bcc967b2ee --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/temp_buffer_reinterpret.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp b/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp new file mode 100644 index 0000000000000..6cc286bc2c96b --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp @@ -0,0 +1,53 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests that a command-group function can capture variables by reference +// and still work correctly as a graph node. + +#include "../graph_common.hpp" + +const size_t N = 10; +const float ExpectedValue = 42.0f; + +void run_some_kernel(queue Queue, float *Data) { + // 'Data' is captured by ref here but will have gone out of scope when the + // CGF is later run when the graph is executed. + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + size_t i = idx; + Data[i] = ExpectedValue; + }); + }); +} + +int main() { + + queue Queue{default_selector_v}; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + float *Arr = malloc_device(N, Queue); + + Graph.begin_recording(Queue); + run_some_kernel(Queue, Arr); + Graph.end_recording(Queue); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + std::vector Output(N); + Queue.memcpy(Output.data(), Arr, N * sizeof(float)).wait(); + for (size_t i = 0; i < N; i++) { + assert(Output[i] == ExpectedValue); + } + + sycl::free(Arr, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp new file mode 100644 index 0000000000000..b24dc65614f1e --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/usm_copy.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_copy_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_copy_in_order.cpp new file mode 100644 index 0000000000000..c2b7c786232e4 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/usm_copy_in_order.cpp @@ -0,0 +1,83 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests memcpy operation using device USM and an in-order queue. + +#include "../graph_common.hpp" + +int main() { + property_list properties{property::queue::in_order()}; + queue Queue{properties}; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + float *X = malloc_device(N, Queue); + float *Y = malloc_device(N, Queue); + float *Z = malloc_device(N, Queue); + + // Shouldn't be captured in graph as a dependency + Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { + X[it] = 0.0f; + Y[it] = 0.0f; + Z[it] = 0.0f; + }); + }); + + Graph.begin_recording(Queue); + + auto InitEvent = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { + X[it] = 1.0f; + Y[it] = 2.0f; + Z[it] = 3.0f; + }); + }); + Graph.end_recording(Queue); + + // Shouldn't be captured in graph as a dependency + Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { + X[it] += 0.5f; + Y[it] += 0.5f; + Z[it] += 0.5f; + }); + }); + + Graph.begin_recording(Queue); + // memcpy 1 values from X to Y + Queue.submit([&](handler &CGH) { CGH.memcpy(Y, X, N * sizeof(float)); }); + + // Double Y to 2.0 + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> it) { Y[it] *= 2.0f; }); + }); + + // memcpy from 2.0 Y values to Z + Queue.submit([&](handler &CGH) { CGH.memcpy(Z, Y, N * sizeof(float)); }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }); + + std::vector Output(N); + Queue.memcpy(Output.data(), Z, N * sizeof(float)).wait(); + + for (size_t i = 0; i < N; i++) { + assert(Output[i] == 2.0f); + } + + sycl::free(X, Queue); + sycl::free(Y, Queue); + sycl::free(Z, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp new file mode 100644 index 0000000000000..db225c3efa0e3 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/usm_fill.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp new file mode 100644 index 0000000000000..ccd130320a528 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill_host.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/usm_fill_host.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp new file mode 100644 index 0000000000000..ca6e79542f356 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp @@ -0,0 +1,11 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/usm_fill_shared.cpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/valid_no_end.cpp b/sycl/test-e2e/Graph/RecordReplay/valid_no_end.cpp new file mode 100644 index 0000000000000..cd0f5df8946a9 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/valid_no_end.cpp @@ -0,0 +1,31 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK + +// Tests obtaining a finalized, executable graph from a graph which is +// currently being recorded to without end_recording() being called. + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + { + queue MyQueue(Queue.get_context(), Queue.get_device()); + Graph.begin_recording(MyQueue); + } + + try { + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } catch (sycl::exception &E) { + assert(false && "Exception thrown on finalize or submission.\n"); + } + Queue.wait(); + return 0; +} diff --git a/sycl/test-e2e/Graph/device_query.cpp b/sycl/test-e2e/Graph/device_query.cpp new file mode 100644 index 0000000000000..fe778fadd7519 --- /dev/null +++ b/sycl/test-e2e/Graph/device_query.cpp @@ -0,0 +1,23 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests the using device query for graphs support, and that the return value +// matches expectations. + +#include "graph_common.hpp" + +int main() { + queue Queue; + + auto Device = Queue.get_device(); + + exp_ext::info::graph_support_level SupportsGraphs = + Device.get_info(); + auto Backend = Device.get_backend(); + + if (Backend == backend::ext_oneapi_level_zero) { + assert(SupportsGraphs == exp_ext::info::graph_support_level::native); + } else { + assert(SupportsGraphs == exp_ext::info::graph_support_level::unsupported); + } +} diff --git a/sycl/test-e2e/Graph/empty_graph.cpp b/sycl/test-e2e/Graph/empty_graph.cpp new file mode 100644 index 0000000000000..90eb43986275a --- /dev/null +++ b/sycl/test-e2e/Graph/empty_graph.cpp @@ -0,0 +1,27 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests the ability to finalize and submit a command graph which doesn't +// contain any nodes. + +#include "graph_common.hpp" + +int main() { + queue Queue; + + ext::oneapi::experimental::command_graph Graph{Queue.get_context(), + Queue.get_device()}; + + std::error_code ErrorCode = make_error_code(sycl::errc::success); + try { + auto GraphExec = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + Queue.wait_and_throw(); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::success); + + return 0; +} diff --git a/sycl/test-e2e/Graph/finalize_twice.cpp b/sycl/test-e2e/Graph/finalize_twice.cpp new file mode 100644 index 0000000000000..31c178e8eeae5 --- /dev/null +++ b/sycl/test-e2e/Graph/finalize_twice.cpp @@ -0,0 +1,25 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests calling finalize() more than once on the same command_graph. + +#include "graph_common.hpp" + +int main() { + queue Queue; + + ext::oneapi::experimental::command_graph Graph{Queue.get_context(), + Queue.get_device()}; + auto GraphExec = Graph.finalize(); + + std::error_code ErrorCode = make_error_code(sycl::errc::success); + try { + auto GraphExec2 = Graph.finalize(); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::success); + + return 0; +} diff --git a/sycl/test-e2e/Graph/graph_common.hpp b/sycl/test-e2e/Graph/graph_common.hpp new file mode 100644 index 0000000000000..c5ccc1ba6d271 --- /dev/null +++ b/sycl/test-e2e/Graph/graph_common.hpp @@ -0,0 +1,409 @@ +#include + +#include + +#include + +// Test constants. +constexpr size_t Size = 1024; // Number of data elements in a buffer. +constexpr unsigned Iterations = 5; // Iterations of graph to execute. +constexpr size_t Offset = 100; // Number of offset elements for Buffer accessors + +// Namespace alias to use in test code. +namespace exp_ext = sycl::ext::oneapi::experimental; +// Make tests less verbose by using sycl namespace. +using namespace sycl; + +// Helper functions for wrapping depends_on calls when add_node is used so they +// are not used in the explicit API +template inline void depends_on_helper(sycl::handler &CGH, T Dep) { +#ifdef GRAPH_E2E_RECORD_REPLAY + CGH.depends_on(Dep); +#endif + (void)CGH; + (void)Dep; +} + +template +inline void depends_on_helper(sycl::handler &CGH, + std::initializer_list DepList) { +#ifdef GRAPH_E2E_RECORD_REPLAY + CGH.depends_on(DepList); +#endif + (void)CGH; + (void)DepList; +} + +// We have 4 versions of the same kernel sequence for testing a combination +// of graph construction API against memory model. Each submits the same pattern +/// of 4 kernels with a diamond dependency. +// +// | Buffers | USM | +// ----------------|---------------|-------------------| +// Record & Replay | run_kernels() | run_kernels_usm() | +// ----------------|---------------|-------------------| +// Explicit API | add_kernels() | add_kernels_usm() | + +/// Calculates reference data on the host for a given number of executions +/// @param[in] Iterations Number of iterations of kernel sequence to run. +/// @param[in] Size Number of elements in vectors +/// @param[in,out] ReferenceA First input/output. +/// @param[in,out] ReferenceB Second input/output. +/// @param[in,out] ReferenceC Third input/output. +template +void calculate_reference_data(size_t Iterations, size_t Size, + std::vector &ReferenceA, + std::vector &ReferenceB, + std::vector &ReferenceC) { + for (size_t n = 0; n < Iterations; n++) { + for (size_t i = 0; i < Size; i++) { + ReferenceA[i]++; + ReferenceB[i] += ReferenceA[i]; + ReferenceC[i] -= ReferenceA[i]; + ReferenceB[i]--; + ReferenceC[i]--; + } + } +} + +/// Test Record and Replay graph construction with buffers. +/// +/// @param Q Queue to submit nodes to. +/// @param Size Number of elements in the buffers. +/// @param BufferA First input/output to use in kernels. +/// @param BufferB Second input/output to use in kernels. +/// @param BufferC Third input/output to use in kernels. +/// +/// @return An event corresponding to the exit node of the submissions sequence. +template +event run_kernels(queue Q, const size_t Size, buffer BufferA, + buffer BufferB, buffer BufferC) { + // Read & write Buffer A. + Q.submit([&](handler &CGH) { + auto DataA = BufferA.template get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { DataA[Id]++; }); + }); + + // Reads Buffer A. + // Read & Write Buffer B. + Q.submit([&](handler &CGH) { + auto DataA = BufferA.template get_access(CGH); + auto DataB = BufferB.template get_access(CGH); + CGH.parallel_for(range<1>(Size), + [=](item<1> Id) { DataB[Id] += DataA[Id]; }); + }); + + // Reads Buffer A. + // Read & writes Buffer C + Q.submit([&](handler &CGH) { + auto DataA = BufferA.template get_access(CGH); + auto DataC = BufferC.template get_access(CGH); + CGH.parallel_for(range<1>(Size), + [=](item<1> Id) { DataC[Id] -= DataA[Id]; }); + }); + + // Read & write Buffers B and C. + auto ExitEvent = Q.submit([&](handler &CGH) { + auto DataB = BufferB.template get_access(CGH); + auto DataC = BufferC.template get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { + DataB[Id]--; + DataC[Id]--; + }); + }); + + return ExitEvent; +} + +/// Test Explicit API graph construction with buffers. +/// +/// @param Graph Modifiable graph to add commands to. +/// @param Size Number of elements in the buffers. +/// @param BufferA First input/output to use in kernels. +/// @param BufferB Second input/output to use in kernels. +/// @param BufferC Third input/output to use in kernels. +/// +/// @return Exit node of the submission sequence. +template +exp_ext::node +add_kernels(exp_ext::command_graph Graph, + const size_t Size, buffer BufferA, buffer BufferB, + buffer BufferC) { + // Read & write Buffer A + Graph.add([&](handler &CGH) { + auto DataA = BufferA.template get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { DataA[Id]++; }); + }); + + // Reads Buffer A + // Read & Write Buffer B + Graph.add([&](handler &CGH) { + auto DataA = BufferA.template get_access(CGH); + auto DataB = BufferB.template get_access(CGH); + CGH.parallel_for(range<1>(Size), + [=](item<1> Id) { DataB[Id] += DataA[Id]; }); + }); + + // Reads Buffer A + // Read & writes Buffer C + Graph.add([&](handler &CGH) { + auto DataA = BufferA.template get_access(CGH); + auto DataC = BufferC.template get_access(CGH); + CGH.parallel_for(range<1>(Size), + [=](item<1> Id) { DataC[Id] -= DataA[Id]; }); + }); + + // Read & write Buffers B and C + auto ExitNode = Graph.add([&](handler &CGH) { + auto DataB = BufferB.template get_access(CGH); + auto DataC = BufferC.template get_access(CGH); + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { + DataB[Id]--; + DataC[Id]--; + }); + }); + return ExitNode; +} + +//// Test Explicit API graph construction with USM. +/// +/// @param Q Command-queue to make kernel submissions to. +/// @param Size Number of elements in the buffers. +/// @param DataA Pointer to first USM allocation to use in kernels. +/// @param DataB Pointer to second USM allocation to use in kernels. +/// @param DataC Pointer to third USM allocation to use in kernels. +/// +/// @return Event corresponding to the exit node of the submission sequence. +template +event run_kernels_usm(queue Q, const size_t Size, T *DataA, T *DataB, + T *DataC) { + // Read & write Buffer A + auto EventA = Q.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { + auto LinID = Id.get_linear_id(); + DataA[LinID]++; + }); + }); + + // Reads Buffer A + // Read & Write Buffer B + auto EventB = Q.submit([&](handler &CGH) { + CGH.depends_on(EventA); + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { + auto LinID = Id.get_linear_id(); + DataB[LinID] += DataA[LinID]; + }); + }); + + // Reads Buffer A + // Read & writes Buffer C + auto EventC = Q.submit([&](handler &CGH) { + CGH.depends_on(EventA); + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { + auto LinID = Id.get_linear_id(); + DataC[LinID] -= DataA[LinID]; + }); + }); + + // Read & write Buffers B and C + auto ExitEvent = Q.submit([&](handler &CGH) { + CGH.depends_on({EventB, EventC}); + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { + auto LinID = Id.get_linear_id(); + DataB[LinID]--; + DataC[LinID]--; + }); + }); + return ExitEvent; +} + +/// Test Explicit API graph construction with USM. +/// +/// @param Graph Modifiable graph to add commands to. +/// @param Size Number of elements in the buffers. +/// @param DataA Pointer to first USM allocation to use in kernels. +/// @param DataB Pointer to second USM allocation to use in kernels. +/// @param DataC Pointer to third USM allocation to use in kernels. +/// +/// @return Exit node of the submission sequence. +template +exp_ext::node +add_kernels_usm(exp_ext::command_graph Graph, + const size_t Size, T *DataA, T *DataB, T *DataC) { + // Read & write Buffer A + auto NodeA = Graph.add([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { + auto LinID = Id.get_linear_id(); + DataA[LinID]++; + }); + }); + + // Reads Buffer A + // Read & Write Buffer B + auto NodeB = Graph.add( + [&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { + auto LinID = Id.get_linear_id(); + DataB[LinID] += DataA[LinID]; + }); + }, + {exp_ext::property::node::depends_on(NodeA)}); + + // Reads Buffer A + // Read & writes Buffer C + auto NodeC = Graph.add( + [&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { + auto LinID = Id.get_linear_id(); + DataC[LinID] -= DataA[LinID]; + }); + }, + {exp_ext::property::node::depends_on(NodeA)}); + + // Read & write data B and C + auto ExitNode = Graph.add( + [&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> Id) { + auto LinID = Id.get_linear_id(); + DataB[LinID]--; + DataC[LinID]--; + }); + }, + {exp_ext::property::node::depends_on(NodeB, NodeC)}); + + return ExitNode; +} + +/// Adds a common series of nodes to a graph forming a diamond dependency +/// structure using USM pointers for the inputs. Can be used for either Explicit +/// or Record and Replay with the choice being dictated by defining one of +/// GRAPH_E2E_EXPLICIT or GRAPH_E2E_RECORD_REPLAY. +/// +/// @param Graph Modifiable graph to add commands to. +/// @param Queue Queue to be used for record and replay. +/// @param Size Number of elements in the buffers. +/// @param DataA Pointer to first USM allocation to use in kernels. +/// @param DataB Pointer to second USM allocation to use in kernels. +/// @param DataC Pointer to third USM allocation to use in kernels. +/// +/// @return If using Explicit API this will be the last node added, if Record +/// and Replay this will be an event corresponding to the last submission. +template +auto add_nodes(exp_ext::command_graph Graph, + queue Queue, const size_t Size, T *DataA, T *DataB, T *DataC) { +#if defined(GRAPH_E2E_EXPLICIT) + return add_kernels_usm(Graph, Size, DataA, DataB, DataC); +#elif defined(GRAPH_E2E_RECORD_REPLAY) + Graph.begin_recording(Queue); + auto ev = run_kernels_usm(Queue, Size, DataA, DataB, DataC); + Graph.end_recording(Queue); + return ev; +#else + assert(0 && "Error: Cannot use add_nodes without selecting an API"); +#endif +} + +/// Adds a common series of nodes to a graph forming a diamond dependency +/// structure using Buffers for the inputs. Can be used for either Explicit +/// or Record and Replay with the choice being dictated by defining one of +/// GRAPH_E2E_EXPLICIT or GRAPH_E2E_RECORD_REPLAY. +/// +/// @param Graph Modifiable graph to add commands to. +/// @param Queue Queue to be used for record and replay. +/// @param Size Number of elements in the buffers. +/// @param BufferA First input/output to use in kernels. +/// @param BufferB Second input/output to use in kernels. +/// @param BufferC Third input/output to use in kernels. +/// +/// @return If using Explicit API this will be the last node added, if Record +/// and Replay API this will be an event corresponding to the last submission. +template +auto add_nodes(exp_ext::command_graph Graph, + queue Queue, const size_t Size, buffer BufferA, + buffer BufferB, buffer BufferC) { +#if defined(GRAPH_E2E_EXPLICIT) + return add_kernels(Graph, Size, BufferA, BufferB, BufferC); +#elif defined(GRAPH_E2E_RECORD_REPLAY) + Graph.begin_recording(Queue); + auto ev = run_kernels(Queue, Size, BufferA, BufferB, BufferC); + Graph.end_recording(Queue); + return ev; +#else + assert(0 && "Error: Cannot use add_nodes without selecting an API"); +#endif +} + +/// Adds a single node to the graph in an API agnostic way. Can be used for +/// either Explicit or Record and Replay with the choice being dictated by +/// defining one of GRAPH_E2E_EXPLICIT or GRAPH_E2E_RECORD_REPLAY. +/// +/// @tparam CGFunc Type of the command group function. +/// @tparam DepT Type of all the dependencies. +/// @param Graph Modifiable graph to add commands to. +/// @param Queue Queue to be used for record and replay. +/// @param CGF The command group function representing the node +/// @param Deps Parameter pack of dependencies, if they are Nodes we pass them +/// to explicit API add, otherwise they are ignored. +/// @return If using the Explicit API this will be the node that was added, if +/// Record and Replay this will be an event representing the submission. +template +auto add_node(exp_ext::command_graph Graph, + queue Queue, CGFunc CGF, DepT... Deps) { +#if defined(GRAPH_E2E_EXPLICIT) + if constexpr ((std::is_same_v && ...)) { + return Graph.add(CGF, {exp_ext::property::node::depends_on(Deps...)}); + } else { + return Graph.add(CGF); + } +#elif defined(GRAPH_E2E_RECORD_REPLAY) + Graph.begin_recording(Queue); + auto ev = Queue.submit(CGF); + Graph.end_recording(Queue); + return ev; +#else + assert(0 && "Error: Cannot use add_node without selecting an API"); +#endif +} + +/// Adds an empty node to the graph in an API agnostic way. Can be used for +/// either Explicit or Record and Replay with the choice being dictated by +/// defining one of GRAPH_E2E_EXPLICIT or GRAPH_E2E_RECORD_REPLAY. +/// +/// @tparam DepT Type of all the dependencies. +/// @param Graph Modifiable graph to add commands to. +/// @param Queue Queue to be used for record and replay. +/// @param Deps Parameter pack of dependencies, if they are Nodes we pass them +/// to explicit API add, otherwise they are passed as events to queue::submit() +/// for the empty node submission. +/// @return If using the Explicit API this will be the node that was added, if +/// Record and Replay this will be an event representing the submission. +template +auto add_empty_node( + exp_ext::command_graph Graph, queue Queue, + DepT... Deps) { +#if defined(GRAPH_E2E_EXPLICIT) + if constexpr ((std::is_same_v && ...)) { + return Graph.add({exp_ext::property::node::depends_on(Deps...)}); + } else { + return Graph.add(); + } +#elif defined(GRAPH_E2E_RECORD_REPLAY) + Graph.begin_recording(Queue); + auto ev = Queue.submit( + [&](sycl::handler &CGH) { CGH.depends_on(std::vector{Deps...}); }); + Graph.end_recording(Queue); + return ev; +#else + assert(0 && "Error: Cannot use add_empty_node without selecting an API"); +#endif +} + +// Values for dotp tests +constexpr float Alpha = 1.0f; +constexpr float Beta = 2.0f; +constexpr float Gamma = 3.0f; + +// Reference function for dotp +float dotp_reference_result(size_t N) { + return N * (Alpha * 1.0f + Beta * 2.0f) * (Gamma * 3.0f + Beta * 2.0f); +} diff --git a/sycl/test-e2e/Graph/invalid_depends_on.cpp b/sycl/test-e2e/Graph/invalid_depends_on.cpp new file mode 100644 index 0000000000000..cd9ee51303c12 --- /dev/null +++ b/sycl/test-e2e/Graph/invalid_depends_on.cpp @@ -0,0 +1,79 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests that calling handler::depends_on() for events not part of the graph +// throws. + +#include "graph_common.hpp" + +int main() { + queue Queue; + + ext::oneapi::experimental::command_graph Graph{Queue.get_context(), + Queue.get_device()}; + ext::oneapi::experimental::command_graph Graph2{Queue.get_context(), + Queue.get_device()}; + + auto NormalEvent = Queue.submit( + [&](handler &CGH) { CGH.single_task([=]() {}); }); + + Graph2.begin_recording(Queue); + + auto OtherGraphEvent = Queue.submit( + [&](handler &CGH) { CGH.single_task([=]() {}); }); + + Graph2.end_recording(Queue); + + Graph.begin_recording(Queue); + + // Test that depends_on in explicit and record and replay throws from an event + // outside any graph. + + std::error_code ErrorCode = make_error_code(sycl::errc::success); + try { + auto GraphEvent = Queue.submit([&](handler &CGH) { + CGH.depends_on(NormalEvent); + CGH.single_task([=]() {}); + }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::invalid); + + ErrorCode = make_error_code(sycl::errc::success); + try { + Graph.add([&](handler &CGH) { + CGH.depends_on(NormalEvent); + CGH.single_task([=]() {}); + }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::invalid); + + // Test that depends_on throws from an event from another graph. + ErrorCode = make_error_code(sycl::errc::success); + try { + auto GraphEvent = Queue.submit([&](handler &CGH) { + CGH.depends_on(OtherGraphEvent); + CGH.single_task([=]() {}); + }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::invalid); + + ErrorCode = make_error_code(sycl::errc::success); + try { + Graph.add([&](handler &CGH) { + CGH.depends_on(OtherGraphEvent); + CGH.single_task([=]() {}); + }); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::invalid); + + return 0; +} diff --git a/sycl/test-e2e/Graph/invalid_event_wait.cpp b/sycl/test-e2e/Graph/invalid_event_wait.cpp new file mode 100644 index 0000000000000..bb7f4ad6bedda --- /dev/null +++ b/sycl/test-e2e/Graph/invalid_event_wait.cpp @@ -0,0 +1,31 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests that waiting on an event returned from a Record and Replay submission +// throws. + +#include "graph_common.hpp" + +int main() { + queue Queue; + + ext::oneapi::experimental::command_graph Graph{Queue.get_context(), + Queue.get_device()}; + Graph.begin_recording(Queue); + + auto GraphEvent = Queue.submit( + [&](handler &CGH) { CGH.single_task([=]() {}); }); + + Graph.end_recording(Queue); + + std::error_code ErrorCode = make_error_code(sycl::errc::success); + try { + GraphEvent.wait(); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::invalid); + + return 0; +} diff --git a/sycl/test-e2e/Graph/invalid_queue_wait.cpp b/sycl/test-e2e/Graph/invalid_queue_wait.cpp new file mode 100644 index 0000000000000..8ba8c7d1c2125 --- /dev/null +++ b/sycl/test-e2e/Graph/invalid_queue_wait.cpp @@ -0,0 +1,26 @@ +// REQUIRES: level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Tests that waiting on a Queue in recording mode throws. + +#include "graph_common.hpp" + +int main() { + queue Queue; + + ext::oneapi::experimental::command_graph Graph{Queue.get_context(), + Queue.get_device()}; + Graph.begin_recording(Queue); + + std::error_code ErrorCode = make_error_code(sycl::errc::success); + + try { + Queue.wait(); + } catch (const sycl::exception &e) { + ErrorCode = e.code(); + } + assert(ErrorCode == sycl::errc::invalid); + + return 0; +} diff --git a/sycl/test/extensions/macro_graph.cpp b/sycl/test/extensions/macro_graph.cpp new file mode 100644 index 0000000000000..bca987f3ef243 --- /dev/null +++ b/sycl/test/extensions/macro_graph.cpp @@ -0,0 +1,16 @@ +// This test checks presence of macros for available extensions. +// RUN: %clangxx -fsycl -fsyntax-only %s + +#include + +#if SYCL_EXT_ONEAPI_GRAPH == 1 +constexpr bool macro_defined = true; +#else +constexpr bool macro_defined = false; +#endif + +int main() { + static_assert(macro_defined); + + return 0; +} diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index 1193439f196cc..590d86b8e0019 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -37,6 +37,20 @@ class CommandGraphTest : public ::testing::Test { experimental::command_graph Graph; }; +TEST_F(CommandGraphTest, QueueState) { + experimental::queue_state State = Queue.ext_oneapi_get_state(); + ASSERT_EQ(State, experimental::queue_state::executing); + + experimental::command_graph Graph{Queue.get_context(), Queue.get_device()}; + Graph.begin_recording(Queue); + State = Queue.ext_oneapi_get_state(); + ASSERT_EQ(State, experimental::queue_state::recording); + + Graph.end_recording(); + State = Queue.ext_oneapi_get_state(); + ASSERT_EQ(State, experimental::queue_state::executing); +} + TEST_F(CommandGraphTest, AddNode) { auto GraphImpl = sycl::detail::getSyclObjImpl(Graph);