diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d6dad42b68b34..8f093f07d6009 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -243,6 +243,8 @@ class DispatchHostTask { for (const DepDesc &Dep : Deps) Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement); + + Sched.enqueueHostTasksUnlocked(); } } }; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 2086b74d0f273..01be39a9729cf 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -108,10 +108,12 @@ static void unmarkVisitedNodes(std::vector &Visited) { Cmd->MMarks.MVisited = false; } -static void handleVisitedNodes(std::vector &Visited) { +void +Scheduler::GraphBuilder::handleVisitedNodes(std::vector &Visited) { for (Command *Cmd : Visited) { if (Cmd->MMarks.MToBeDeleted) { Cmd->getEvent()->setCommand(nullptr); + Scheduler::getInstance().removeHostTaskCommandUnlocked(Cmd); delete Cmd; } else Cmd->MMarks.MVisited = false; @@ -803,9 +805,11 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, NewCmd->addDep(e); } - if (CGType == CG::CGTYPE::CODEPLAY_HOST_TASK) + if (CGType == CG::CGTYPE::CODEPLAY_HOST_TASK) { NewCmd->MEmptyCmd = addEmptyCmd(NewCmd.get(), NewCmd->getCG().MRequirements, Queue, Command::BlockReason::HostTask); + Scheduler::getInstance().addHostTaskCommandUnlocked(NewCmd.get()); + } if (MPrintOptionsArray[AfterAddCG]) printGraphAsDot("after_addCG"); diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c2a0c3fbbb509..3945879f439e9 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -286,6 +286,32 @@ MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) { return Req->MSYCLMemObj->MRecord.get(); } +void Scheduler::addHostTaskCommandUnlocked(Command *Cmd) { + HostTaskCommandXRefT XRef = HostTaskCmds.insert(HostTaskCmds.end(), Cmd); + HostTaskCmdXRefs[Cmd] = XRef; +} + +void Scheduler::removeHostTaskCommandUnlocked(Command *Cmd) { + auto It = HostTaskCmdXRefs.find(Cmd); + + if (It == HostTaskCmdXRefs.end()) + return; + + HostTaskCommandXRefT &XRef = It->second; + HostTaskCmds.erase(XRef); + + HostTaskCmdXRefs.erase(It); +} + +void Scheduler::enqueueHostTasksUnlocked() { + for (Command *Cmd : HostTaskCmds) { + EnqueueResultT Res; + bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) + throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); + } +} + } // namespace detail } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index d6dec5e599f8c..5cb582070c0fa 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include /// \defgroup sycl_graph DPC++ Execution Graph @@ -585,6 +586,8 @@ class Scheduler { private: friend class ::MockScheduler; + static void handleVisitedNodes(std::vector &Visited); + /// Searches for suitable alloca in memory record. /// /// If none found, creates new one. @@ -743,6 +746,23 @@ class Scheduler { friend class stream_impl; + // List of host-task commands. This data structure is employed to overcome + // certain use-cases with deadlocks involving host-task. The use of this list + // is to enqueue (if possible) host-tasks when another host task is finished. + // List is used in order to remain the order of host-tasks unchanged. + // A map is employed to allow for quick lookup and removal of host-task + // command upon cleanup. + // Access to this data structure is guarded with graph read-write lock. + using HostTaskCommandsT = std::list; + using HostTaskCommandXRefT = HostTaskCommandsT::iterator; + HostTaskCommandsT HostTaskCmds; + std::unordered_map HostTaskCmdXRefs; + + void addHostTaskCommandUnlocked(Command *Cmd); + void removeHostTaskCommandUnlocked(Command *Cmd); + void enqueueHostTasksUnlocked(); + + // Protects stream buffers pool std::mutex StreamBuffersPoolMutex; std::map StreamBuffersPool; diff --git a/sycl/test/host-interop-task/host-task.cpp b/sycl/test/host-interop-task/host-task.cpp index ca355bcb4b654..0a3a30b6f7944 100644 --- a/sycl/test/host-interop-task/host-task.cpp +++ b/sycl/test/host-interop-task/host-task.cpp @@ -11,13 +11,26 @@ // RUN: %GPU_RUN_PLACEHOLDER %t.out 3 // RUN: %ACC_RUN_PLACEHOLDER %t.out 3 -// RUNx: %CPU_RUN_PLACEHOLDER %t.out 4 -// RUNx: %GPU_RUN_PLACEHOLDER %t.out 4 -// RUNx: %ACC_RUN_PLACEHOLDER %t.out 4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out 4 +// RUN: %GPU_RUN_PLACEHOLDER %t.out 4 +// RUN: %ACC_RUN_PLACEHOLDER %t.out 4 + +// RUN: %CPU_RUN_PLACEHOLDER %t.out 5 +// RUN: %GPU_RUN_PLACEHOLDER %t.out 5 +// RUN: %ACC_RUN_PLACEHOLDER %t.out 5 + +// RUN: %CPU_RUN_PLACEHOLDER %t.out 6 +// RUN: %GPU_RUN_PLACEHOLDER %t.out 6 +// RUN: %ACC_RUN_PLACEHOLDER %t.out 6 + +// RUN: %CPU_RUN_PLACEHOLDER %t.out 7 +// RUN: %GPU_RUN_PLACEHOLDER %t.out 7 +// RUN: %ACC_RUN_PLACEHOLDER %t.out 7 #include #include #include +#include #include using namespace cl::sycl; @@ -103,7 +116,6 @@ void test3() { std::vector Deps; - using namespace std::chrono_literals; static constexpr size_t Count = 10; auto Start = std::chrono::steady_clock::now(); @@ -146,6 +158,7 @@ void test3() { Q.wait_and_throw(); auto End = std::chrono::steady_clock::now(); + using namespace std::chrono_literals; constexpr auto Threshold = 2s; assert(End - Start < Threshold && "Host tasks were waiting for too long"); @@ -153,7 +166,7 @@ void test3() { // Host-task depending on another host-task via handler::depends_on() only // should not hang -void test4() { +void test4(size_t Count = 1) { queue Q(EH); static constexpr size_t BufferSize = 10 * 1024; @@ -165,51 +178,150 @@ void test4() { buffer B4{range<1>{BufferSize}}; buffer B5{range<1>{BufferSize}}; - // This host task should be submitted without hesitation - event E1 = Q.submit([&](handler &CGH) { - std::cout << "Submit 1" << std::endl; + for (size_t Idx = 1; Idx <= Count; ++Idx) { + // This host task should be submitted without hesitation + event E1 = Q.submit([&](handler &CGH) { + std::cout << "Submit 1" << std::endl; - auto Acc0 = B0.get_access(CGH); - auto Acc1 = B1.get_access(CGH); - auto Acc2 = B2.get_access(CGH); + auto Acc0 = B0.get_access(CGH); + auto Acc1 = B1.get_access(CGH); + auto Acc2 = B2.get_access(CGH); - CGH.codeplay_host_task([=] { - Acc0[0] = 1; - Acc1[0] = 2; - Acc2[0] = 3; + CGH.codeplay_host_task([=] { + Acc0[0] = 1 * Idx; + Acc1[0] = 2 * Idx; + Acc2[0] = 3 * Idx; + }); }); - }); - // This host task is going to depend on blocked empty node of the first - // host-task (via buffer #2). Still this one should be enqueued. - event E2 = Q.submit([&](handler &CGH) { - std::cout << "Submit 2" << std::endl; + // This host task is going to depend on blocked empty node of the first + // host-task (via buffer #2). Still this one should be enqueued. + event E2 = Q.submit([&](handler &CGH) { + std::cout << "Submit 2" << std::endl; - auto Acc2 = B2.get_access(CGH); - auto Acc3 = B3.get_access(CGH); + auto Acc2 = B2.get_access(CGH); + auto Acc3 = B3.get_access(CGH); - CGH.codeplay_host_task([=] { - Acc2[1] = 1; - Acc3[1] = 2; + CGH.codeplay_host_task([=] { + Acc2[1] = 1 * Idx; + Acc3[1] = 2 * Idx; + }); }); - }); - // This host-task only depends on the second host-task via - // handler::depends_on(). This one should not hang and should be enqueued - // after host-task #2. - event E3 = Q.submit([&](handler &CGH) { - CGH.depends_on(E2); + // This host-task only depends on the second host-task via + // handler::depends_on(). This one should not hang and should be eexecuted + // after host-task #2. + event E3 = Q.submit([&](handler &CGH) { + CGH.depends_on(E2); - std::cout << "Submit 3" << std::endl; + std::cout << "Submit 3" << std::endl; - auto Acc4 = B4.get_access(CGH); - auto Acc5 = B5.get_access(CGH); + auto Acc4 = B4.get_access(CGH); + auto Acc5 = B5.get_access(CGH); - CGH.codeplay_host_task([=] { - Acc4[2] = 1; - Acc5[2] = 2; + CGH.codeplay_host_task([=] { + Acc4[2] = 1 * Idx; + Acc5[2] = 2 * Idx; + }); }); - }); + } + + Q.wait_and_throw(); +} + +// Host-task depending on another host-task via handler::depends_on() only +// should not hang. A bit more complicated case with kernels depending on +// host-task being involved. +void test5(size_t Count = 1) { + queue Q(EH); + + static constexpr size_t BufferSize = 10 * 1024; + + buffer B0{range<1>{BufferSize}}; + buffer B1{range<1>{BufferSize}}; + buffer B2{range<1>{BufferSize}}; + buffer B3{range<1>{BufferSize}}; + buffer B4{range<1>{BufferSize}}; + buffer B5{range<1>{BufferSize}}; + + using namespace std::chrono_literals; + + for (size_t Idx = 1; Idx <= Count; ++Idx) { + // This host task should be submitted without hesitation + Q.submit([&](handler &CGH) { + std::cout << "Submit HT-1" << std::endl; + + auto Acc0 = B0.get_access(CGH); + + CGH.codeplay_host_task([=] { + std::this_thread::sleep_for(2s); + Acc0[0] = 1 * Idx; + }); + }); + + Q.submit([&](handler &CGH) { + std::cout << "Submit Kernel-1" << std::endl; + + auto Acc0 = B0.get_access(CGH); + + CGH.single_task([=] { + Acc0[1] = 1 * Idx; + }); + }); + + Q.submit([&](handler &CGH) { + std::cout << "Submit Kernel-2" << std::endl; + + auto Acc1 = B1.get_access(CGH); + + CGH.single_task([=] { + Acc1[2] = 1 * Idx; + }); + }); + + Q.submit([&](handler &CGH) { + std::cout << "Submit HT-2" << std::endl; + + auto Acc2 = B2.get_access(CGH); + + CGH.codeplay_host_task([=] { + std::this_thread::sleep_for(2s); + Acc2[3] = 1 * Idx; + }); + }); + + // This host task is going to depend on blocked empty node of the second + // host-task (via buffer #0). Still this one should be enqueued. + event EHT3 = Q.submit([&](handler &CGH) { + std::cout << "Submit HT-3" << std::endl; + + auto Acc0 = B0.get_access(CGH); + auto Acc1 = B1.get_access(CGH); + auto Acc2 = B2.get_access(CGH); + + CGH.codeplay_host_task([=] { + std::this_thread::sleep_for(2s); + Acc0[4] = 1 * Idx; + Acc1[4] = 2 * Idx; + Acc2[4] = 3 * Idx; + }); + }); + + // This host-task only depends on the third host-task via + // handler::depends_on(). This one should not hang and should be executed + // after host-task #3. + Q.submit([&](handler &CGH) { + std::cout << "Submit HT-4" << std::endl; + + CGH.depends_on(EHT3); + + auto Acc5 = B5.get_access(CGH); + + CGH.codeplay_host_task([=] { + Acc5[5] = 1 * Idx; + }); + }); + } Q.wait_and_throw(); } @@ -233,6 +345,15 @@ int main(int Argc, const char *Argv[]) { case 4: test4(); break; + case 5: + test5(); + break; + case 6: + test4(10); + break; + case 7: + test5(10); + break; default: return 1; }