diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp index 81082031e32b0..173c30e4ceae4 100644 --- a/sycl/include/sycl/detail/cg.hpp +++ b/sycl/include/sycl/detail/cg.hpp @@ -99,6 +99,7 @@ class CG { } CG(CG &&CommandGroup) = default; + CG(const CG &CommandGroup) = default; CGTYPE getType() { return MType; } @@ -138,7 +139,7 @@ class CGExecKernel : public CG { public: /// Stores ND-range description. NDRDescT MNDRDesc; - std::unique_ptr MHostKernel; + std::shared_ptr MHostKernel; std::shared_ptr MSyclKernel; std::shared_ptr MKernelBundle; std::vector MArgs; @@ -176,6 +177,8 @@ class CGExecKernel : public CG { "Wrong type of exec kernel CG."); } + CGExecKernel(const CGExecKernel &CGExec) = default; + std::vector getArguments() const { return MArgs; } std::string getKernelName() const { return MKernelName; } std::vector> getStreams() const { diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 1b37c049043ae..376bf5ed75982 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -292,30 +292,6 @@ class RoundedRangeKernelWithKH { using std::enable_if_t; using sycl::detail::queue_impl; -std::shared_ptr createCommandAndEnqueue( - CG::CGTYPE Type, std::shared_ptr Queue, - NDRDescT NDRDesc, std::unique_ptr HostKernel, - std::unique_ptr HostTaskPtr, - std::unique_ptr InteropTask, - std::shared_ptr Kernel, std::string KernelName, - KernelBundleImplPtr KernelBundle, - std::vector> ArgsStorage, - std::vector AccStorage, - std::vector LocalAccStorage, - std::vector> StreamStorage, - std::vector> SharedPtrStorage, - std::vector> AuxiliaryResources, - std::vector Args, void *SrcPtr, void *DstPtr, - size_t Length, std::vector Pattern, size_t SrcPitch, size_t DstPitch, - size_t Width, size_t Height, size_t Offset, bool IsDeviceImageScoped, - const std::string &HostPipeName, void *HostPipePtr, bool HostPipeBlocking, - size_t HostPipeTypeSize, bool HostPipeRead, pi_mem_advice Advice, - std::vector Requirements, - std::vector Events, - std::vector EventsWaitWithBarrier, - detail::OSModuleHandle OSModHandle, - RT::PiKernelCacheConfig KernelCacheConfig, detail::code_location CodeLoc); - } // namespace detail /// Command group handler class. @@ -2903,9 +2879,17 @@ class __SYCL_EXPORT handler { /// The list of valid SYCL events that need to complete /// before barrier command can be executed std::vector MEventsWaitWithBarrier; - + + /// The graph that is associated with this handler. std::shared_ptr MGraph; + /// If we are submitting a graph using ext_oneapi_graph this will be the graph + /// to be executed. + std::shared_ptr + MExecGraph; + /// Storage for a node created from a subgraph submission. std::shared_ptr MSubgraphNode; + /// Storage for the CG created when handling graph nodes added explicitly. + std::unique_ptr MGraphNodeCG; bool MIsHost = false; diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 8b590eeb41737..ccd72ae705dcc 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -143,6 +143,7 @@ graph_impl::add(const std::shared_ptr &Impl, const std::vector> &Dep) { sycl::handler Handler{Impl}; CGF(Handler); + Handler.finalize(); // If the handler recorded a subgraph return that here as the relevant nodes // have already been added. The node returned here is an empty node with @@ -150,47 +151,37 @@ graph_impl::add(const std::shared_ptr &Impl, if (Handler.MSubgraphNode) { return Handler.MSubgraphNode; } - - return this->add(Handler.MKernel, Handler.MNDRDesc, Handler.MOSModuleHandle, - Handler.MKernelName, Handler.MAccStorage, - Handler.MLocalAccStorage, Handler.MCGType, Handler.MArgs, - Handler.MImpl->MAuxiliaryResources, Dep, Handler.MEvents); + return this->add(Handler.MCGType, std::move(Handler.MGraphNodeCG), Dep); } -std::shared_ptr graph_impl::add( - std::shared_ptr Kernel, - sycl::detail::NDRDescT NDRDesc, sycl::detail::OSModuleHandle OSModuleHandle, - std::string KernelName, - const std::vector &AccStorage, - const std::vector &LocalAccStorage, - sycl::detail::CG::CGTYPE CGType, - const std::vector &Args, - const std::vector> &AuxiliaryResources, - const std::vector> &Dep, - const std::vector> &DepEvents) { - const std::shared_ptr &NodeImpl = std::make_shared( - Kernel, NDRDesc, OSModuleHandle, KernelName, AccStorage, LocalAccStorage, - CGType, Args, AuxiliaryResources); +std::shared_ptr +graph_impl::add(sycl::detail::CG::CGTYPE CGType, + std::unique_ptr CommandGroup, + const std::vector> &Dep) { // Copy deps so we can modify them auto Deps = Dep; - // A unique set of dependencies obtained by checking kernel arguments - // for accessors - std::set> UniqueDeps; - for (auto &Arg : Args) { - if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_accessor) { - continue; - } - // Look through the graph for nodes which share this argument - for (auto NodePtr : MRoots) { - check_for_arg(Arg, NodePtr, UniqueDeps); + if (CGType == sycl::detail::CG::Kernel) { + // A unique set of dependencies obtained by checking kernel arguments + // for accessors + std::set> UniqueDeps; + const auto &Args = + static_cast(CommandGroup.get())->MArgs; + for (auto &Arg : Args) { + if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_accessor) { + continue; + } + // Look through the graph for nodes which share this argument + for (auto NodePtr : MRoots) { + check_for_arg(Arg, NodePtr, UniqueDeps); + } } - } - // Add any deps determined from accessor arguments into the dependency list - Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); + // Add any deps determined from accessor arguments into the dependency list + Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); + } // Add any nodes specified by event dependencies into the dependency list - for (auto Dep : DepEvents) { + for (auto Dep : CommandGroup->MEvents) { if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) { Deps.push_back(NodeImpl->second); } else { @@ -200,6 +191,8 @@ std::shared_ptr graph_impl::add( } } + const std::shared_ptr &NodeImpl = + std::make_shared(CGType, std::move(CommandGroup)); if (!Deps.empty()) { for (auto N : Deps) { N->register_successor(NodeImpl, N); // register successor @@ -256,9 +249,9 @@ RT::PiExtSyncPoint exec_graph_impl::enqueue_node_direct( } RT::PiExtSyncPoint NewSyncPoint; pi_int32 Res = sycl::detail::enqueueImpCommandBufferKernel( - Ctx, DeviceImpl, CommandBuffer, Node->MNDRDesc, Node->MArgs, - nullptr /* Kernel bundle ptr */, Node->MKernel, Node->MKernelName, - Node->MOSModuleHandle, Deps, &NewSyncPoint, nullptr); + Ctx, DeviceImpl, CommandBuffer, + *static_cast((Node->MCommandGroup.get())), + Deps, &NewSyncPoint, nullptr); if (Res != pi_result::PI_SUCCESS) { throw sycl::exception(errc::invalid, @@ -271,27 +264,6 @@ RT::PiExtSyncPoint exec_graph_impl::enqueue_node_direct( RT::PiExtSyncPoint exec_graph_impl::enqueue_node( sycl::context Ctx, std::shared_ptr DeviceImpl, RT::PiExtCommandBuffer CommandBuffer, std::shared_ptr Node) { - std::unique_ptr CommandGroup; - switch (Node->MCGType) { - case sycl::detail::CG::Kernel: - CommandGroup.reset(new sycl::detail::CGExecKernel( - Node->MNDRDesc, nullptr /* Host Kernel */, Node->MKernel, - nullptr /* Kernel Bundle */, Node->MArgStorage, Node->MAccStorage, - {} /* Shared pointer storage for copies */, Node->MRequirements, - {} /* Events */, Node->MArgs, Node->MKernelName, Node->MOSModuleHandle, - Node->MStreamStorage, Node->MAuxiliaryResources, Node->MCGType, - {} /* Code Location */)); - break; - - default: - assert(false && "Node types other than kernels are not supported!"); - break; - } - - if (!CommandGroup) - throw sycl::runtime_error( - "Internal Error. Command group cannot be constructed.", - PI_ERROR_INVALID_OPERATION); // Queue which will be used for allocation operations for accessors. auto AllocaQueue = std::make_shared( @@ -305,7 +277,7 @@ RT::PiExtSyncPoint exec_graph_impl::enqueue_node( sycl::detail::EventImplPtr Event = sycl::detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), AllocaQueue, CommandBuffer, Deps); + std::move(Node->getCGCopy()), AllocaQueue, CommandBuffer, Deps); return Event->getSyncPoint(); } @@ -333,7 +305,11 @@ void exec_graph_impl::create_pi_command_buffers(sycl::device D) { // If the node is a kernel with no special requirements we can enqueue it // directly. if (type == sycl::detail::CG::Kernel && - Node->MRequirements.size() + Node->MStreamStorage.size() == 0) { + Node->MCommandGroup->MRequirements.size() + + static_cast( + Node->MCommandGroup.get()) + ->MStreams.size() == + 0) { MPiSyncPoints[Node] = enqueue_node_direct(MContext, DeviceImpl, OutCommandBuffer, Node); } else { @@ -342,8 +318,9 @@ void exec_graph_impl::create_pi_command_buffers(sycl::device D) { } // Append Node requirements to overall graph requirements - MRequirements.insert(MRequirements.end(), Node->MRequirements.begin(), - Node->MRequirements.end()); + MRequirements.insert(MRequirements.end(), + Node->MCommandGroup->MRequirements.begin(), + Node->MCommandGroup->MRequirements.end()); } Res = @@ -412,14 +389,26 @@ sycl::event exec_graph_impl::enqueue( // If the node has no requirements for accessors etc. then we skip the // scheduler and enqueue directly. if (NodeImpl->MCGType == sycl::detail::CG::Kernel && - NodeImpl->MRequirements.size() + NodeImpl->MStreamStorage.size() == 0) { + NodeImpl->MCommandGroup->MRequirements.size() + + static_cast( + NodeImpl->MCommandGroup.get()) + ->MStreams.size() == + 0) { + sycl::detail::CGExecKernel *CG = + static_cast( + NodeImpl->MCommandGroup.get()); auto NewEvent = CreateNewEvent(); RT::PiEvent *OutEvent = &NewEvent->getHandleRef(); - pi_int32 Res = sycl::detail::enqueueImpKernel( - Queue, NodeImpl->MNDRDesc, NodeImpl->MArgs, - nullptr /* TODO: Handle KernelBundles */, NodeImpl->MKernel, - NodeImpl->MKernelName, NodeImpl->MOSModuleHandle, RawEvents, OutEvent, - nullptr /* TODO: Pass mem allocation func for accessors */,PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT /* TODO: Extract from handler*/); + pi_int32 Res = + sycl:: + detail::enqueueImpKernel(Queue, CG->MNDRDesc, CG->MArgs, + nullptr /* TODO: Handle KernelBundles */, + CG->MSyclKernel, CG->MKernelName, + CG->MOSModuleHandle, RawEvents, OutEvent, + nullptr /* TODO: Pass mem allocation func + for accessors */ + , + PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT /* TODO: Extract from handler*/); if (Res != pi_result::PI_SUCCESS) { throw sycl::exception( sycl::errc::kernel, @@ -427,31 +416,17 @@ sycl::event exec_graph_impl::enqueue( } ScheduledEvents.push_back(NewEvent); } else { - auto EventImpl = sycl::detail::createCommandAndEnqueue( - NodeImpl->MCGType, Queue, NodeImpl->MNDRDesc, - nullptr /* HostKernel */, nullptr /* HostTaskPtr */, - nullptr /* InteropTask */, NodeImpl->MKernel, NodeImpl->MKernelName, - nullptr /* KernelBundle */, NodeImpl->MArgStorage, - NodeImpl->MAccStorage, NodeImpl->MLocalAccStorage, - NodeImpl->MStreamStorage, {} /* shared_ptr storage */, - NodeImpl->MAuxiliaryResources, NodeImpl->MArgs, nullptr /* SrcPtr */, - nullptr /* DstPtr */, 0 /* Length */, {} /* Pattern */, - 0 /* SrcPitch */, 0 /* DstPitch */, 0 /* Width */, 0 /* Height */, - 0 /* Offset */, false /* IsDeviceImageScoped */, - {} /* HostPipeName */, nullptr /* HostPipePtr */, - false /* HostPipeBlocking */, 0 /* HostPipeTypeSize */, - false /* HostPipeRead */, {} /* Advice */, NodeImpl->MRequirements, - {} /* Events */, {} /* Events w/ Barrier */, - NodeImpl->MOSModuleHandle, - PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT - /* KernelCacheConfig */, - {} /* CodeLoc */); + + sycl::detail::EventImplPtr EventImpl = + sycl::detail::Scheduler::getInstance().addCG( + std::move(NodeImpl->getCGCopy()), Queue); ScheduledEvents.push_back(EventImpl); } } // Create an event which has all kernel events as dependencies - auto NewEvent = std::make_shared(Queue); + sycl::detail::EventImplPtr NewEvent = + std::make_shared(Queue); NewEvent->setStateIncomplete(); NewEvent->getPreparedDepsEvents() = ScheduledEvents; #endif diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 1db52bbf43a28..d9cf3f0f69f0a 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -37,35 +37,10 @@ class node_impl { /// /// Using weak_ptr here to prevent circular references between nodes. std::vector> MPredecessors; - /// Kernel to be executed by this node. - std::shared_ptr MKernel; - /// Description of the kernel global and local sizes as well as offset. - sycl::detail::NDRDescT MNDRDesc; - /// Module handle for the kernel to be executed. - sycl::detail::OSModuleHandle MOSModuleHandle = - sycl::detail::OSUtil::ExeModuleHandle; - /// Kernel name inside the module. - std::string MKernelName; - - /// Accessor storage for node arguments. - std::vector MAccStorage; - /// Local accessor storage for node arguments. - std::vector MLocalAccStorage; - // Streams associated with the node. - std::vector> MStreamStorage; - /// The list of requirements to the node for the scheduling. - std::vector MRequirements; /// Type of the command-group for the node. sycl::detail::CG::CGTYPE MCGType = sycl::detail::CG::None; - - /// Store arg descriptors for the kernel arguments. - std::vector MArgs; - /// We need to store local copies of the values pointed to by MArgs since they - /// may go out of scope before execution. - std::vector> MArgStorage; - - /// Stores auxiliary resources used by internal operations. - std::vector> MAuxiliaryResources; + /// Command group object which stores all args etc needed to enqueue the node + std::unique_ptr MCommandGroup; /// True if an empty node, false otherwise. bool MIsEmpty = false; @@ -92,46 +67,11 @@ class node_impl { node_impl() : MIsEmpty(true) {} /// Construct a node representing a command-group. - /// @param Kernel Kernel to run when node executes. - /// @param NDRDesc NDRange description for kernel. - /// @param OSModuleHandle Module handle for the kernel to be executed. - /// @param KernelName Name of kernel. - /// @param AccStorage Accessor storage for node arguments. - /// @param LocalAccStorage Local accessor storage for node arguments. /// @param CGType Type of the command-group. - /// @param Args Kernel arguments. - /// @param AuxiliaryResources Auxiliary resources used by internal operations. - node_impl( - std::shared_ptr Kernel, - sycl::detail::NDRDescT NDRDesc, - sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName, - const std::vector &AccStorage, - const std::vector &LocalAccStorage, - sycl::detail::CG::CGTYPE CGType, - const std::vector &Args, - const std::vector> &AuxiliaryResources) - : MKernel(Kernel), MNDRDesc(NDRDesc), MOSModuleHandle(OSModuleHandle), - MKernelName(KernelName), MAccStorage(AccStorage), - MLocalAccStorage(LocalAccStorage), MRequirements(), MCGType(CGType), - MArgs(Args), MArgStorage(), MAuxiliaryResources(AuxiliaryResources) { - - // Need to copy the arg values to node local storage so that they don't go - // out of scope before execution - for (size_t i = 0; i < MArgs.size(); i++) { - auto &CurrentArg = MArgs[i]; - MArgStorage.emplace_back(CurrentArg.MSize); - auto StoragePtr = MArgStorage.back().data(); - if (CurrentArg.MPtr) - std::memcpy(StoragePtr, CurrentArg.MPtr, CurrentArg.MSize); - // Set the arg descriptor to point to the new storage - CurrentArg.MPtr = StoragePtr; - if (CurrentArg.MType == - sycl::detail::kernel_param_kind_t::kind_accessor) { - MRequirements.push_back( - static_cast(CurrentArg.MPtr)); - } - } - } + /// @param CommandGroup The CG which stores the command information for this node. + node_impl(sycl::detail::CG::CGTYPE CGType, + std::unique_ptr &&CommandGroup) + : MCGType(CGType), MCommandGroup(std::move(CommandGroup)) {} /// Recursively add nodes to execution stack. /// @param NodeImpl Node to schedule. @@ -153,7 +93,11 @@ class node_impl { /// @param Arg Argument to lookup. /// @return True if \p Arg is used in node, false otherwise. bool has_arg(const sycl::detail::ArgDesc &Arg) { - for (auto &NodeArg : MArgs) { + // TODO: Handle types other than exec kernel + assert(MCGType == sycl::detail::CG::Kernel); + const auto &Args = + static_cast(MCommandGroup.get())->MArgs; + for (auto &NodeArg : Args) { if (Arg.MType == NodeArg.MType && Arg.MSize == NodeArg.MSize) { // Args are actually void** so we need to dereference them to compare // actual values @@ -170,9 +114,80 @@ class node_impl { /// Query if this is an empty node. /// @return True if this is an empty node, false otherwise. bool is_empty() const { return MIsEmpty; } + + /// Get a deep copy of this node's command group + /// @return A unique ptr to the new command group object. + std::unique_ptr getCGCopy() const { + switch (MCGType) { + case sycl::detail::CG::Kernel: + case sycl::detail::CG::RunOnHostIntel: + return createCGCopy(); + case sycl::detail::CG::CodeplayInteropTask: + assert(false); + // TODO: Uncomment this once we implement support for interop task so we can + // test required changes to the CG class. + + // return createCGCopy(); + case sycl::detail::CG::CopyAccToPtr: + case sycl::detail::CG::CopyPtrToAcc: + case sycl::detail::CG::CopyAccToAcc: + return createCGCopy(); + case sycl::detail::CG::Fill: + return createCGCopy(); + case sycl::detail::CG::UpdateHost: + return createCGCopy(); + case sycl::detail::CG::CopyUSM: + return createCGCopy(); + case sycl::detail::CG::FillUSM: + return createCGCopy(); + case sycl::detail::CG::PrefetchUSM: + return createCGCopy(); + case sycl::detail::CG::AdviseUSM: + return createCGCopy(); + case sycl::detail::CG::Copy2DUSM: + return createCGCopy(); + case sycl::detail::CG::Fill2DUSM: + return createCGCopy(); + case sycl::detail::CG::Memset2DUSM: + return createCGCopy(); + case sycl::detail::CG::CodeplayHostTask: + assert(false); + // TODO: Uncomment this once we implement support for host task so we can + // test required changes to the CG class. + + // return createCGCopy(); + case sycl::detail::CG::Barrier: + case sycl::detail::CG::BarrierWaitlist: + return createCGCopy(); + case sycl::detail::CG::CopyToDeviceGlobal: + return createCGCopy(); + case sycl::detail::CG::CopyFromDeviceGlobal: + return createCGCopy(); + case sycl::detail::CG::ReadWriteHostPipe: + return createCGCopy(); + case sycl::detail::CG::ExecCommandBuffer: + assert(false && + "Error: Command graph submission should not be a node in a graph"); + break; + case sycl::detail::CG::None: + assert(false && + "Error: Empty nodes should not be enqueue to a command buffer"); + break; + } + return nullptr; + } + +private: + /// Creates a copy of the node's CG by casting to it's actual type, then using + /// that to copy construct and create a new unique ptr from that copy. + /// @tparam CGT The derived type of the CG. + /// @return A new unique ptr to the copied CG. + template std::unique_ptr createCGCopy() const { + return std::make_unique(*static_cast(MCommandGroup.get())); + } }; -/// Class resenting implementation details of command_graph. +/// Class representing implementation details of command_graph. class graph_impl { public: /// Constructor. @@ -191,30 +206,14 @@ class graph_impl { void remove_root(const std::shared_ptr &Root); /// Create a kernel node in the graph. - /// @param Kernel Kernel to run when node executes. - /// @param NDRDesc NDRange description for kernel. - /// @param OSModuleHandle Module handle for the kernel to be executed. - /// @param KernelName Name of kernel. - /// @param AccStorage Accessor storage for node arguments. - /// @param LocalAccStorage Local accessor storage for node arguments. /// @param CGType Type of the command-group. - /// @param Args Node arguments. - /// @param AuxiliaryResources Auxiliary resources used by internal operations. + /// @param CommandGroup The CG which stores all information for this node. /// @param Dep Dependencies of the created node. - /// @param DepEvents Dependent events of the created node. /// @return Created node in the graph. std::shared_ptr - add(std::shared_ptr Kernel, - sycl::detail::NDRDescT NDRDesc, - sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName, - const std::vector &AccStorage, - const std::vector &LocalAccStorage, - sycl::detail::CG::CGTYPE CGType, - const std::vector &Args, - const std::vector> &AuxiliaryResources, - const std::vector> &Dep = {}, - const std::vector> &DepEvents = - {}); + add(sycl::detail::CG::CGTYPE CGType, + std::unique_ptr CommandGroup, + const std::vector> &Dep = {}); /// Create a CGF node in the graph. /// @param Impl Graph implementation pointer to create a handler with. diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 63b938d63bb0c..0380edf345dca 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2236,10 +2236,7 @@ void DispatchNativeKernel(void *Blob) { pi_int32 enqueueImpCommandBufferKernel( context Ctx, DeviceImplPtr DeviceImpl, RT::PiExtCommandBuffer CommandBuffer, - NDRDescT NDRDesc, std::vector Args, - const std::shared_ptr &KernelBundleImplPtr, - const std::shared_ptr &SyclKernel, - const std::string &KernelName, const detail::OSModuleHandle &OSModuleHandle, + const CGExecKernel &CommandGroup, std::vector &SyncPoints, RT::PiExtSyncPoint *OutSyncPoint, const std::function &getMemAllocationFunc) { @@ -2249,14 +2246,15 @@ pi_int32 enqueueImpCommandBufferKernel( std::mutex *KernelMutex = nullptr; pi_program PiProgram = nullptr; - auto Kernel = SyclKernel; + auto Kernel = CommandGroup.MSyclKernel; const KernelArgMask *EliminatedArgMask; if (Kernel != nullptr) { PiKernel = Kernel->getHandleRef(); } else { std::tie(PiKernel, KernelMutex, EliminatedArgMask, PiProgram) = sycl::detail::ProgramManager::getInstance().getOrCreateKernel( - OSModuleHandle, ContextImpl, DeviceImpl, KernelName, nullptr); + CommandGroup.MOSModuleHandle, ContextImpl, DeviceImpl, + CommandGroup.MKernelName, nullptr); } auto SetFunc = [&Plugin, &PiKernel, &Ctx, &getMemAllocationFunc]( @@ -2267,12 +2265,15 @@ pi_int32 enqueueImpCommandBufferKernel( , getMemAllocationFunc, Ctx, false, Arg, NextTrueIndex); }; - + // Copy args for modification + auto Args = CommandGroup.MArgs; sycl::detail::applyFuncOnFilteredArgs(EliminatedArgMask, Args, SetFunc); // Remember this information before the range dimensions are reversed - const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); + const bool HasLocalSize = (CommandGroup.MNDRDesc.LocalSize[0] != 0); + // Copy NDRDesc for modification + auto NDRDesc = CommandGroup.MNDRDesc; // Reverse kernel dims sycl::detail::ReverseRangeDimensionsForKernel(NDRDesc); @@ -2489,24 +2490,17 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { case CG::CGTYPE::Kernel: { CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); - NDRDescT &NDRDesc = ExecKernel->MNDRDesc; - std::vector &Args = ExecKernel->MArgs; - auto getMemAllocationFunc = [this](Requirement *Req) { AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); return AllocaCmd->getMemAllocation(); }; - const std::shared_ptr &SyclKernel = - ExecKernel->MSyclKernel; - const std::string &KernelName = ExecKernel->MKernelName; - const detail::OSModuleHandle &OSModuleHandle = ExecKernel->MOSModuleHandle; - if (!Event) { // Kernel only uses assert if it's non interop one - bool KernelUsesAssert = !(SyclKernel && SyclKernel->isInterop()) && - ProgramManager::getInstance().kernelUsesAssert( - OSModuleHandle, KernelName); + bool KernelUsesAssert = + !(ExecKernel->MSyclKernel && ExecKernel->MSyclKernel->isInterop()) && + ProgramManager::getInstance().kernelUsesAssert( + ExecKernel->MOSModuleHandle, ExecKernel->MKernelName); if (KernelUsesAssert) { Event = &MEvent->getHandleRef(); } @@ -2514,8 +2508,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { RT::PiExtSyncPoint OutSyncPoint; auto result = enqueueImpCommandBufferKernel( MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer, - NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel, KernelName, - OSModuleHandle, MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc); + *ExecKernel, MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc); MEvent->setSyncPoint(OutSyncPoint); return result; } @@ -2626,15 +2619,12 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { std::vector *CopyReqs = new std::vector(HostTask->MRequirements); - // Not actually a copy, but move. Should be OK as it's not expected that - // MHostKernel will be used elsewhere. - std::unique_ptr *CopyHostKernel = - new std::unique_ptr(std::move(HostTask->MHostKernel)); + std::shared_ptr CopyHostKernel = HostTask->MHostKernel; NDRDescT *CopyNDRDesc = new NDRDescT(HostTask->MNDRDesc); ArgsBlob[0] = (void *)CopyReqs; - ArgsBlob[1] = (void *)CopyHostKernel; + ArgsBlob[1] = (void *)CopyHostKernel.get(); ArgsBlob[2] = (void *)CopyNDRDesc; void **NextArg = ArgsBlob.data() + 3; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 115d9f6a45065..d961ba4e06306 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -719,10 +719,7 @@ class KernelFusionCommand : public Command { // Enqueues a given kernel to a RT::PiExtCommandBuffer pi_int32 enqueueImpCommandBufferKernel( context Ctx, DeviceImplPtr DeviceImpl, RT::PiExtCommandBuffer CommandBuffer, - NDRDescT NDRDesc, std::vector Args, - const std::shared_ptr &KernelBundleImplPtr, - const std::shared_ptr &SyclKernel, - const std::string &KernelName, const detail::OSModuleHandle &OSModuleHandle, + const CGExecKernel &CommandGroup, std::vector &SyncPoints, RT::PiExtSyncPoint *OutSyncPoint, const std::function &getMemAllocationFunc); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 8f20c1bd3dcd5..5a92b0936ba0f 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -40,174 +40,6 @@ bool isDeviceGlobalUsedInKernel(const void *DeviceGlobalPtr) { return DGEntry && !DGEntry->MImageIdentifiers.empty(); } -std::shared_ptr createCommandAndEnqueue( - CG::CGTYPE Type, std::shared_ptr Queue, - NDRDescT NDRDesc, std::unique_ptr HostKernel, - std::unique_ptr HostTaskPtr, - std::unique_ptr InteropTask, - std::shared_ptr Kernel, std::string KernelName, - KernelBundleImplPtr KernelBundle, - std::vector> ArgsStorage, - std::vector AccStorage, - std::vector LocalAccStorage, - std::vector> StreamStorage, - std::vector> SharedPtrStorage, - std::vector> AuxiliaryResources, - std::vector Args, void *SrcPtr, void *DstPtr, - size_t Length, std::vector Pattern, size_t SrcPitch, size_t DstPitch, - size_t Width, size_t Height, size_t Offset, bool IsDeviceImageScoped, - const std::string &HostPipeName, void *HostPipePtr, bool HostPipeBlocking, - size_t HostPipeTypeSize, bool HostPipeRead, pi_mem_advice Advice, - std::vector Requirements, - std::vector Events, - std::vector EventsWaitWithBarrier, - detail::OSModuleHandle OSModHandle, - detail::RT::PiKernelCacheConfig KernelCacheConfig, - detail::code_location CodeLoc) { - std::unique_ptr CommandGroup; - switch (Type) { - case detail::CG::Kernel: - case detail::CG::RunOnHostIntel: { - // Copy kernel name here instead of move so that it's available after - // running of this method by reductions implementation. This allows for - // assert feature to check if kernel uses assertions - CommandGroup.reset(new detail::CGExecKernel( - std::move(NDRDesc), std::move(HostKernel), std::move(Kernel), - std::move(KernelBundle), std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), - std::move(Args), KernelName, OSModHandle, std::move(StreamStorage), - std::move(AuxiliaryResources), Type, KernelCacheConfig, CodeLoc)); - break; - } - case detail::CG::CodeplayInteropTask: - CommandGroup.reset(new detail::CGInteropTask( - std::move(InteropTask), std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), - Type, CodeLoc)); - break; - case detail::CG::CopyAccToPtr: - case detail::CG::CopyPtrToAcc: - case detail::CG::CopyAccToAcc: - CommandGroup.reset(new detail::CGCopy( - Type, SrcPtr, DstPtr, std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), - CodeLoc)); - break; - case detail::CG::Fill: - CommandGroup.reset(new detail::CGFill( - std::move(Pattern), DstPtr, std::move(ArgsStorage), - std::move(AccStorage), std::move(SharedPtrStorage), - std::move(Requirements), std::move(Events), CodeLoc)); - break; - case detail::CG::UpdateHost: - CommandGroup.reset(new detail::CGUpdateHost( - DstPtr, std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), - CodeLoc)); - break; - case detail::CG::CopyUSM: - CommandGroup.reset(new detail::CGCopyUSM( - SrcPtr, DstPtr, Length, std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), - CodeLoc)); - break; - case detail::CG::FillUSM: - CommandGroup.reset(new detail::CGFillUSM( - std::move(Pattern), DstPtr, Length, std::move(ArgsStorage), - std::move(AccStorage), std::move(SharedPtrStorage), - std::move(Requirements), std::move(Events), CodeLoc)); - break; - case detail::CG::PrefetchUSM: - CommandGroup.reset(new detail::CGPrefetchUSM( - DstPtr, Length, std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), - CodeLoc)); - break; - case detail::CG::AdviseUSM: - CommandGroup.reset(new detail::CGAdviseUSM( - DstPtr, Length, Advice, std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), - Type, CodeLoc)); - break; - case detail::CG::Copy2DUSM: - CommandGroup.reset(new detail::CGCopy2DUSM( - SrcPtr, DstPtr, SrcPitch, DstPitch, Width, - Height, std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events), CodeLoc)); - break; - case detail::CG::Fill2DUSM: - CommandGroup.reset(new detail::CGFill2DUSM( - std::move(Pattern), DstPtr, DstPitch, Width, - Height, std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events), CodeLoc)); - break; - case detail::CG::Memset2DUSM: - CommandGroup.reset(new detail::CGMemset2DUSM( - Pattern[0], DstPtr, DstPitch, Width, Height, - std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events), CodeLoc)); - break; - case detail::CG::CodeplayHostTask: - CommandGroup.reset(new detail::CGHostTask( - std::move(HostTaskPtr), Queue, Queue->getContextImplPtr(), - std::move(Args), std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), - Type, CodeLoc)); - break; - case detail::CG::Barrier: - case detail::CG::BarrierWaitlist: - CommandGroup.reset(new detail::CGBarrier( - std::move(EventsWaitWithBarrier), std::move(ArgsStorage), - std::move(AccStorage), std::move(SharedPtrStorage), - std::move(Requirements), std::move(Events), Type, CodeLoc)); - break; - case detail::CG::CopyToDeviceGlobal: { - CommandGroup.reset(new detail::CGCopyToDeviceGlobal( - SrcPtr, DstPtr, IsDeviceImageScoped, Length, Offset, - std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events), OSModHandle, CodeLoc)); - break; - } - case detail::CG::CopyFromDeviceGlobal: { - CommandGroup.reset(new detail::CGCopyFromDeviceGlobal( - SrcPtr, DstPtr, IsDeviceImageScoped, Length, Offset, - std::move(ArgsStorage), std::move(AccStorage), - std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events), OSModHandle, CodeLoc)); - break; - } - case detail::CG::ReadWriteHostPipe: { - CommandGroup.reset(new detail::CGReadWriteHostPipe( - HostPipeName, HostPipeBlocking, HostPipePtr, - HostPipeTypeSize, HostPipeRead, std::move(ArgsStorage), - std::move(AccStorage), std::move(SharedPtrStorage), - std::move(Requirements), std::move(Events), CodeLoc)); - break; - } - case detail::CG::ExecCommandBuffer: - assert(false && "Error: Command graph submission should not be finalized"); - break; - case detail::CG::None: - if (detail::pi::trace(detail::pi::TraceLevel::PI_TRACE_ALL)) { - std::cout << "WARNING: An empty command group is submitted." << std::endl; - } - return std::make_shared(); - } - - if (!CommandGroup) - throw sycl::runtime_error( - "Internal Error. Command group cannot be constructed.", - PI_ERROR_INVALID_OPERATION); - - detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG( - std::move(CommandGroup), std::move(Queue)); - - return Event; -} } // namespace detail handler::handler(std::shared_ptr Queue, bool IsHost) @@ -278,29 +110,16 @@ event handler::finalize() { if (MIsFinalized) return MLastEvent; MIsFinalized = true; - // If the queue has a graph impl we are in recording mode - if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl) { - auto EventImpl = std::make_shared(); - // If we have a subgraph node that means that a subgraph was recorded as - // part of this queue submission, so we skip adding a new node here since - // they have already been added, and return the event associated with the - // subgraph node. - if (MSubgraphNode) { + // If we have a subgraph node that means that a subgraph was recorded as + // part of this queue submission, so we skip adding a new node here since + // they have already been added, and return the event associated with the + // subgraph node. + if (MQueue && MQueue->getCommandGraph() && MSubgraphNode) { + { return detail::createSyclObjFromImpl( - GraphImpl->get_event_for_node(MSubgraphNode)); + MQueue->getCommandGraph()->get_event_for_node(MSubgraphNode)); } - // Extract relevant data from the handler and pass to graph to create a new - // node representing this command group. - auto NodeImpl = - GraphImpl->add(MKernel, MNDRDesc, MOSModuleHandle, MKernelName, - MAccStorage, MLocalAccStorage, MCGType, MArgs, - MImpl->MAuxiliaryResources, {}, MEvents); - - // Create and associated an event with this new node - GraphImpl->add_event_for_node(EventImpl, NodeImpl); - - return detail::createSyclObjFromImpl(EventImpl); } // According to 4.7.6.9 of SYCL2020 spec, if a placeholder accessor is passed @@ -376,7 +195,8 @@ event handler::finalize() { } } - if (!MQueue->is_in_fusion_mode() && + if (MQueue && !MQueue->getCommandGraph() && !MGraph && !MSubgraphNode && + !MQueue->is_in_fusion_mode() && MRequirements.size() + MEvents.size() + MStreamStorage.size() == 0) { // if user does not add a new dependency to the dependency graph, i.e. // the graph is not changed, and the queue is not in fusion mode, then @@ -448,22 +268,185 @@ event handler::finalize() { return MLastEvent; } } - detail::EventImplPtr EventImpl = createCommandAndEnqueue( - type, MQueue, MNDRDesc, std::move(MHostKernel), std::move(MHostTask), - std::move(MInteropTask), std::move(MKernel), MKernelName, - std::move(MImpl->MKernelBundle), std::move(MArgsStorage), - std::move(MAccStorage), std::move(MLocalAccStorage), - std::move(MStreamStorage), std::move(MSharedPtrStorage), - std::move(MImpl->MAuxiliaryResources), std::move(MArgs), MSrcPtr, MDstPtr, - MLength, std::move(MPattern), MImpl->MSrcPitch, MImpl->MDstPitch, - MImpl->MWidth, MImpl->MHeight, MImpl->MOffset, - MImpl->MIsDeviceImageScoped, MImpl->HostPipeName, MImpl->HostPipePtr, - MImpl->HostPipeBlocking, MImpl->HostPipeTypeSize, MImpl->HostPipeRead, - MImpl->MAdvice, std::move(MRequirements), std::move(MEvents), - std::move(MEventsWaitWithBarrier), MOSModuleHandle, - MImpl->MKernelCacheConfig, MCodeLoc); - - MLastEvent = detail::createSyclObjFromImpl(EventImpl); + + std::unique_ptr CommandGroup; + switch (type) { + case detail::CG::Kernel: + case detail::CG::RunOnHostIntel: { + // Copy kernel name here instead of move so that it's available after + // running of this method by reductions implementation. This allows for + // assert feature to check if kernel uses assertions + CommandGroup.reset(new detail::CGExecKernel( + std::move(MNDRDesc), std::move(MHostKernel), std::move(MKernel), + std::move(MImpl->MKernelBundle), std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), std::move(MArgs), + MKernelName, MOSModuleHandle, std::move(MStreamStorage), + std::move(MImpl->MAuxiliaryResources), MCGType, + MImpl->MKernelCacheConfig, MCodeLoc)); + break; + } + case detail::CG::CodeplayInteropTask: + CommandGroup.reset(new detail::CGInteropTask( + std::move(MInteropTask), std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc)); + break; + case detail::CG::CopyAccToPtr: + case detail::CG::CopyPtrToAcc: + case detail::CG::CopyAccToAcc: + CommandGroup.reset(new detail::CGCopy( + MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), MCodeLoc)); + break; + case detail::CG::Fill: + CommandGroup.reset(new detail::CGFill( + std::move(MPattern), MDstPtr, std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), MCodeLoc)); + break; + case detail::CG::UpdateHost: + CommandGroup.reset(new detail::CGUpdateHost( + MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements), + std::move(MEvents), MCodeLoc)); + break; + case detail::CG::CopyUSM: + CommandGroup.reset(new detail::CGCopyUSM( + MSrcPtr, MDstPtr, MLength, std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), MCodeLoc)); + break; + case detail::CG::FillUSM: + CommandGroup.reset(new detail::CGFillUSM( + std::move(MPattern), MDstPtr, MLength, std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), MCodeLoc)); + break; + case detail::CG::PrefetchUSM: + CommandGroup.reset(new detail::CGPrefetchUSM( + MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements), + std::move(MEvents), MCodeLoc)); + break; + case detail::CG::AdviseUSM: + CommandGroup.reset(new detail::CGAdviseUSM( + MDstPtr, MLength, MImpl->MAdvice, std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc)); + break; + case detail::CG::Copy2DUSM: + CommandGroup.reset(new detail::CGCopy2DUSM( + MSrcPtr, MDstPtr, MImpl->MSrcPitch, MImpl->MDstPitch, MImpl->MWidth, + MImpl->MHeight, std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements), + std::move(MEvents), MCodeLoc)); + break; + case detail::CG::Fill2DUSM: + CommandGroup.reset(new detail::CGFill2DUSM( + std::move(MPattern), MDstPtr, MImpl->MDstPitch, MImpl->MWidth, + MImpl->MHeight, std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements), + std::move(MEvents), MCodeLoc)); + break; + case detail::CG::Memset2DUSM: + CommandGroup.reset(new detail::CGMemset2DUSM( + MPattern[0], MDstPtr, MImpl->MDstPitch, MImpl->MWidth, MImpl->MHeight, + std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements), + std::move(MEvents), MCodeLoc)); + break; + case detail::CG::CodeplayHostTask: + CommandGroup.reset(new detail::CGHostTask( + std::move(MHostTask), MQueue, MQueue->getContextImplPtr(), + std::move(MArgs), std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements), + std::move(MEvents), MCGType, MCodeLoc)); + break; + case detail::CG::Barrier: + case detail::CG::BarrierWaitlist: + CommandGroup.reset(new detail::CGBarrier( + std::move(MEventsWaitWithBarrier), std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc)); + break; + case detail::CG::CopyToDeviceGlobal: { + CommandGroup.reset(new detail::CGCopyToDeviceGlobal( + MSrcPtr, MDstPtr, MImpl->MIsDeviceImageScoped, MLength, MImpl->MOffset, + std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements), + std::move(MEvents), MOSModuleHandle, MCodeLoc)); + break; + } + case detail::CG::CopyFromDeviceGlobal: { + CommandGroup.reset(new detail::CGCopyFromDeviceGlobal( + MSrcPtr, MDstPtr, MImpl->MIsDeviceImageScoped, MLength, MImpl->MOffset, + std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements), + std::move(MEvents), MOSModuleHandle, MCodeLoc)); + break; + } + case detail::CG::ReadWriteHostPipe: { + CommandGroup.reset(new detail::CGReadWriteHostPipe( + MImpl->HostPipeName, MImpl->HostPipeBlocking, MImpl->HostPipePtr, + MImpl->HostPipeTypeSize, MImpl->HostPipeRead, std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), MCodeLoc)); + break; + } + case detail::CG::ExecCommandBuffer: + // If we have a subgraph node we don't want to actually execute this command + // graph submission. + if (!MSubgraphNode) { + event GraphCompletionEvent = MExecGraph->exec(MQueue); + MLastEvent = GraphCompletionEvent; + return MLastEvent; + } + break; + case detail::CG::None: + if (detail::pi::trace(detail::pi::TraceLevel::PI_TRACE_ALL)) { + std::cout << "WARNING: An empty command group is submitted." << std::endl; + } + detail::EventImplPtr Event = std::make_shared(); + MLastEvent = detail::createSyclObjFromImpl(Event); + return MLastEvent; + } + + if (!MSubgraphNode && !CommandGroup) + throw sycl::runtime_error( + "Internal Error. Command group cannot be constructed.", + PI_ERROR_INVALID_OPERATION); + + // If there is a graph associated with the handler we are in the explicit + // graph mode, so we store the CG instead of submitting it to the scheduler, + // so it can be retrieved by the graph later. + if (MGraph) { + MGraphNodeCG = std::move(CommandGroup); + return detail::createSyclObjFromImpl( + std::make_shared()); + } + + // If the queue has an associated graph then we need to take the CG and pass + // it to the graph to create a node, rather than submit it to the scheduler. + if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl) { + auto EventImpl = std::make_shared(); + + // Extract relevant data from the handler and pass to graph to create a + // new node representing this command group. + std::shared_ptr NodeImpl = + GraphImpl->add(MCGType, std::move(CommandGroup)); + + // Associate an event with this new node and return the event. + GraphImpl->add_event_for_node(EventImpl, NodeImpl); + + return detail::createSyclObjFromImpl(EventImpl); + } + + detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), std::move(MQueue)); + + MLastEvent = detail::createSyclObjFromImpl(Event); return MLastEvent; } @@ -1040,6 +1023,7 @@ void handler::ext_oneapi_graph( ext::oneapi::experimental::command_graph< ext::oneapi::experimental::graph_state::executable> Graph) { + MCGType = detail::CG::ExecCommandBuffer; auto GraphImpl = detail::getSyclObjImpl(Graph); std::shared_ptr ParentGraph; if (MQueue) { @@ -1057,9 +1041,8 @@ void handler::ext_oneapi_graph( auto SubgraphEvent = std::make_shared(); ParentGraph->add_event_for_node(SubgraphEvent, MSubgraphNode); } else { - auto GraphCompletionEvent = GraphImpl->exec(MQueue); - auto EventImpl = detail::getSyclObjImpl(GraphCompletionEvent); - MLastEvent = GraphCompletionEvent; + // Set the exec graph for execution during finalize. + MExecGraph = GraphImpl; } }