diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index 8dbe84d10e209..fe7868204cf30 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -43,6 +43,7 @@ class interop_handler { template friend class accessor; + public: using QueueImplPtr = std::shared_ptr; using ReqToMem = std::pair; @@ -307,8 +308,7 @@ class HostKernel : public HostKernelBase { template typename std::enable_if>::value>::type runOnHost(const NDRDescT &NDRDesc) { - sycl::range GroupSize( - InitializedVal::template get<0>()); + sycl::range GroupSize(InitializedVal::template get<0>()); for (int I = 0; I < Dims; ++I) { if (NDRDesc.LocalSize[I] == 0 || NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0) @@ -317,8 +317,7 @@ class HostKernel : public HostKernelBase { GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; } - sycl::range LocalSize( - InitializedVal::template get<0>()); + sycl::range LocalSize(InitializedVal::template get<0>()); sycl::range GlobalSize( InitializedVal::template get<0>()); sycl::id GlobalOffset; @@ -359,10 +358,9 @@ class HostKernel : public HostKernelBase { NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; } - sycl::range LocalSize( - InitializedVal::template get<0>()); + sycl::range LocalSize(InitializedVal::template get<0>()); sycl::range GlobalSize( - InitializedVal::template get<0>()); + InitializedVal::template get<0>()); for (int I = 0; I < Dims; ++I) { LocalSize[I] = NDRDesc.LocalSize[I]; GlobalSize[I] = NDRDesc.GlobalSize[I]; @@ -378,10 +376,10 @@ class HostKernel : public HostKernelBase { }; class stream_impl; -// The base class for all types of command groups. +/// Base class for all types of command groups. class CG { public: - // Type of the command group. + /// Type of the command group. 
enum CGTYPE { NONE, KERNEL, @@ -425,20 +423,20 @@ class CG { private: CGTYPE MType; - // The following storages needed to ensure that arguments won't die while + // The following storages are needed to ensure that arguments won't die while // we are using them. - // Storage for standard layout arguments. + /// Storage for standard layout arguments. vector_class> MArgsStorage; - // Storage for accessors. + /// Storage for accessors. vector_class MAccStorage; - // Storage for shared_ptrs. + /// Storage for shared_ptrs. vector_class> MSharedPtrStorage; public: - // List of requirements that specify which memory is needed for the command - // group to be executed. + /// List of requirements that specify which memory is needed for the command + /// group to be executed. vector_class MRequirements; - // List of events that order the execution of this CG + /// List of events that order the execution of this CG vector_class MEvents; // Member variables to capture the user code-location // information from Q.submit(), Q.parallel_for() etc @@ -448,9 +446,10 @@ class CG { int32_t MLine, MColumn; }; -// The class which represents "execute kernel" command group. +/// "Execute kernel" command group class. class CGExecKernel : public CG { public: + /// Stores ND-range description. NDRDescT MNDRDesc; unique_ptr_class MHostKernel; shared_ptr_class MSyclKernel; @@ -488,7 +487,7 @@ class CGExecKernel : public CG { } }; -// The class which represents "copy" command group. +/// "Copy memory" command group class. class CGCopy : public CG { void *MSrc; void *MDst; @@ -509,7 +508,7 @@ class CGCopy : public CG { void *getDst() { return MDst; } }; -// The class which represents "fill" command group. +/// "Fill memory" command group class. class CGFill : public CG { public: vector_class MPattern; @@ -529,7 +528,7 @@ class CGFill : public CG { Requirement *getReqToFill() { return MPtr; } }; -// The class which represents "update host" command group. +/// "Update host" command group class. 
class CGUpdateHost : public CG { Requirement *MPtr; @@ -548,7 +547,7 @@ class CGUpdateHost : public CG { Requirement *getReqToUpdate() { return MPtr; } }; -// The class which represents "copy" command group for USM pointers. +/// "Copy USM" command group class. class CGCopyUSM : public CG { void *MSrc; void *MDst; @@ -572,7 +571,7 @@ class CGCopyUSM : public CG { size_t getLength() { return MLength; } }; -// The class which represents "fill" command group for USM pointers. +/// "Fill USM" command group class. class CGFillUSM : public CG { vector_class MPattern; void *MDst; @@ -595,7 +594,7 @@ class CGFillUSM : public CG { int getFill() { return MPattern[0]; } }; -// The class which represents "prefetch" command group for USM pointers. +/// "Prefetch USM" command group class. class CGPrefetchUSM : public CG { void *MDst; size_t MLength; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8cbe2ccf8ff98..80feb92ce9560 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -38,7 +38,7 @@ class ReleaseCommand; enum BlockingT { NON_BLOCKING = 0, BLOCKING }; -// The struct represents the result of command enqueueing +/// Result of command enqueueing. struct EnqueueResultT { enum ResultT { SyclEnqueueReady, @@ -49,15 +49,15 @@ struct EnqueueResultT { EnqueueResultT(ResultT Result = SyclEnqueueSuccess, Command *Cmd = nullptr, cl_int ErrCode = CL_SUCCESS) : MResult(Result), MCmd(Cmd), MErrCode(ErrCode) {} - // Indicates result of enqueueing + /// Indicates the result of enqueueing. ResultT MResult; - // Pointer to the command failed to enqueue + /// Pointer to the command which failed to enqueue. Command *MCmd; - // Error code which is set when enqueueing fails + /// Error code which is set when enqueueing fails. cl_int MErrCode; }; -// DepDesc represents dependency between two commands +/// Dependency between two commands. 
struct DepDesc { DepDesc(Command *DepCommand, const Requirement *Req, AllocaCommandBase *AllocaCmd) @@ -68,20 +68,22 @@ struct DepDesc { std::tie(Rhs.MDepRequirement, Rhs.MDepCommand); } - // The actual dependency command. + /// The actual dependency command. Command *MDepCommand = nullptr; - // Requirement for the dependency. + /// Requirement for the dependency. const Requirement *MDepRequirement = nullptr; - // Allocation command for the memory object we have requirement for. - // Used to simplify searching for memory handle. + /// Allocation command for the memory object we have requirement for. + /// Used to simplify searching for memory handle. AllocaCommandBase *MAllocaCmd = nullptr; }; -// The Command represents some action that needs to be performed on one or -// more memory objects. The command has vector of Depdesc objects that -// represent dependencies of the command. It has vector of pointer to commands -// that depend on the command. It has pointer to sycl::queue object. And has -// event that is associated with the command. +/// The Command class represents some action that needs to be performed on one +/// or more memory objects. The Command has a vector of DepDesc objects that +/// represent dependencies of the command. It has a vector of pointers to +/// commands that depend on the command. It has a pointer to a \ref queue object +/// and an event that is associated with the command. +/// +/// \ingroup sycl_graph class Command { public: enum CommandType { @@ -104,13 +106,15 @@ class Command { void addUser(Command *NewUser) { MUsers.insert(NewUser); } - // Return type of the command, e.g. Allocate, MemoryCopy. + /// \return type of the command, e.g. Allocate, MemoryCopy. CommandType getType() const { return MType; } - // The method checks if the command is enqueued, waits for it to be - // unblocked if "Blocking" argument is true, then calls enqueueImp. Returns - // true if the command is enqueued. 
Sets EnqueueResult to the specific - // status otherwise. + /// Checks if the command is enqueued, and calls enqueueImp. + /// + /// \param EnqueueResult is set to the specific status if enqueue failed. + /// \param Blocking if this argument is true, function will wait for the + /// command to be unblocked before calling enqueueImp. + /// \return true if the command is enqueued. bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking); bool isFinished(); @@ -124,34 +128,33 @@ class Command { std::shared_ptr getEvent() const { return MEvent; } // Methods needed to support SYCL instrumentation - // - // Proxy method which calls emitInstrumentationData. + + /// Proxy method which calls emitInstrumentationData. void emitInstrumentationDataProxy(); - // Instrumentation method which emits telemetry data. + /// Instrumentation method which emits telemetry data. virtual void emitInstrumentationData() = 0; - // This function looks at all the dependencies for - // the release command and enables instrumentation - // to report these dependencies as edges + /// Looks at all the dependencies for the release command and enables + /// instrumentation to report these dependencies as edges. void resolveReleaseDependencies(std::set &list); - // Creates an edge event when the dependency is a command + /// Creates an edge event when the dependency is a command. void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, const string_class &Prefix, bool IsCommand); - // Creates an edge event when the dependency is an event + /// Creates an edge event when the dependency is an event. void emitEdgeEventForEventDependence(Command *Cmd, RT::PiEvent &EventAddr); - // Creates a signal event with the enqueued kernel event handle + /// Creates a signal event with the enqueued kernel event handle. 
void emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr); /// Create a trace event of node_create type; this must be guarded by a - /// check for xptiTraceEnabled() - /// Post Condition: MTraceEvent will be set to the event created - /// @param MAddress The address to use to create the payload + /// check for xptiTraceEnabled(). + /// Post Condition: MTraceEvent will be set to the event created. + /// \param MAddress The address to use to create the payload. uint64_t makeTraceEventProlog(void *MAddress); - // If prolog has been run, run epilog; this must be guarded by a check for - // xptiTraceEnabled() + /// If prolog has been run, run epilog; this must be guarded by a check for + /// xptiTraceEnabled(). void makeTraceEventEpilog(); - // Emits an event of Type + /// Emits an event of Type. void emitInstrumentation(uint16_t Type, const char *Txt = nullptr); - // + // End Methods needed to support SYCL instrumentation virtual void printDot(std::ostream &Stream) const = 0; @@ -172,58 +175,59 @@ class Command { RT::PiEvent &Event); std::vector prepareEvents(ContextImplPtr Context); - // Private interface. Derived classes should implement this method. + /// Private interface. Derived classes should implement this method. virtual cl_int enqueueImp() = 0; - // The type of the command + /// The type of the command. CommandType MType; - // Mutex used to protect enqueueing from race conditions + /// Mutex used to protect enqueueing from race conditions std::mutex MEnqueueMtx; public: - // Contains list of dependencies(edges) + /// Contains list of dependencies(edges) std::vector MDeps; - // Contains list of commands that depend on the command + /// Contains list of commands that depend on the command. std::unordered_set MUsers; - // Indicates whether the command can be blocked from enqueueing + /// Indicates whether the command can be blocked from enqueueing. 
bool MIsBlockable = false; - // Counts the number of memory objects this command is a leaf for + /// Counts the number of memory objects this command is a leaf for. unsigned MLeafCounter = 0; const char *MBlockReason = "Unknown"; - // Describes the status of a command + /// Describes the status of the command. std::atomic MEnqueueStatus; // All member variable defined here are needed for the SYCL instrumentation // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION // to ensure we have the same object layout when the macro in the library and // SYCL app are not the same. - // - // The event for node_create and task_begin + + /// The event for node_create and task_begin. void *MTraceEvent = nullptr; - // The stream under which the traces are emitted; stream ids are - // positive integers and we set it to an invalid value + /// The stream under which the traces are emitted. + /// + /// Stream ids are positive integers and we set it to an invalid value. int32_t MStreamID = -1; - // Reserved for storing the object address such as SPIRV or memory object - // address + /// Reserved for storing the object address such as SPIRV or memory object + /// address. void *MAddress = nullptr; - // Buffer to build the address string + /// Buffer to build the address string. string_class MAddressString; - // Buffer to build the command node type + /// Buffer to build the command node type. string_class MCommandNodeType; - // Buffer to build the command end-user understandable name + /// Buffer to build the command end-user understandable name. string_class MCommandName; - // Flag to indicate if makeTraceEventProlog() has been run + /// Flag to indicate if makeTraceEventProlog() has been run. bool MTraceEventPrologComplete = false; - // Flag to indicate if this is the first time we are seeing this payload + /// Flag to indicate if this is the first time we are seeing this payload. 
bool MFirstInstance = false; - // Instance ID tracked for the command + /// Instance ID tracked for the command. uint64_t MInstanceID = 0; }; -// The command does nothing during enqueue. The task can be used to implement -// lock in the graph, or to merge several nodes into one. +/// The empty command does nothing during enqueue. The task can be used to +/// implement lock in the graph, or to merge several nodes into one. class EmptyCommand : public Command { public: EmptyCommand(QueueImplPtr Queue, Requirement Req); @@ -239,8 +243,8 @@ class EmptyCommand : public Command { Requirement MRequirement; }; -// The command enqueues release instance of memory allocated on Host or -// underlying framework. +/// The release command enqueues release of a memory object instance allocated +/// on Host or underlying framework. class ReleaseCommand : public Command { public: ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd); @@ -251,10 +255,11 @@ class ReleaseCommand : public Command { private: cl_int enqueueImp() final; - // Command which allocates memory release command should dealocate + /// Command which allocates memory release command should dealocate. AllocaCommandBase *MAllocaCmd = nullptr; }; +/// Base class for memory allocation commands. class AllocaCommandBase : public Command { public: AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, @@ -272,17 +277,17 @@ class AllocaCommandBase : public Command { void *MMemAllocation = nullptr; - // Alloca command linked with current command. - // Device and host alloca commands can be linked, so they may share the same - // memory. Only one allocation from a pair can be accessed at a time. Alloca - // commands associated with such allocation is "active". In order to switch - // "active" status between alloca commands map/unmap operations are used. + /// Alloca command linked with current command. + /// Device and host alloca commands can be linked, so they may share the same + /// memory. 
Only one allocation from a pair can be accessed at a time. Alloca + /// commands associated with such allocation is "active". In order to switch + /// "active" status between alloca commands map/unmap operations are used. AllocaCommandBase *MLinkedAllocaCmd = nullptr; - // Indicates that current alloca is active one. + /// Indicates that current alloca is active one. bool MIsActive = true; - // Indicates that the command owns memory allocation in case of connected - // alloca command + /// Indicates that the command owns memory allocation in case of connected + /// alloca command. bool MIsLeaderAlloca = true; protected: @@ -290,8 +295,8 @@ class AllocaCommandBase : public Command { ReleaseCommand MReleaseCmd; }; -// The command enqueues allocation of instance of memory object on Host or -// underlying framework. +/// The alloca command enqueues allocation of instance of memory object on Host +/// or underlying framework. class AllocaCommand : public AllocaCommandBase { public: AllocaCommand(QueueImplPtr Queue, Requirement Req, @@ -305,11 +310,12 @@ class AllocaCommand : public AllocaCommandBase { private: cl_int enqueueImp() final; - // The flag indicates that alloca should try to reuse pointer provided by - // the user during memory object construction + /// The flag indicates that alloca should try to reuse pointer provided by + /// the user during memory object construction. bool MInitFromUserData = false; }; +/// The AllocaSubBuf command enqueues creation of sub-buffer of memory object. class AllocaSubBufCommand : public AllocaCommandBase { public: AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, @@ -326,6 +332,7 @@ class AllocaSubBufCommand : public AllocaCommandBase { AllocaCommandBase *MParentAlloca = nullptr; }; +/// The map command enqueues mapping of device memory onto host memory. 
class MapMemObject : public Command { public: MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, @@ -344,6 +351,7 @@ class MapMemObject : public Command { access::mode MMapMode; }; +/// The unmap command removes mapping of host memory onto device memory. class UnMapMemObject : public Command { public: UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, @@ -361,7 +369,8 @@ class UnMapMemObject : public Command { void **MSrcPtr = nullptr; }; -// The command enqueues memory copy between two instances of memory object. +/// The mem copy command enqueues memory copy between two instances of memory +/// object. class MemCpyCommand : public Command { public: MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, @@ -382,7 +391,8 @@ class MemCpyCommand : public Command { AllocaCommandBase *MDstAllocaCmd = nullptr; }; -// The command enqueues memory copy between two instances of memory object. +/// The mem copy host command enqueues memory copy between two instances of +/// memory object. class MemCpyCommandHost : public Command { public: MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, @@ -403,7 +413,8 @@ class MemCpyCommandHost : public Command { void **MDstPtr = nullptr; }; -// The command enqueues execution of kernel or explicit memory operation. +/// The exec CG command enqueues execution of kernel or explicit memory +/// operation. class ExecCGCommand : public Command { public: ExecCGCommand(std::unique_ptr CommandGroup, QueueImplPtr Queue); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index c6934cebf9e00..4d749d92e9952 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -27,9 +27,10 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { -// The function checks whether two requirements overlaps or not. 
This -// information can be used to prove that executing two kernels that -// work on different parts of the memory object in parallel is legal. +/// Checks whether two requirements overlap or not. +/// +/// This information can be used to prove that executing two kernels that +/// work on different parts of the memory object in parallel is legal. static bool doOverlap(const Requirement *LHS, const Requirement *RHS) { return (LHS->MOffsetInBytes + LHS->MAccessRange.size() * LHS->MElemSize >= RHS->MOffsetInBytes) || @@ -43,12 +44,12 @@ static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { return LHS == RHS || (LHS->is_host() && RHS->is_host()); } -// The function checks if current requirement is requirement for sub buffer +/// Checks if current requirement is requirement for sub buffer. static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } -// Checks if the required access mode is allowed under the current one +/// Checks if the required access mode is allowed under the current one. static bool isAccessModeAllowed(access::mode Required, access::mode Current) { switch (Current) { case access::mode::read: @@ -104,7 +105,6 @@ static void printDotRecursive(std::fstream &Stream, void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) { static size_t Counter = 0; - std::string ModeNameStr(ModeName); std::string FileName = "graph_" + std::to_string(Counter) + ModeNameStr + ".dot"; @@ -123,13 +123,10 @@ void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) { Stream << "}" << std::endl; } -// Returns record for the memory objects passed, nullptr if doesn't exist. MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) { return MemObject->MRecord.get(); } -// Returns record for the memory object requirement refers to, if doesn't -// exist, creates new one. 
MemObjRecord * Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue, Requirement *Req) { @@ -147,7 +144,6 @@ Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue, return MemObject->MRecord.get(); } -// Helper function which removes all values in Cmds from Leaves void Scheduler::GraphBuilder::updateLeaves(const std::set &Cmds, MemObjRecord *Record, access::mode AccessMode) { @@ -398,7 +394,7 @@ Command *Scheduler::GraphBuilder::addCopyBack(Requirement *Req) { // The function implements SYCL host accessor logic: host accessor // should provide access to the buffer in user space. Command *Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, - const bool destructor) { + const bool destructor) { const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue(); @@ -450,15 +446,14 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost( return insertMemoryMove(Record, Req, HostQueue); } -// The functions finds dependencies for the requirement. It starts searching -// from list of "leaf" commands for the record and check if the examining -// command can be executed in parallel with new one with regard to the memory -// object. If can, then continue searching through dependencies of that -// command. There are several rules used: -// -// 1. New and examined commands only read -> can bypass -// 2. New and examined commands has non-overlapping requirements -> can bypass -// 3. New and examined commands has different contexts -> cannot bypass +/// Start the search for the record from list of "leaf" commands and check if +/// the examined command can be executed in parallel with the new one with +/// regard to the memory object. If it can, then continue searching through +/// dependencies of that command. There are several rules used: +/// +/// 1. New and examined commands only read -> can bypass +/// 2. New and examined commands has non-overlapping requirements -> can bypass +/// 3. 
New and examined commands have different contexts -> cannot bypass std::set Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context) { @@ -607,8 +602,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // To ensure that the leader allocation is removed first AllocaCmd->getReleaseCmd()->addDep( - DepDesc(LinkedAllocaCmd->getReleaseCmd(), AllocaCmd->getRequirement(), - LinkedAllocaCmd)); + DepDesc(LinkedAllocaCmd->getReleaseCmd(), + AllocaCmd->getRequirement(), LinkedAllocaCmd)); // Device allocation takes ownership of the host ptr during // construction, host allocation doesn't. So, device allocation should diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 19c79ea5e4e18..5affe5bde4977 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -19,6 +19,152 @@ #include #include +/// \defgroup sycl_graph DPC++ Execution Graph +/// +/// SYCL, unlike OpenCL, provides a programming model in which the user doesn't +/// need to manage dependencies between kernels and memory explicitly. The DPC++ +/// Runtime must ensure correct execution with respect to the order commands are +/// submitted. +/// +/// This document describes the part of the DPC++ Runtime that is responsible +/// for building and processing dependency graph. +/// +/// ## A couple of words about DPC++ and SYCL execution and memory model +/// +/// The SYCL framework defines command group (\ref CG) as an entity that +/// represents minimal execution block. The command group is submitted to SYCL +/// queue and consists of a kernel or an explicit memory operation, and their +/// requirements. The SYCL queue defines the device and context using which the +/// kernel should be executed. +/// +/// The commands that contain explicit memory operations include copy, fill, +/// update_host and other operations. 
It's up to implementation how to define +/// these operations. +/// +/// The relative order of command groups submission defines the order in which +/// kernels must be executed if their memory requirements intersect. For +/// example, if a command group A writes to a buffer X, command group B reads +/// from X, then the scheduled execution order of A and B will be the same as +/// their dynamic submission order (matches program order if submitted from the +/// same host thread). +/// +/// Memory requirements are requests to SYCL memory objects, such as buffer and +/// image. SYCL memory objects are not bound to any specific context or device, +/// it's SYCL responsibility to allocate and/or copy memory to the target +/// context to achieve correct execution. +/// +/// Refer to SYCL Specification 1.2.1 sections 3.4 and 3.5 to find more +/// information about SYCL execution and memory model. +/// +/// ### Example of DPC++ application +/// +/// \code{.cpp} +/// { +/// // Creating SYCL CPU and GPU queues +/// cl::sycl::queue CPU_Queue = ...; +/// cl::sycl::queue GPU_Queue = ...; +/// +/// // Creating 3 SYCL buffers +/// auto BufferA = ...; // Buffer is initialized with host memory. 
+/// auto BufferB = ...; +/// auto BufferC = ...; +/// +/// // "Copy command group" section +/// // Request processing explicit copy operation on CPU +/// // The copy operation reads from BufferA and writes to BufferB +/// +/// CPU_Queue.submit([&](handler &CGH) { +/// auto A = BufferA.get_access(CGH); +/// auto B = BufferB.get_access(CGH); +/// CGH.copy(A, B); +/// }); +/// +/// // "Multi command group" section +/// // Request processing multi kernel on GPU +/// // The kernel reads from BufferB, multiplies by 4 and writes result to +/// // BufferC +/// +/// GPU_Queue.submit([&](handler &CGH) { +/// auto B = BufferB.get_access(CGH); +/// auto C = BufferC.get_access(CGH); +/// CGH.parallel_for(range<1>{N}, [=](id<1> Index) { +/// C[Index] = B[Index] * 4; +/// }); +/// }); +/// +/// // "Host accessor creation" section +/// // Request the latest data of BufferC for the moment +/// // This is a synchronization point, which means that the DPC++ RT blocks +/// // on creation of the accessor until requested data is available. +/// auto C = BufferC.get_access(); +/// } +/// \endcode +/// +/// In the example above the DPC++ RT does the following: +/// +/// 1. **Copy command group**. +/// The DPC++ RT allocates memory for BufferA and BufferB on CPU then +/// executes an explicit copy operation on CPU. +/// +/// 2. **Multi command group** +/// DPC++ RT allocates memory for BufferC and BufferB on GPU and copy +/// content of BufferB from CPU to GPU, then execute "multi" kernel on GPU. +/// +/// 3. **Host accessor creation** +/// DPC++ RT allocates(it's possible to reuse already allocated memory) +/// memory available for user for BufferC then copy content of BufferC from +/// GPU to this memory. 
+/// +/// So, the example above will be converted to the following OpenCL pseudo code +/// \code{.cpp} +/// // Initialization(not related to the Scheduler) +/// Platform = clGetPlatforms(...); +/// DeviceCPU = clGetDevices(CL_DEVICE_TYPE_CPU, ...); +/// DeviceGPU = clGetDevices(CL_DEVICE_TYPE_GPU, ...); +/// ContextCPU = clCreateContext(DeviceCPU, ...) +/// ContextGPU = clCreateContext(DeviceGPU, ...) +/// QueueCPU = clCreateCommandQueue(ContextCPU, DeviceCPU, ...); +/// QueueGPU = clCreateCommandQueue(ContextGPU, DeviceGPU, ...); +/// +/// // Copy command group: +/// BufferACPU = clCreateBuffer(ContextCPU, CL_MEM_USE_HOST_PTR, ...); +/// BufferBCPU = clCreateBuffer(ContextCPU, ...); +/// CopyEvent = clEnqueueCopyBuffer(QueueCPU, BufferACPU, BufferBCPU, ...) +/// +/// // Multi command group: +/// ReadBufferEvent = +/// clEnqueueReadBuffer(QueueCPU, BufferBCPU, HostPtr, CopyEvent, ...); +/// BufferBGPU = clCreateBuffer(ContextGPU, ...); +/// +/// UserEvent = clCreateUserEvent(ContextCPU); +/// clSetEventCallback(ReadBufferEvent, event_completion_callback, +/// /*data=*/UserEvent); +/// +/// WriteBufferEvent = clEnqueueWriteBuffer(QueueGPU, BufferBGPU, HostPtr, +/// UserEvent, ...); BufferCGPU = clCreateBuffer(ContextGPU, ...); ProgramGPU = +/// clCreateProgramWithIL(ContextGPU, ...); clBuildProgram(ProgramGPU); +/// MultiKernel = clCreateKernel("multi"); +/// clSetKernelArg(MultiKernel, BufferBGPU, ...); +/// clSetKernelArg(MultiKernel, BufferCGPU, ...); +/// MultiEvent = +/// clEnqueueNDRangeKernel(QueueGPU, MultiKernel, WriteBufferEvent, ...); +/// +/// // Host accessor creation: +/// clEnqueueMapBuffer(QueueGPU, BufferCGPU, BLOCKING_MAP, MultiEvent, ...); +/// +/// // Releasing mem objects during SYCL buffers destruction. 
+/// clReleaseBuffer(BufferACPU); +/// clReleaseBuffer(BufferBCPU); +/// clReleaseBuffer(BufferBGPU); +/// clReleaseBuffer(BufferCGPU); +/// +/// // Release(not related to the Scheduler) +/// clReleaseKernel(MultiKernel); +/// clReleaseProgram(ProgramGPU); +/// clReleaseContext(ContextGPU); +/// clReleaseContext(ContextCPU); +/// \endcode + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { @@ -31,8 +177,12 @@ using QueueImplPtr = std::shared_ptr; using EventImplPtr = std::shared_ptr; using ContextImplPtr = std::shared_ptr; -// The MemObjRecord is created for each memory object used in command -// groups. There should be only one MemObjRecord for SYCL memory object. +/// Memory Object Record +/// +/// The MemObjRecord is used in command groups (todo better desc). +/// There must be a single MemObjRecord for each SYCL memory object. +/// +/// \ingroup sycl_graph struct MemObjRecord { MemObjRecord(ContextImplPtr CurContext, std::size_t LeafLimit) : MReadLeaves{LeafLimit}, MWriteLeaves{LeafLimit}, MCurContext{ @@ -59,46 +209,217 @@ struct MemObjRecord { bool MMemModified = false; }; +/// DPC++ graph scheduler class. +/// +/// \section sched_overview Scheduler Overview +/// +/// The Scheduler is a part of DPC++ RT which ensures correct execution of +/// command groups. To achieve this Scheduler manages acyclic dependency graph +/// (which can have independent sub-graphs) that consists of several types of +/// nodes that represent specific commands: + +/// 1. Allocate memory. +/// The command represents memory allocation operation. There can be +/// multiple allocations for a single SYCL memory object. +/// 2. Release memory. +/// The command represents memory release operation. +/// 3. Execute command group. +/// The command represents \ref CG "Command Group" (kernel) execution +/// operation. +/// 4. Copy memory. +/// The command represents memory copy operation between two memory +/// allocations of a single memory object. 
+/// +/// As the main input Scheduler takes a command group and returns an event +/// representing it, so it can be waited on later. When a new +/// command group comes, Scheduler adds one or more nodes to the graph +/// depending on the command groups' requirements. For example, if a new +/// command group is submitted to the SYCL context which has the latest data +/// for all the requirements, Scheduler adds a new "Execute command group" +/// command making it dependent on all commands affecting new command group's +/// requirements. But if one of the requirements has no up-to-date instance in +/// the context which the command group is submitted to, Scheduler +/// additionally inserts copy memory command (together with allocate memory +/// command if needed). +/// +/// A simple graph looks like: +// +// +----------+ +----------+ +----------+ +// | | | | | | +// | Allocate |<----| Execute |<----| Execute | +// | | | | | | +// +----------+ +----------+ +----------+ +// +/// \dot +/// digraph G { +/// rankdir="LR"; +/// Execute1 [label = "Execute"]; +/// Execute2 [label = "Execute"]; +/// Allocate; +/// Allocate -> Execute2 [dir = back]; +/// Execute2 -> Execute1 [dir = back]; +/// } +/// \enddot +/// +/// Where nodes represent commands and edges represent dependencies between +/// them. There are three commands connected by arrows which mean that before +/// executing second command group the first one must be executed. Also before +/// executing the first command group memory allocation must be performed. +/// +/// At some point Scheduler enqueues commands to the underlying devices. To do +/// this, Scheduler performs topological sort to get the order in which commands +/// should be enqueued. 
For example, the following graph (D depends on B and C,
+/// B and C depend on A) will be enqueued in the following order:
+/// \code{.cpp}
+/// EventA = Enqueue(A, /*Deps=*/{});
+/// EventB = Enqueue(B, /*Deps=*/{EventA});
+/// EventC = Enqueue(C, /*Deps=*/{EventA});
+/// EventD = Enqueue(D, /*Deps=*/{EventB, EventC});
+/// \endcode
+///
+// +----------+
+// | |
+// | D |
+// | |
+// +----------+
+// / \
+// / \
+// v v
+// +----------+ +----------+
+// | | | |
+// | B | | C |
+// | | | |
+// +----------+ +----------+
+// \ /
+// \ /
+// v v
+// +----------+
+// | |
+// | A |
+// | |
+// +----------+
+/// \dot
+/// digraph G {
+/// D -> B;
+/// D -> C;
+/// C -> A;
+/// B -> A;
+/// }
+/// \enddot
+///
+/// \section sched_impl Implementation details
+///
+/// The Scheduler is split up into two parts: graph builder and graph processor.
+///
+/// To build dependencies, Scheduler needs to memorize memory objects and
+/// commands that modify them.
+///
+/// To detect that two command groups access the same memory object and create
+/// a dependency between them, Scheduler needs to store information about
+/// the memory object.
+///
+/// \subsection sched_thread_safety Thread safety
+///
+/// To ensure thread safe execution of methods, Scheduler provides access to the
+/// graph that's guarded by a read-write mutex (analog of shared mutex from
+/// C++17).
+///
+/// A read-write mutex allows concurrent access to read-only operations, while
+/// write operations require exclusive access.
+///
+/// All the methods of GraphBuilder lock the mutex in write mode because these
+/// methods can modify the graph.
+/// Methods of GraphProcessor lock the mutex in read mode as they are not
+/// modifying the graph.
+///
+/// \subsection sched_err_handling Error handling
+///
+/// There are two sources of errors that need to be handled in Scheduler:
+/// 1. errors that happen during command enqueue process
+/// 2. the error that happened during command execution.
+///
+/// If an error occurs during command enqueue process, the Command::enqueue
+/// method returns the faulty command. Scheduler then reschedules the command
+/// and all dependent commands (if any).
+///
+/// An error with command processing can happen in the underlying runtime; in
+/// this case Scheduler is notified asynchronously (using a callback
+/// mechanism), which triggers rescheduling.
+///
+/// \ingroup sycl_graph
class Scheduler {
public:
-  // Registers command group, adds it to the dependency graph and returns an
-  // event object that can be used for waiting later. It's called by SYCL's
-  // queue.submit.
+  /// Registers a command group and adds it to the dependency graph.
+  ///
+  /// It's called by SYCL's queue.submit.
+  ///
+  /// \param CommandGroup is a unique_ptr to a command group to be added.
+  /// \return an event object to wait on for command group completion.
  EventImplPtr addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue);
+  /// Registers a command group that copies the most recent memory to the
+  /// memory pointed to by the requirement.
+  ///
+  /// \param Req is a requirement that points to the memory where data is
+  /// needed.
+  /// \return an event object to wait on for copy finish.
  EventImplPtr addCopyBack(Requirement *Req);
-  // Blocking call that waits for the event passed. For the eager execution
-  // mode this method invokes corresponding function of device API. In the
-  // lazy execution mode the method may enqueue the command associated with
-  // the event passed and its dependency before calling device API.
+  /// Waits for the event.
+  ///
+  /// This operation is blocking. For eager execution mode this method invokes
+  /// the corresponding function of the device API.
+  ///
+  /// \param Event is a pointer to the event to wait on.
void waitForEvent(EventImplPtr Event); - // Removes buffer pointed by MemObj from the graph: ensures all commands - // accessing the memory objects are executed and triggers deallocation of - // all memory assigned to the memory object. It's called from the - // sycl::buffer and sycl::image destructors. + /// Removes buffer from the graph. + /// + /// The lifetime of memory object descriptor begins when the first command + /// group that uses the memory object is submitted and ends when + /// "removeMemoryObject(...)" method is called which means there will be no + /// command group that uses the memory object. When removeMemoryObject is + /// called Scheduler will enqueue and wait on all release commands associated + /// with the memory object, which effectively guarantees that all commands + /// accessing the memory object are complete and then the resources allocated + /// for the memory object are freed. Then all the commands affecting the + /// memory object are removed. + /// + /// This member function is used by \ref buffer and \ref image. + /// + /// \param MemObj is a memory object that points to the buffer being removed. void removeMemoryObject(detail::SYCLMemObjI *MemObj); - // Removes finished non-leaf non-alloca commands from the subgraph (assuming - // that all its commands have been waited for). + /// Removes finished non-leaf non-alloca commands from the subgraph (assuming + /// that all its commands have been waited for). + /// \sa GraphBuilder::cleanupFinishedCommands + /// + /// \param FinishedEvent is a cleanup candidate event. void cleanupFinishedCommands(EventImplPtr FinishedEvent); - // Creates nodes in the graph, that update Req with the pointer to the host - // memory which contains the latest data of the memory object. New - // operations with the same memory object that have side effects are blocked - // until releaseHostAccessor is called. 
Returns an event which indicates
-  // when these nodes are completed and host accessor is ready for using.
+  /// Adds nodes to the graph that update the requirement with the pointer
+  /// to the host memory.
+  ///
+  /// Assumes the host pointer contains the latest data. New operations with
+  /// the same memory object that have side effects are blocked until
+  /// releaseHostAccessor(Requirement *Req) is called.
+  ///
+  /// \param Req is the requirement to be updated.
+  /// \return an event which indicates when these nodes are completed
+  /// and host accessor is ready for use.
  EventImplPtr addHostAccessor(Requirement *Req, const bool Destructor = false);
-  // Unblocks operations with the memory object.
+  /// Unblocks operations with the memory object.
+  ///
+  /// \param Req is a requirement that points to the memory object being
+  /// unblocked.
  void releaseHostAccessor(Requirement *Req);
-  // Returns an instance of the scheduler object.
+  /// \return an instance of the scheduler object.
  static Scheduler &getInstance();
-  // Returns list of "immediate" dependencies for the Event given.
+  /// \return a vector of "immediate" dependencies for the Event given.
  std::vector getWaitList(EventImplPtr Event);
  QueueImplPtr getDefaultHostQueue() { return DefaultHostQueue; }
@@ -107,71 +428,98 @@ class Scheduler {
  Scheduler();
  static Scheduler instance;
-  // The graph builder provides interfaces that can change already existing
-  // graph (e.g. add/remove edges/nodes).
+  /// Graph builder class.
+  ///
+  /// The graph builder provides means to change an existing graph (e.g. add
+  /// or remove edges/nodes).
+  ///
+  /// \ingroup sycl_graph
  class GraphBuilder {
  public:
    GraphBuilder();
-    // Registers command group, adds it to the dependency graph and returns an
-    // command that represents command group execution. It's called by SYCL's
-    // queue::submit.
+    /// Registers \ref CG "command group" and adds it to the dependency graph.
+ /// + /// \sa queue::submit, Scheduler::addCG + /// + /// \return a command that represents command group execution. Command *addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue); + /// Registers a \ref CG "command group" that updates host memory to the + /// latest state. + /// + /// \return a command that represents command group execution. Command *addCGUpdateHost(std::unique_ptr CommandGroup, QueueImplPtr HostQueue); + /// Enqueues a command to update memory to the latest state. + /// + /// \param Req is a requirement, that describes memory object. Command *addCopyBack(Requirement *Req); + + /// Enqueues a command to create a host accessor. + /// + /// \param Req points to memory being accessed. Command *addHostAccessor(Requirement *Req, const bool destructor = false); - // [Provisional] Optimizes the whole graph. + /// [Provisional] Optimizes the whole graph. void optimize(); - // [Provisional] Optimizes subgraph that consists of command associated - // with Event passed and its dependencies. + /// [Provisional] Optimizes subgraph that consists of command associated + /// with Event passed and its dependencies. void optimize(EventImplPtr Event); - // Removes finished non-leaf non-alloca commands from the subgraph (assuming - // that all its commands have been waited for). + /// Removes finished non-leaf non-alloca commands from the subgraph + /// (assuming that all its commands have been waited for). void cleanupFinishedCommands(Command *FinishedCmd); - // Reschedules command passed using Queue provided. this can lead to - // rescheduling of all dependent commands. This can be used when user - // provides "secondary" queue to submit method which may be used when - // command fails to enqueue/execute in primary queue. + /// Reschedules the command passed using Queue provided. + /// + /// This can lead to rescheduling of all dependent commands. 
This can be + /// used when the user provides a "secondary" queue to the submit method + /// which may be used when the command fails to enqueue/execute in the + /// primary queue. void rescheduleCommand(Command *Cmd, QueueImplPtr Queue); + /// \return a pointer to the corresponding memory object record for the + /// SYCL memory object provided, or nullptr if it does not exist. MemObjRecord *getMemObjRecord(SYCLMemObjI *MemObject); - // Returns pointer to MemObjRecord for pointer to memory object. - // Return nullptr if there the record is not found. + + /// \return a pointer to MemObjRecord for pointer to memory object. If the + /// record is not found, nullptr is returned. MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue, Requirement *Req); - // Decrements leaf counters for all leaves of the record. + /// Decrements leaf counters for all leaves of the record. void decrementLeafCountersForRecord(MemObjRecord *Record); - // Removes commands that use given MemObjRecord from the graph. + /// Removes commands that use the given MemObjRecord from the graph. void cleanupCommandsForRecord(MemObjRecord *Record); - // Removes MemObjRecord for memory object passed. + /// Removes the MemObjRecord for the memory object passed. void removeRecordForMemObj(SYCLMemObjI *MemObject); - // Add new command to leaves if needed. + /// Adds new command to leaves if needed. void addNodeToLeaves(MemObjRecord *Record, Command *Cmd, access::mode AccessMode); - // Removes commands from leaves. + /// Removes commands from leaves. void updateLeaves(const std::set &Cmds, MemObjRecord *Record, access::mode AccessMode); std::vector MMemObjs; private: - // The method inserts required command to make so the latest state for the - // memory object Record refers to resides in the context which is bound to - // the Queue. Can insert copy/map/unmap operations depending on the source - // and destination. 
+ /// Inserts the command required to update the memory object state in the + /// context. + /// + /// Copy/map/unmap operations can be inserted depending on the source and + /// destination. + /// + /// \param Record is a memory object that needs to be updated. + /// \param Req is a Requirement describing destination. + /// \param Queue is a queue that is bound to target context. Command *insertMemoryMove(MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue); @@ -184,24 +532,30 @@ class Scheduler { insertUpdateHostReqCmd(MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue); + /// Finds dependencies for the requirement. std::set findDepsForReq(MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context); - // Finds a command dependency corresponding to the record + /// Finds a command dependency corresponding to the record. DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record); - // Searches for suitable alloca in memory record. + /// Searches for suitable alloca in memory record. AllocaCommandBase *findAllocaForReq(MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context); - // Searches for suitable alloca in memory record. - // If none found, creates new one. + + /// Searches for suitable alloca in memory record. + /// + /// If none found, creates new one. AllocaCommandBase *getOrCreateAllocaForReq(MemObjRecord *Record, Requirement *Req, QueueImplPtr Queue); void markModifiedIfWrite(MemObjRecord *Record, Requirement *Req); - // Print contents of graph to text file in DOT format + /// Prints contents of graph to text file in DOT format + /// + /// \param ModeName is a stringified printing mode name to be used + /// in the result file name. void printGraphAsDot(const char *ModeName); enum PrintOptions { BeforeAddCG = 0, @@ -215,21 +569,91 @@ class Scheduler { std::array MPrintOptionsArray; }; - // The class that provides interfaces for enqueueing command and its - // dependencies to the underlying runtime. 
Methods of this class must not - // modify the graph. + /// Graph Processor provides interfaces for enqueueing commands and their + /// dependencies to the underlying runtime. + /// + /// Member functions of this class do not modify the graph. + /// + /// \section sched_enqueue Command enqueueing + /// + /// Commands are enqueued whenever they come to the Scheduler. Each command + /// has enqueue method which takes vector of events that represents + /// dependencies and returns event which represents the command. + /// GraphProcessor performs topological sort to get the order in which + /// commands have to be enqueued. Then it enqueues each command, passing a + /// vector of events that this command needs to wait on. If an error happens + /// during command enqueue, the whole process is stopped, the faulty command + /// is propagated back to the Scheduler. + /// + /// The command with dependencies that belong to a context different from its + /// own can't be enqueued directly (limitation of OpenCL runtime). + /// Instead, for each dependency, a proxy event is created in the target + /// context and linked using OpenCL callback mechanism with original one. + /// For example, the following SYCL code: + /// + /// \code{.cpp} + /// // The ContextA and ContextB are different OpenCL contexts + /// sycl::queue Q1(ContextA); + /// sycl::queue Q2(ContextB); + /// + /// Q1.submit(Task1); + /// + /// Q2.submit(Task2); + /// \endcode + /// + /// is translated to the following OCL API calls: + /// + /// \code{.cpp} + /// void event_completion_callback(void *data) { + /// // Change status of event to complete. 
+  /// clSetEventStatus((cl_event *)data, CL_COMPLETE); // Scope of Context2
+  /// }
+  ///
+  /// // Enqueue TASK1
+  /// EventTask1 = clEnqueueNDRangeKernel(Q1, TASK1, ..); // Scope of Context1
+  /// // Read memory to host
+  /// ReadMem = clEnqueueReadBuffer(A, .., /*Deps=*/EventTask1); // Scope of
+  /// // Context1
+  ///
+  /// // Create user event with initial status "not completed".
+  /// UserEvent = clCreateUserEvent(Context2); // Scope of Context2
+  /// // Ask OpenCL to call callback with UserEvent as data when "read memory
+  /// // to host" operation is completed
+  /// clSetEventCallback(ReadMem, event_completion_callback,
+  /// /*data=*/UserEvent); // Scope of Context1
+  ///
+  /// // Enqueue write memory from host, block it on user event
+  /// // It will be unblocked when we change UserEvent status to completed in
+  /// // callback.
+  /// WriteMem =
+  /// clEnqueueWriteBuffer(A, .., /*Dep=*/UserEvent); // Scope of Context2
+  /// // Enqueue TASK2
+  /// EventTask2 =
+  /// clEnqueueNDRangeKernel(TASK, .., /*Dep=*/WriteMem); // Scope of
+  /// // Context2
+  /// \endcode
+  ///
+  /// The alternative approach that has been considered is to have a separate
+  /// dispatcher thread that would wait for all events from Contexts other
+  /// than the target Context to complete and then enqueue the command with
+  /// dependencies from the target Context only. The alternative approach
+  /// makes the code significantly more complex and can hurt performance on
+  /// the CPU device vs the chosen approach with callbacks.
+  ///
+  /// \ingroup sycl_graph
  class GraphProcessor {
  public:
-    // Returns a list of events that represent immediate dependencies of the
-    // command associated with Event passed.
+    /// \return a list of events that represent immediate dependencies of the
+    /// command associated with Event passed.
    static std::vector getWaitList(EventImplPtr Event);
-    // Wait for the command, associated with Event passed, is completed.
+    /// Waits for the command associated with the Event passed to be completed.
    static void waitForEvent(EventImplPtr Event);
-    // Enqueue the command passed and all it's dependencies to the underlying
-    // device. Returns true is the command is successfully enqueued. Sets
-    // EnqueueResult to the specific status otherwise.
+    /// Enqueues the command and all its dependencies.
+    ///
+    /// \param EnqueueResult is set to a specific status if the enqueue failed.
+    /// \return true if the command is successfully enqueued.
    static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult,
                               BlockingT Blocking = NON_BLOCKING);
  };
  void waitForRecordToFinish(MemObjRecord *Record);
  GraphBuilder MGraphBuilder;
-  // Use read-write mutex in future.
+  // TODO Use read-write mutex in future.
  std::mutex MGraphLock;
  QueueImplPtr DefaultHostQueue;