diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index 8dbe84d10e209..fe7868204cf30 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -43,6 +43,7 @@ class interop_handler { template friend class accessor; + public: using QueueImplPtr = std::shared_ptr; using ReqToMem = std::pair; @@ -307,8 +308,7 @@ class HostKernel : public HostKernelBase { template typename std::enable_if>::value>::type runOnHost(const NDRDescT &NDRDesc) { - sycl::range GroupSize( - InitializedVal::template get<0>()); + sycl::range GroupSize(InitializedVal::template get<0>()); for (int I = 0; I < Dims; ++I) { if (NDRDesc.LocalSize[I] == 0 || NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0) @@ -317,8 +317,7 @@ class HostKernel : public HostKernelBase { GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; } - sycl::range LocalSize( - InitializedVal::template get<0>()); + sycl::range LocalSize(InitializedVal::template get<0>()); sycl::range GlobalSize( InitializedVal::template get<0>()); sycl::id GlobalOffset; @@ -359,10 +358,9 @@ class HostKernel : public HostKernelBase { NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; } - sycl::range LocalSize( - InitializedVal::template get<0>()); + sycl::range LocalSize(InitializedVal::template get<0>()); sycl::range GlobalSize( - InitializedVal::template get<0>()); + InitializedVal::template get<0>()); for (int I = 0; I < Dims; ++I) { LocalSize[I] = NDRDesc.LocalSize[I]; GlobalSize[I] = NDRDesc.GlobalSize[I]; @@ -378,10 +376,10 @@ class HostKernel : public HostKernelBase { }; class stream_impl; -// The base class for all types of command groups. +/// Base class for all types of command groups. class CG { public: - // Type of the command group. + /// Type of the command group. 
enum CGTYPE { NONE, KERNEL, @@ -425,20 +423,20 @@ class CG { private: CGTYPE MType; - // The following storages needed to ensure that arguments won't die while + // The following storages are needed to ensure that arguments won't die while // we are using them. - // Storage for standard layout arguments. + /// Storage for standard layout arguments. vector_class> MArgsStorage; - // Storage for accessors. + /// Storage for accessors. vector_class MAccStorage; - // Storage for shared_ptrs. + /// Storage for shared_ptrs. vector_class> MSharedPtrStorage; public: - // List of requirements that specify which memory is needed for the command - // group to be executed. + /// List of requirements that specify which memory is needed for the command + /// group to be executed. vector_class MRequirements; - // List of events that order the execution of this CG + /// List of events that order the execution of this CG vector_class MEvents; // Member variables to capture the user code-location // information from Q.submit(), Q.parallel_for() etc @@ -448,9 +446,10 @@ class CG { int32_t MLine, MColumn; }; -// The class which represents "execute kernel" command group. +/// "Execute kernel" command group class. class CGExecKernel : public CG { public: + /// Stores ND-range description. NDRDescT MNDRDesc; unique_ptr_class MHostKernel; shared_ptr_class MSyclKernel; @@ -488,7 +487,7 @@ class CGExecKernel : public CG { } }; -// The class which represents "copy" command group. +/// "Copy memory" command group class. class CGCopy : public CG { void *MSrc; void *MDst; @@ -509,7 +508,7 @@ class CGCopy : public CG { void *getDst() { return MDst; } }; -// The class which represents "fill" command group. +/// "Fill memory" command group class. class CGFill : public CG { public: vector_class MPattern; @@ -529,7 +528,7 @@ class CGFill : public CG { Requirement *getReqToFill() { return MPtr; } }; -// The class which represents "update host" command group. +/// "Update host" command group class. 
class CGUpdateHost : public CG { Requirement *MPtr; @@ -548,7 +547,7 @@ class CGUpdateHost : public CG { Requirement *getReqToUpdate() { return MPtr; } }; -// The class which represents "copy" command group for USM pointers. +/// "Copy USM" command group class. class CGCopyUSM : public CG { void *MSrc; void *MDst; @@ -572,7 +571,7 @@ class CGCopyUSM : public CG { size_t getLength() { return MLength; } }; -// The class which represents "fill" command group for USM pointers. +/// "Fill USM" command group class. class CGFillUSM : public CG { vector_class MPattern; void *MDst; @@ -595,7 +594,7 @@ class CGFillUSM : public CG { int getFill() { return MPattern[0]; } }; -// The class which represents "prefetch" command group for USM pointers. +/// "Prefetch USM" command group class. class CGPrefetchUSM : public CG { void *MDst; size_t MLength; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8cbe2ccf8ff98..80feb92ce9560 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -38,7 +38,7 @@ class ReleaseCommand; enum BlockingT { NON_BLOCKING = 0, BLOCKING }; -// The struct represents the result of command enqueueing +/// Result of command enqueueing. struct EnqueueResultT { enum ResultT { SyclEnqueueReady, @@ -49,15 +49,15 @@ struct EnqueueResultT { EnqueueResultT(ResultT Result = SyclEnqueueSuccess, Command *Cmd = nullptr, cl_int ErrCode = CL_SUCCESS) : MResult(Result), MCmd(Cmd), MErrCode(ErrCode) {} - // Indicates result of enqueueing + /// Indicates the result of enqueueing. ResultT MResult; - // Pointer to the command failed to enqueue + /// Pointer to the command which failed to enqueue. Command *MCmd; - // Error code which is set when enqueueing fails + /// Error code which is set when enqueueing fails. cl_int MErrCode; }; -// DepDesc represents dependency between two commands +/// Dependency between two commands. 
struct DepDesc { DepDesc(Command *DepCommand, const Requirement *Req, AllocaCommandBase *AllocaCmd) @@ -68,20 +68,22 @@ struct DepDesc { std::tie(Rhs.MDepRequirement, Rhs.MDepCommand); } - // The actual dependency command. + /// The actual dependency command. Command *MDepCommand = nullptr; - // Requirement for the dependency. + /// Requirement for the dependency. const Requirement *MDepRequirement = nullptr; - // Allocation command for the memory object we have requirement for. - // Used to simplify searching for memory handle. + /// Allocation command for the memory object we have requirement for. + /// Used to simplify searching for memory handle. AllocaCommandBase *MAllocaCmd = nullptr; }; -// The Command represents some action that needs to be performed on one or -// more memory objects. The command has vector of Depdesc objects that -// represent dependencies of the command. It has vector of pointer to commands -// that depend on the command. It has pointer to sycl::queue object. And has -// event that is associated with the command. +/// The Command class represents some action that needs to be performed on one +/// or more memory objects. The Command has a vector of DepDesc objects that +/// represent dependencies of the command. It has a vector of pointers to +/// commands that depend on the command. It has a pointer to a \ref queue object +/// and an event that is associated with the command. +/// +/// \ingroup sycl_graph class Command { public: enum CommandType { @@ -104,13 +106,15 @@ class Command { void addUser(Command *NewUser) { MUsers.insert(NewUser); } - // Return type of the command, e.g. Allocate, MemoryCopy. + /// \return type of the command, e.g. Allocate, MemoryCopy. CommandType getType() const { return MType; } - // The method checks if the command is enqueued, waits for it to be - // unblocked if "Blocking" argument is true, then calls enqueueImp. Returns - // true if the command is enqueued. 
Sets EnqueueResult to the specific - // status otherwise. + /// Checks if the command is enqueued, and calls enqueueImp. + /// + /// \param EnqueueResult is set to the specific status if enqueue failed. + /// \param Blocking if this argument is true, function will wait for the + /// command to be unblocked before calling enqueueImp. + /// \return true if the command is enqueued. bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking); bool isFinished(); @@ -124,34 +128,33 @@ class Command { std::shared_ptr getEvent() const { return MEvent; } // Methods needed to support SYCL instrumentation - // - // Proxy method which calls emitInstrumentationData. + + /// Proxy method which calls emitInstrumentationData. void emitInstrumentationDataProxy(); - // Instrumentation method which emits telemetry data. + /// Instrumentation method which emits telemetry data. virtual void emitInstrumentationData() = 0; - // This function looks at all the dependencies for - // the release command and enables instrumentation - // to report these dependencies as edges + /// Looks at all the dependencies for the release command and enables + /// instrumentation to report these dependencies as edges. void resolveReleaseDependencies(std::set &list); - // Creates an edge event when the dependency is a command + /// Creates an edge event when the dependency is a command. void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, const string_class &Prefix, bool IsCommand); - // Creates an edge event when the dependency is an event + /// Creates an edge event when the dependency is an event. void emitEdgeEventForEventDependence(Command *Cmd, RT::PiEvent &EventAddr); - // Creates a signal event with the enqueued kernel event handle + /// Creates a signal event with the enqueued kernel event handle. 
void emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr); /// Create a trace event of node_create type; this must be guarded by a - /// check for xptiTraceEnabled() - /// Post Condition: MTraceEvent will be set to the event created - /// @param MAddress The address to use to create the payload + /// check for xptiTraceEnabled(). + /// Post Condition: MTraceEvent will be set to the event created. + /// \param MAddress The address to use to create the payload. uint64_t makeTraceEventProlog(void *MAddress); - // If prolog has been run, run epilog; this must be guarded by a check for - // xptiTraceEnabled() + /// If prolog has been run, run epilog; this must be guarded by a check for + /// xptiTraceEnabled(). void makeTraceEventEpilog(); - // Emits an event of Type + /// Emits an event of Type. void emitInstrumentation(uint16_t Type, const char *Txt = nullptr); - // + // End Methods needed to support SYCL instrumentation virtual void printDot(std::ostream &Stream) const = 0; @@ -172,58 +175,59 @@ class Command { RT::PiEvent &Event); std::vector prepareEvents(ContextImplPtr Context); - // Private interface. Derived classes should implement this method. + /// Private interface. Derived classes should implement this method. virtual cl_int enqueueImp() = 0; - // The type of the command + /// The type of the command. CommandType MType; - // Mutex used to protect enqueueing from race conditions + /// Mutex used to protect enqueueing from race conditions std::mutex MEnqueueMtx; public: - // Contains list of dependencies(edges) + /// Contains list of dependencies(edges) std::vector MDeps; - // Contains list of commands that depend on the command + /// Contains list of commands that depend on the command. std::unordered_set MUsers; - // Indicates whether the command can be blocked from enqueueing + /// Indicates whether the command can be blocked from enqueueing. 
bool MIsBlockable = false; - // Counts the number of memory objects this command is a leaf for + /// Counts the number of memory objects this command is a leaf for. unsigned MLeafCounter = 0; const char *MBlockReason = "Unknown"; - // Describes the status of a command + /// Describes the status of the command. std::atomic MEnqueueStatus; // All member variable defined here are needed for the SYCL instrumentation // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION // to ensure we have the same object layout when the macro in the library and // SYCL app are not the same. - // - // The event for node_create and task_begin + + /// The event for node_create and task_begin. void *MTraceEvent = nullptr; - // The stream under which the traces are emitted; stream ids are - // positive integers and we set it to an invalid value + /// The stream under which the traces are emitted. + /// + /// Stream ids are positive integers and we set it to an invalid value. int32_t MStreamID = -1; - // Reserved for storing the object address such as SPIRV or memory object - // address + /// Reserved for storing the object address such as SPIRV or memory object + /// address. void *MAddress = nullptr; - // Buffer to build the address string + /// Buffer to build the address string. string_class MAddressString; - // Buffer to build the command node type + /// Buffer to build the command node type. string_class MCommandNodeType; - // Buffer to build the command end-user understandable name + /// Buffer to build the command end-user understandable name. string_class MCommandName; - // Flag to indicate if makeTraceEventProlog() has been run + /// Flag to indicate if makeTraceEventProlog() has been run. bool MTraceEventPrologComplete = false; - // Flag to indicate if this is the first time we are seeing this payload + /// Flag to indicate if this is the first time we are seeing this payload. 
bool MFirstInstance = false; - // Instance ID tracked for the command + /// Instance ID tracked for the command. uint64_t MInstanceID = 0; }; -// The command does nothing during enqueue. The task can be used to implement -// lock in the graph, or to merge several nodes into one. +/// The empty command does nothing during enqueue. The task can be used to +/// implement lock in the graph, or to merge several nodes into one. class EmptyCommand : public Command { public: EmptyCommand(QueueImplPtr Queue, Requirement Req); @@ -239,8 +243,8 @@ class EmptyCommand : public Command { Requirement MRequirement; }; -// The command enqueues release instance of memory allocated on Host or -// underlying framework. +/// The release command enqueues release of a memory object instance allocated +/// on Host or underlying framework. class ReleaseCommand : public Command { public: ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd); @@ -251,10 +255,11 @@ class ReleaseCommand : public Command { private: cl_int enqueueImp() final; - // Command which allocates memory release command should dealocate + /// Command which allocates memory release command should dealocate. AllocaCommandBase *MAllocaCmd = nullptr; }; +/// Base class for memory allocation commands. class AllocaCommandBase : public Command { public: AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, @@ -272,17 +277,17 @@ class AllocaCommandBase : public Command { void *MMemAllocation = nullptr; - // Alloca command linked with current command. - // Device and host alloca commands can be linked, so they may share the same - // memory. Only one allocation from a pair can be accessed at a time. Alloca - // commands associated with such allocation is "active". In order to switch - // "active" status between alloca commands map/unmap operations are used. + /// Alloca command linked with current command. + /// Device and host alloca commands can be linked, so they may share the same + /// memory. 
Only one allocation from a pair can be accessed at a time. Alloca + /// commands associated with such allocation is "active". In order to switch + /// "active" status between alloca commands map/unmap operations are used. AllocaCommandBase *MLinkedAllocaCmd = nullptr; - // Indicates that current alloca is active one. + /// Indicates that current alloca is active one. bool MIsActive = true; - // Indicates that the command owns memory allocation in case of connected - // alloca command + /// Indicates that the command owns memory allocation in case of connected + /// alloca command. bool MIsLeaderAlloca = true; protected: @@ -290,8 +295,8 @@ class AllocaCommandBase : public Command { ReleaseCommand MReleaseCmd; }; -// The command enqueues allocation of instance of memory object on Host or -// underlying framework. +/// The alloca command enqueues allocation of instance of memory object on Host +/// or underlying framework. class AllocaCommand : public AllocaCommandBase { public: AllocaCommand(QueueImplPtr Queue, Requirement Req, @@ -305,11 +310,12 @@ class AllocaCommand : public AllocaCommandBase { private: cl_int enqueueImp() final; - // The flag indicates that alloca should try to reuse pointer provided by - // the user during memory object construction + /// The flag indicates that alloca should try to reuse pointer provided by + /// the user during memory object construction. bool MInitFromUserData = false; }; +/// The AllocaSubBuf command enqueues creation of sub-buffer of memory object. class AllocaSubBufCommand : public AllocaCommandBase { public: AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, @@ -326,6 +332,7 @@ class AllocaSubBufCommand : public AllocaCommandBase { AllocaCommandBase *MParentAlloca = nullptr; }; +/// The map command enqueues mapping of device memory onto host memory. 
class MapMemObject : public Command { public: MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, @@ -344,6 +351,7 @@ class MapMemObject : public Command { access::mode MMapMode; }; +/// The unmap command removes mapping of host memory onto device memory. class UnMapMemObject : public Command { public: UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, @@ -361,7 +369,8 @@ class UnMapMemObject : public Command { void **MSrcPtr = nullptr; }; -// The command enqueues memory copy between two instances of memory object. +/// The mem copy command enqueues memory copy between two instances of memory +/// object. class MemCpyCommand : public Command { public: MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, @@ -382,7 +391,8 @@ class MemCpyCommand : public Command { AllocaCommandBase *MDstAllocaCmd = nullptr; }; -// The command enqueues memory copy between two instances of memory object. +/// The mem copy host command enqueues memory copy between two instances of +/// memory object. class MemCpyCommandHost : public Command { public: MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, @@ -403,7 +413,8 @@ class MemCpyCommandHost : public Command { void **MDstPtr = nullptr; }; -// The command enqueues execution of kernel or explicit memory operation. +/// The exec CG command enqueues execution of kernel or explicit memory +/// operation. class ExecCGCommand : public Command { public: ExecCGCommand(std::unique_ptr CommandGroup, QueueImplPtr Queue); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index c6934cebf9e00..4d749d92e9952 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -27,9 +27,10 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { -// The function checks whether two requirements overlaps or not. 
This -// information can be used to prove that executing two kernels that -// work on different parts of the memory object in parallel is legal. +/// Checks whether two requirements overlap or not. +/// +/// This information can be used to prove that executing two kernels that +/// work on different parts of the memory object in parallel is legal. static bool doOverlap(const Requirement *LHS, const Requirement *RHS) { return (LHS->MOffsetInBytes + LHS->MAccessRange.size() * LHS->MElemSize >= RHS->MOffsetInBytes) || @@ -43,12 +44,12 @@ static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { return LHS == RHS || (LHS->is_host() && RHS->is_host()); } -// The function checks if current requirement is requirement for sub buffer +/// Checks if current requirement is requirement for sub buffer. static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } -// Checks if the required access mode is allowed under the current one +/// Checks if the required access mode is allowed under the current one. static bool isAccessModeAllowed(access::mode Required, access::mode Current) { switch (Current) { case access::mode::read: @@ -104,7 +105,6 @@ static void printDotRecursive(std::fstream &Stream, void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) { static size_t Counter = 0; - std::string ModeNameStr(ModeName); std::string FileName = "graph_" + std::to_string(Counter) + ModeNameStr + ".dot"; @@ -123,13 +123,10 @@ void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) { Stream << "}" << std::endl; } -// Returns record for the memory objects passed, nullptr if doesn't exist. MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) { return MemObject->MRecord.get(); } -// Returns record for the memory object requirement refers to, if doesn't -// exist, creates new one. 
MemObjRecord * Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue, Requirement *Req) { @@ -147,7 +144,6 @@ Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue, return MemObject->MRecord.get(); } -// Helper function which removes all values in Cmds from Leaves void Scheduler::GraphBuilder::updateLeaves(const std::set &Cmds, MemObjRecord *Record, access::mode AccessMode) { @@ -398,7 +394,7 @@ Command *Scheduler::GraphBuilder::addCopyBack(Requirement *Req) { // The function implements SYCL host accessor logic: host accessor // should provide access to the buffer in user space. Command *Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, - const bool destructor) { + const bool destructor) { const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue(); @@ -450,15 +446,14 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost( return insertMemoryMove(Record, Req, HostQueue); } -// The functions finds dependencies for the requirement. It starts searching -// from list of "leaf" commands for the record and check if the examining -// command can be executed in parallel with new one with regard to the memory -// object. If can, then continue searching through dependencies of that -// command. There are several rules used: -// -// 1. New and examined commands only read -> can bypass -// 2. New and examined commands has non-overlapping requirements -> can bypass -// 3. New and examined commands has different contexts -> cannot bypass +/// Start the search for the record from list of "leaf" commands and check if +/// the examined command can be executed in parallel with the new one with +/// regard to the memory object. If it can, then continue searching through +/// dependencies of that command. There are several rules used: +/// +/// 1. New and examined commands only read -> can bypass +/// 2. New and examined commands has non-overlapping requirements -> can bypass +/// 3. 
New and examined commands have different contexts -> cannot bypass std::set Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context) { @@ -607,8 +602,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // To ensure that the leader allocation is removed first AllocaCmd->getReleaseCmd()->addDep( - DepDesc(LinkedAllocaCmd->getReleaseCmd(), AllocaCmd->getRequirement(), - LinkedAllocaCmd)); + DepDesc(LinkedAllocaCmd->getReleaseCmd(), + AllocaCmd->getRequirement(), LinkedAllocaCmd)); // Device allocation takes ownership of the host ptr during // construction, host allocation doesn't. So, device allocation should diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 19c79ea5e4e18..5affe5bde4977 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -19,6 +19,152 @@ #include #include +/// \defgroup sycl_graph DPC++ Execution Graph +/// +/// SYCL, unlike OpenCL, provides a programming model in which the user doesn't +/// need to manage dependencies between kernels and memory explicitly. The DPC++ +/// Runtime must ensure correct execution with respect to the order commands are +/// submitted. +/// +/// This document describes the part of the DPC++ Runtime that is responsible +/// for building and processing dependency graph. +/// +/// ## A couple of words about DPC++ and SYCL execution and memory model +/// +/// The SYCL framework defines command group (\ref CG) as an entity that +/// represents minimal execution block. The command group is submitted to SYCL +/// queue and consists of a kernel or an explicit memory operation, and their +/// requirements. The SYCL queue defines the device and context using which the +/// kernel should be executed. +/// +/// The commands that contain explicit memory operations include copy, fill, +/// update_host and other operations. 
It's up to implementation how to define +/// these operations. +/// +/// The relative order of command groups submission defines the order in which +/// kernels must be executed if their memory requirements intersect. For +/// example, if a command group A writes to a buffer X, command group B reads +/// from X, then the scheduled execution order of A and B will be the same as +/// their dynamic submission order (matches program order if submitted from the +/// same host thread). +/// +/// Memory requirements are requests to SYCL memory objects, such as buffer and +/// image. SYCL memory objects are not bound to any specific context or device, +/// it's SYCL responsibility to allocate and/or copy memory to the target +/// context to achieve correct execution. +/// +/// Refer to SYCL Specification 1.2.1 sections 3.4 and 3.5 to find more +/// information about SYCL execution and memory model. +/// +/// ### Example of DPC++ application +/// +/// \code{.cpp} +/// { +/// // Creating SYCL CPU and GPU queues +/// cl::sycl::queue CPU_Queue = ...; +/// cl::sycl::queue GPU_Queue = ...; +/// +/// // Creating 3 SYCL buffers +/// auto BufferA = ...; // Buffer is initialized with host memory. 
+/// auto BufferB = ...; +/// auto BufferC = ...; +/// +/// // "Copy command group" section +/// // Request processing explicit copy operation on CPU +/// // The copy operation reads from BufferA and writes to BufferB +/// +/// CPU_Queue.submit([&](handler &CGH) { +/// auto A = BufferA.get_access(CGH); +/// auto B = BufferB.get_access(CGH); +/// CGH.copy(A, B); +/// }); +/// +/// // "Multi command group" section +/// // Request processing multi kernel on GPU +/// // The kernel reads from BufferB, multiplies by 4 and writes result to +/// // BufferC +/// +/// GPU_Queue.submit([&](handler &CGH) { +/// auto B = BufferB.get_access(CGH); +/// auto C = BufferC.get_access(CGH); +/// CGH.parallel_for(range<1>{N}, [=](id<1> Index) { +/// C[Index] = B[Index] * 4; +/// }); +/// }); +/// +/// // "Host accessor creation" section +/// // Request the latest data of BufferC for the moment +/// // This is a synchronization point, which means that the DPC++ RT blocks +/// // on creation of the accessor until requested data is available. +/// auto C = BufferC.get_access(); +/// } +/// \endcode +/// +/// In the example above the DPC++ RT does the following: +/// +/// 1. **Copy command group**. +/// The DPC++ RT allocates memory for BufferA and BufferB on CPU then +/// executes an explicit copy operation on CPU. +/// +/// 2. **Multi command group** +/// DPC++ RT allocates memory for BufferC and BufferB on GPU and copy +/// content of BufferB from CPU to GPU, then execute "multi" kernel on GPU. +/// +/// 3. **Host accessor creation** +/// DPC++ RT allocates(it's possible to reuse already allocated memory) +/// memory available for user for BufferC then copy content of BufferC from +/// GPU to this memory. 
+/// +/// So, the example above will be converted to the following OpenCL pseudo code +/// \code{.cpp} +/// // Initialization(not related to the Scheduler) +/// Platform = clGetPlatforms(...); +/// DeviceCPU = clGetDevices(CL_DEVICE_TYPE_CPU, ...); +/// DeviceGPU = clGetDevices(CL_DEVICE_TYPE_GPU, ...); +/// ContextCPU = clCreateContext(DeviceCPU, ...) +/// ContextGPU = clCreateContext(DeviceGPU, ...) +/// QueueCPU = clCreateCommandQueue(ContextCPU, DeviceCPU, ...); +/// QueueGPU = clCreateCommandQueue(ContextGPU, DeviceGPU, ...); +/// +/// // Copy command group: +/// BufferACPU = clCreateBuffer(ContextCPU, CL_MEM_USE_HOST_PTR, ...); +/// BufferBCPU = clCreateBuffer(ContextCPU, ...); +/// CopyEvent = clEnqueueCopyBuffer(QueueCPU, BufferACPU, BufferBCPU, ...) +/// +/// // Multi command group: +/// ReadBufferEvent = +/// clEnqueueReadBuffer(QueueCPU, BufferBCPU, HostPtr, CopyEvent, ...); +/// BufferBGPU = clCreateBuffer(ContextGPU, ...); +/// +/// UserEvent = clCreateUserEvent(ContextCPU); +/// clSetEventCallback(ReadBufferEvent, event_completion_callback, +/// /*data=*/UserEvent); +/// +/// WriteBufferEvent = clEnqueueWriteBuffer(QueueGPU, BufferBGPU, HostPtr, +/// UserEvent, ...); BufferCGPU = clCreateBuffer(ContextGPU, ...); ProgramGPU = +/// clCreateProgramWithIL(ContextGPU, ...); clBuildProgram(ProgramGPU); +/// MultiKernel = clCreateKernel("multi"); +/// clSetKernelArg(MultiKernel, BufferBGPU, ...); +/// clSetKernelArg(MultiKernel, BufferCGPU, ...); +/// MultiEvent = +/// clEnqueueNDRangeKernel(QueueGPU, MultiKernel, WriteBufferEvent, ...); +/// +/// // Host accessor creation: +/// clEnqueueMapBuffer(QueueGPU, BufferCGPU, BLOCKING_MAP, MultiEvent, ...); +/// +/// // Releasing mem objects during SYCL buffers destruction. 
+/// clReleaseBuffer(BufferACPU); +/// clReleaseBuffer(BufferBCPU); +/// clReleaseBuffer(BufferBGPU); +/// clReleaseBuffer(BufferCGPU); +/// +/// // Release(not related to the Scheduler) +/// clReleaseKernel(MultiKernel); +/// clReleaseProgram(ProgramGPU); +/// clReleaseContext(ContextGPU); +/// clReleaseContext(ContextCPU); +/// \endcode + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { @@ -31,8 +177,12 @@ using QueueImplPtr = std::shared_ptr; using EventImplPtr = std::shared_ptr; using ContextImplPtr = std::shared_ptr; -// The MemObjRecord is created for each memory object used in command -// groups. There should be only one MemObjRecord for SYCL memory object. +/// Memory Object Record +/// +/// The MemObjRecord is used in command groups (todo better desc). +/// There must be a single MemObjRecord for each SYCL memory object. +/// +/// \ingroup sycl_graph struct MemObjRecord { MemObjRecord(ContextImplPtr CurContext, std::size_t LeafLimit) : MReadLeaves{LeafLimit}, MWriteLeaves{LeafLimit}, MCurContext{ @@ -59,46 +209,217 @@ struct MemObjRecord { bool MMemModified = false; }; +/// DPC++ graph scheduler class. +/// +/// \section sched_overview Scheduler Overview +/// +/// The Scheduler is a part of DPC++ RT which ensures correct execution of +/// command groups. To achieve this Scheduler manages acyclic dependency graph +/// (which can have independent sub-graphs) that consists of several types of +/// nodes that represent specific commands: + +/// 1. Allocate memory. +/// The command represents memory allocation operation. There can be +/// multiple allocations for a single SYCL memory object. +/// 2. Release memory. +/// The command represents memory release operation. +/// 3. Execute command group. +/// The command represents \ref CG "Command Group" (kernel) execution +/// operation. +/// 4. Copy memory. +/// The command represents memory copy operation between two memory +/// allocations of a single memory object. 
+/// +/// As the main input Scheduler takes a command group and returns an event +/// representing it, so it can be waited on later. When a new +/// command group comes, Scheduler adds one or more nodes to the graph +/// depending on the command groups' requirements. For example, if a new +/// command group is submitted to the SYCL context which has the latest data +/// for all the requirements, Scheduler adds a new "Execute command group" +/// command making it dependent on all commands affecting new command group's +/// requirements. But if one of the requirements has no up-to-date instance in +/// the context which the command group is submitted to, Scheduler +/// additionally inserts copy memory command (together with allocate memory +/// command if needed). +/// +/// A simple graph looks like: +// +// +----------+ +----------+ +----------+ +// | | | | | | +// | Allocate |<----| Execute |<----| Execute | +// | | | | | | +// +----------+ +----------+ +----------+ +// +/// \dot +/// digraph G { +/// rankdir="LR"; +/// Execute1 [label = "Execute"]; +/// Execute2 [label = "Execute"]; +/// Allocate; +/// Allocate -> Execute2 [dir = back]; +/// Execute2 -> Execute1 [dir = back]; +/// } +/// \enddot +/// +/// Where nodes represent commands and edges represent dependencies between +/// them. There are three commands connected by arrows which mean that before +/// executing second command group the first one must be executed. Also before +/// executing the first command group memory allocation must be performed. +/// +/// At some point Scheduler enqueues commands to the underlying devices. To do +/// this, Scheduler performs topological sort to get the order in which commands +/// should be enqueued. 
For example, the following graph (D depends on B and C,
+/// B and C depend on A) will be enqueued in the following order:
+/// \code{.cpp}
+/// EventA = Enqueue(A, /*Deps=*/{});
+/// EventB = Enqueue(B, /*Deps=*/{EventA});
+/// EventC = Enqueue(C, /*Deps=*/{EventA});
+/// EventD = Enqueue(D, /*Deps=*/{EventB, EventC});
+/// \endcode
+///
+// +----------+
+// | |
+// | D |
+// | |
+// +----------+
+// / \
+// / \
+// v v
+// +----------+ +----------+
+// | | | |
+// | B | | C |
+// | | | |
+// +----------+ +----------+
+// \ /
+// \ /
+// v v
+// +----------+
+// | |
+// | A |
+// | |
+// +----------+
+/// \dot
+/// digraph G {
+/// D -> B;
+/// D -> C;
+/// C -> A;
+/// B -> A;
+/// }
+/// \enddot
+///
+/// \section sched_impl Implementation details
+///
+/// The Scheduler is split up into two parts: graph builder and graph processor.
+///
+/// To build dependencies, Scheduler needs to memorize memory objects and
+/// commands that modify them.
+///
+/// To detect that two command groups access the same memory object and create
+/// a dependency between them, Scheduler needs to store information about
+/// the memory object.
+///
+/// \subsection sched_thread_safety Thread safety
+///
+/// To ensure thread safe execution of methods, Scheduler provides access to the
+/// graph that's guarded by a read-write mutex (analog of shared mutex from
+/// C++17).
+///
+/// A read-write mutex allows concurrent access to read-only operations, while
+/// write operations require exclusive access.
+///
+/// All the methods of GraphBuilder lock the mutex in write mode because these
+/// methods can modify the graph.
+/// Methods of GraphProcessor lock the mutex in read mode as they are not
+/// modifying the graph.
+///
+/// \subsection sched_err_handling Error handling
+///
+/// There are two sources of errors that need to be handled in Scheduler:
+/// 1. errors that happen during command enqueue process
+/// 2. the error that happened during command execution.
+///
+/// If an error occurs during command enqueue process, the Command::enqueue
+/// method returns the faulty command. Scheduler then reschedules the command
+/// and all dependent commands (if any).
+///
+/// An error with command processing can happen in the underlying runtime; in
+/// this case Scheduler is notified asynchronously (using a callback
+/// mechanism), which triggers rescheduling.
+///
+/// \ingroup sycl_graph
class Scheduler {
public:
-  // Registers command group, adds it to the dependency graph and returns an
-  // event object that can be used for waiting later. It's called by SYCL's
-  // queue.submit.
+  /// Registers a command group and adds it to the dependency graph.
+  ///
+  /// It's called by SYCL's queue.submit.
+  ///
+  /// \param CommandGroup is a unique_ptr to a command group to be added.
+  /// \return an event object to wait on for command group completion.
  EventImplPtr addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue);
+  /// Registers a command group that copies the most recent memory to the
+  /// memory pointed to by the requirement.
+  ///
+  /// \param Req is a requirement that points to the memory where data is
+  /// needed.
+  /// \return an event object to wait on for copy finish.
  EventImplPtr addCopyBack(Requirement *Req);
-  // Blocking call that waits for the event passed. For the eager execution
-  // mode this method invokes corresponding function of device API. In the
-  // lazy execution mode the method may enqueue the command associated with
-  // the event passed and its dependency before calling device API.
+  /// Waits for the event.
+  ///
+  /// This operation is blocking. For eager execution mode this method invokes
+  /// the corresponding function of the device API.
+  ///
+  /// \param Event is a pointer to the event to wait on.
void waitForEvent(EventImplPtr Event); - // Removes buffer pointed by MemObj from the graph: ensures all commands - // accessing the memory objects are executed and triggers deallocation of - // all memory assigned to the memory object. It's called from the - // sycl::buffer and sycl::image destructors. + /// Removes buffer from the graph. + /// + /// The lifetime of memory object descriptor begins when the first command + /// group that uses the memory object is submitted and ends when + /// "removeMemoryObject(...)" method is called which means there will be no + /// command group that uses the memory object. When removeMemoryObject is + /// called Scheduler will enqueue and wait on all release commands associated + /// with the memory object, which effectively guarantees that all commands + /// accessing the memory object are complete and then the resources allocated + /// for the memory object are freed. Then all the commands affecting the + /// memory object are removed. + /// + /// This member function is used by \ref buffer and \ref image. + /// + /// \param MemObj is a memory object that points to the buffer being removed. void removeMemoryObject(detail::SYCLMemObjI *MemObj); - // Removes finished non-leaf non-alloca commands from the subgraph (assuming - // that all its commands have been waited for). + /// Removes finished non-leaf non-alloca commands from the subgraph (assuming + /// that all its commands have been waited for). + /// \sa GraphBuilder::cleanupFinishedCommands + /// + /// \param FinishedEvent is a cleanup candidate event. void cleanupFinishedCommands(EventImplPtr FinishedEvent); - // Creates nodes in the graph, that update Req with the pointer to the host - // memory which contains the latest data of the memory object. New - // operations with the same memory object that have side effects are blocked - // until releaseHostAccessor is called. 
Returns an event which indicates
-  // when these nodes are completed and host accessor is ready for using.
+  /// Adds nodes to the graph that update the requirement with the pointer
+  /// to the host memory.
+  ///
+  /// Assumes the host pointer contains the latest data. New operations with
+  /// the same memory object that have side effects are blocked until
+  /// releaseHostAccessor(Requirement *Req) is called.
+  ///
+  /// \param Req is the requirement to be updated.
+  /// \return an event which indicates when these nodes are completed
+  /// and host accessor is ready for use.
  EventImplPtr addHostAccessor(Requirement *Req, const bool Destructor = false);
-  // Unblocks operations with the memory object.
+  /// Unblocks operations with the memory object.
+  ///
+  /// \param Req is a requirement that points to the memory object being
+  /// unblocked.
  void releaseHostAccessor(Requirement *Req);
-  // Returns an instance of the scheduler object.
+  /// \return an instance of the scheduler object.
  static Scheduler &getInstance();
-  // Returns list of "immediate" dependencies for the Event given.
+  /// \return a vector of "immediate" dependencies for the Event given.
  std::vector getWaitList(EventImplPtr Event);
  QueueImplPtr getDefaultHostQueue() { return DefaultHostQueue; }
@@ -107,71 +428,98 @@ class Scheduler {
  Scheduler();
  static Scheduler instance;
-  // The graph builder provides interfaces that can change already existing
-  // graph (e.g. add/remove edges/nodes).
+  /// Graph builder class.
+  ///
+  /// The graph builder provides means to change an existing graph (e.g. add
+  /// or remove edges/nodes).
+  ///
+  /// \ingroup sycl_graph
  class GraphBuilder {
  public:
    GraphBuilder();
-    // Registers command group, adds it to the dependency graph and returns an
-    // command that represents command group execution. It's called by SYCL's
-    // queue::submit.
+    /// Registers \ref CG "command group" and adds it to the dependency graph.
+ /// + /// \sa queue::submit, Scheduler::addCG + /// + /// \return a command that represents command group execution. Command *addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue); + /// Registers a \ref CG "command group" that updates host memory to the + /// latest state. + /// + /// \return a command that represents command group execution. Command *addCGUpdateHost(std::unique_ptr CommandGroup, QueueImplPtr HostQueue); + /// Enqueues a command to update memory to the latest state. + /// + /// \param Req is a requirement, that describes memory object. Command *addCopyBack(Requirement *Req); + + /// Enqueues a command to create a host accessor. + /// + /// \param Req points to memory being accessed. Command *addHostAccessor(Requirement *Req, const bool destructor = false); - // [Provisional] Optimizes the whole graph. + /// [Provisional] Optimizes the whole graph. void optimize(); - // [Provisional] Optimizes subgraph that consists of command associated - // with Event passed and its dependencies. + /// [Provisional] Optimizes subgraph that consists of command associated + /// with Event passed and its dependencies. void optimize(EventImplPtr Event); - // Removes finished non-leaf non-alloca commands from the subgraph (assuming - // that all its commands have been waited for). + /// Removes finished non-leaf non-alloca commands from the subgraph + /// (assuming that all its commands have been waited for). void cleanupFinishedCommands(Command *FinishedCmd); - // Reschedules command passed using Queue provided. this can lead to - // rescheduling of all dependent commands. This can be used when user - // provides "secondary" queue to submit method which may be used when - // command fails to enqueue/execute in primary queue. + /// Reschedules the command passed using Queue provided. + /// + /// This can lead to rescheduling of all dependent commands. 
This can be + /// used when the user provides a "secondary" queue to the submit method + /// which may be used when the command fails to enqueue/execute in the + /// primary queue. void rescheduleCommand(Command *Cmd, QueueImplPtr Queue); + /// \return a pointer to the corresponding memory object record for the + /// SYCL memory object provided, or nullptr if it does not exist. MemObjRecord *getMemObjRecord(SYCLMemObjI *MemObject); - // Returns pointer to MemObjRecord for pointer to memory object. - // Return nullptr if there the record is not found. + + /// \return a pointer to MemObjRecord for pointer to memory object. If the + /// record is not found, nullptr is returned. MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue, Requirement *Req); - // Decrements leaf counters for all leaves of the record. + /// Decrements leaf counters for all leaves of the record. void decrementLeafCountersForRecord(MemObjRecord *Record); - // Removes commands that use given MemObjRecord from the graph. + /// Removes commands that use the given MemObjRecord from the graph. void cleanupCommandsForRecord(MemObjRecord *Record); - // Removes MemObjRecord for memory object passed. + /// Removes the MemObjRecord for the memory object passed. void removeRecordForMemObj(SYCLMemObjI *MemObject); - // Add new command to leaves if needed. + /// Adds new command to leaves if needed. void addNodeToLeaves(MemObjRecord *Record, Command *Cmd, access::mode AccessMode); - // Removes commands from leaves. + /// Removes commands from leaves. void updateLeaves(const std::set &Cmds, MemObjRecord *Record, access::mode AccessMode); std::vector MMemObjs; private: - // The method inserts required command to make so the latest state for the - // memory object Record refers to resides in the context which is bound to - // the Queue. Can insert copy/map/unmap operations depending on the source - // and destination. 
+ /// Inserts the command required to update the memory object state in the + /// context. + /// + /// Copy/map/unmap operations can be inserted depending on the source and + /// destination. + /// + /// \param Record is a memory object that needs to be updated. + /// \param Req is a Requirement describing destination. + /// \param Queue is a queue that is bound to target context. Command *insertMemoryMove(MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue); @@ -184,24 +532,30 @@ class Scheduler { insertUpdateHostReqCmd(MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue); + /// Finds dependencies for the requirement. std::set findDepsForReq(MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context); - // Finds a command dependency corresponding to the record + /// Finds a command dependency corresponding to the record. DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record); - // Searches for suitable alloca in memory record. + /// Searches for suitable alloca in memory record. AllocaCommandBase *findAllocaForReq(MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context); - // Searches for suitable alloca in memory record. - // If none found, creates new one. + + /// Searches for suitable alloca in memory record. + /// + /// If none found, creates new one. AllocaCommandBase *getOrCreateAllocaForReq(MemObjRecord *Record, Requirement *Req, QueueImplPtr Queue); void markModifiedIfWrite(MemObjRecord *Record, Requirement *Req); - // Print contents of graph to text file in DOT format + /// Prints contents of graph to text file in DOT format + /// + /// \param ModeName is a stringified printing mode name to be used + /// in the result file name. void printGraphAsDot(const char *ModeName); enum PrintOptions { BeforeAddCG = 0, @@ -215,21 +569,91 @@ class Scheduler { std::array MPrintOptionsArray; }; - // The class that provides interfaces for enqueueing command and its - // dependencies to the underlying runtime. 
Methods of this class must not - // modify the graph. + /// Graph Processor provides interfaces for enqueueing commands and their + /// dependencies to the underlying runtime. + /// + /// Member functions of this class do not modify the graph. + /// + /// \section sched_enqueue Command enqueueing + /// + /// Commands are enqueued whenever they come to the Scheduler. Each command + /// has enqueue method which takes vector of events that represents + /// dependencies and returns event which represents the command. + /// GraphProcessor performs topological sort to get the order in which + /// commands have to be enqueued. Then it enqueues each command, passing a + /// vector of events that this command needs to wait on. If an error happens + /// during command enqueue, the whole process is stopped, the faulty command + /// is propagated back to the Scheduler. + /// + /// The command with dependencies that belong to a context different from its + /// own can't be enqueued directly (limitation of OpenCL runtime). + /// Instead, for each dependency, a proxy event is created in the target + /// context and linked using OpenCL callback mechanism with original one. + /// For example, the following SYCL code: + /// + /// \code{.cpp} + /// // The ContextA and ContextB are different OpenCL contexts + /// sycl::queue Q1(ContextA); + /// sycl::queue Q2(ContextB); + /// + /// Q1.submit(Task1); + /// + /// Q2.submit(Task2); + /// \endcode + /// + /// is translated to the following OCL API calls: + /// + /// \code{.cpp} + /// void event_completion_callback(void *data) { + /// // Change status of event to complete. 
+  /// clSetEventStatus((cl_event *)data, CL_COMPLETE); // Scope of Context2
+  /// }
+  ///
+  /// // Enqueue TASK1
+  /// EventTask1 = clEnqueueNDRangeKernel(Q1, TASK1, ..); // Scope of Context1
+  /// // Read memory to host
+  /// ReadMem = clEnqueueReadBuffer(A, .., /*Deps=*/EventTask1); // Scope of
+  /// // Context1
+  ///
+  /// // Create user event with initial status "not completed".
+  /// UserEvent = clCreateUserEvent(Context2); // Scope of Context2
+  /// // Ask OpenCL to call callback with UserEvent as data when "read memory
+  /// // to host" operation is completed
+  /// clSetEventCallback(ReadMem, event_completion_callback,
+  /// /*data=*/UserEvent); // Scope of Context1
+  ///
+  /// // Enqueue write memory from host, block it on user event
+  /// // It will be unblocked when we change UserEvent status to completed in
+  /// // callback.
+  /// WriteMem =
+  /// clEnqueueWriteBuffer(A, .., /*Dep=*/UserEvent); // Scope of Context2
+  /// // Enqueue TASK2
+  /// EventTask2 =
+  /// clEnqueueNDRangeKernel(TASK, .., /*Dep=*/WriteMem); // Scope of
+  /// // Context2
+  /// \endcode
+  ///
+  /// The alternative approach that has been considered is to have a separate
+  /// dispatcher thread that would wait for all events from Contexts other
+  /// than the target Context to complete and then enqueue the command with
+  /// dependencies from the target Context only. The alternative approach
+  /// makes the code significantly more complex and can hurt performance on
+  /// the CPU device vs the chosen approach with callbacks.
+  ///
+  /// \ingroup sycl_graph
  class GraphProcessor {
  public:
-    // Returns a list of events that represent immediate dependencies of the
-    // command associated with Event passed.
+    /// \return a list of events that represent immediate dependencies of the
+    /// command associated with Event passed.
    static std::vector getWaitList(EventImplPtr Event);
-    // Wait for the command, associated with Event passed, is completed.
+    /// Waits for the command associated with the Event passed to be completed.
    static void waitForEvent(EventImplPtr Event);
-    // Enqueue the command passed and all it's dependencies to the underlying
-    // device. Returns true is the command is successfully enqueued. Sets
-    // EnqueueResult to the specific status otherwise.
+    /// Enqueues the command and all its dependencies.
+    ///
+    /// \param EnqueueResult is set to a specific status if the enqueue failed.
+    /// \return true if the command is successfully enqueued.
    static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult,
                               BlockingT Blocking = NON_BLOCKING);
  };
  void waitForRecordToFinish(MemObjRecord *Record);
  GraphBuilder MGraphBuilder;
-  // Use read-write mutex in future.
+  // TODO Use read-write mutex in future.
  std::mutex MGraphLock;
  QueueImplPtr DefaultHostQueue;