diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index bc2e74800de8f..24bc38ae7e241 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -58,8 +58,15 @@ class KernelProgramCache { using ContextPtr = context_impl *; using PiKernelT = std::remove_pointer::type; + + struct BuildResultKernel : public BuildResult { + std::mutex MKernelMutex; + + BuildResultKernel(PiKernelT *P, int S) : BuildResult(P, S) {} + }; + using PiKernelPtrT = std::atomic; - using KernelWithBuildStateT = BuildResult; + using KernelWithBuildStateT = BuildResultKernel; using KernelByNameT = std::map; using KernelCacheT = std::map; diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index f174dfabe1570..ff90729b80367 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -402,8 +402,9 @@ RT::PiKernel program_impl::get_pi_kernel(const string_class &KernelName) const { RT::PiKernel Kernel; if (is_cacheable()) { - Kernel = ProgramManager::getInstance().getOrCreateKernel( - MProgramModuleHandle, get_context(), KernelName, this); + std::tie(Kernel, std::ignore) = + ProgramManager::getInstance().getOrCreateKernel( + MProgramModuleHandle, get_context(), KernelName, this); getPlugin().call(Kernel); } else { const detail::plugin &Plugin = getPlugin(); diff --git a/sycl/source/detail/program_impl.hpp b/sycl/source/detail/program_impl.hpp index eee8497ad2c1f..2f58a01b8252c 100644 --- a/sycl/source/detail/program_impl.hpp +++ b/sycl/source/detail/program_impl.hpp @@ -318,6 +318,9 @@ class program_impl { /// Tells whether a specialization constant has been set for this program. bool hasSetSpecConstants() const { return !SpecConstRegistry.empty(); } + /// \return true if caching is allowed for this program. + bool is_cacheable() const { return MProgramAndKernelCachingAllowed; } + /// Returns the native plugin handle. pi_native_handle getNative() const; @@ -371,9 +374,6 @@ class program_impl { /// \return a vector of devices managed by the plugin. vector_class get_pi_devices() const; - /// \return true if caching is allowed for this program. - bool is_cacheable() const { return MProgramAndKernelCachingAllowed; } - /// \param Options is a string containing OpenCL C build options. /// \return true if caching is allowed for this program and build options. static bool is_cacheable_with_options(const string_class &Options) { diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 1d788a9bbc1fb..2867b70e57d8a 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -160,8 +160,9 @@ RetT *waitUntilBuilt(KernelProgramCache &Cache, /// cache. Accepts nothing. Return pointer to built entity. template -RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, - AcquireFT &&Acquire, GetCacheFT &&GetCache, BuildFT &&Build) { +KernelProgramCache::BuildResult * +getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, AcquireFT &&Acquire, + GetCacheFT &&GetCache, BuildFT &&Build) { bool InsertionTookPlace; KernelProgramCache::BuildResult *BuildResult; @@ -183,7 +184,7 @@ RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, RetT *Result = waitUntilBuilt(KPCache, BuildResult); if (Result) - return Result; + return BuildResult; // Previous build is failed. There was no SYCL exception though. // We might try to build once more. @@ -213,7 +214,7 @@ RetT *getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, KPCache.notifyAllBuild(); - return Desired; + return BuildResult; } catch (const exception &Ex) { BuildResult->Error.Msg = Ex.what(); BuildResult->Error.Code = Ex.get_cl_code(); @@ -395,14 +396,15 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(OSModuleHandle M, if (Prg) Prg->stableSerializeSpecConstRegistry(SpecConsts); - return getOrBuild( + auto BuildResult = getOrBuild( Cache, KeyT(std::move(SpecConsts), KSId), AcquireF, GetF, BuildF); + return BuildResult->Ptr.load(); } -RT::PiKernel ProgramManager::getOrCreateKernel(OSModuleHandle M, - const context &Context, - const string_class &KernelName, - const program_impl *Prg) { +std::pair +ProgramManager::getOrCreateKernel(OSModuleHandle M, const context &Context, + const string_class &KernelName, + const program_impl *Prg) { if (DbgProgMgr > 0) { std::cerr << ">>> ProgramManager::getOrCreateKernel(" << M << ", " << getRawSyclObjImpl(Context) << ", " << KernelName << ")\n"; @@ -436,8 +438,10 @@ RT::PiKernel ProgramManager::getOrCreateKernel(OSModuleHandle M, return Result; }; - return getOrBuild(Cache, KernelName, - AcquireF, GetF, BuildF); + auto BuildResult = static_cast( + getOrBuild(Cache, KernelName, AcquireF, + GetF, BuildF)); + return std::make_pair(BuildResult->Ptr.load(), &(BuildResult->MKernelMutex)); } RT::PiProgram diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 721de1971aa0d..08b53582257c3 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -81,9 +81,9 @@ class ProgramManager { const string_class &KernelName, const program_impl *Prg = nullptr, bool JITCompilationIsRequired = false); - RT::PiKernel getOrCreateKernel(OSModuleHandle M, const context &Context, - const string_class &KernelName, - const program_impl *Prg); + std::pair + getOrCreateKernel(OSModuleHandle M, const context &Context, + const string_class &KernelName, const program_impl *Prg); RT::PiProgram getPiProgramFromPiKernel(RT::PiKernel Kernel, const ContextImplPtr Context); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 81870a3ee6afa..e836336b98449 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1629,6 +1629,65 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) { } } +pi_result ExecCGCommand::SetKernelParamsAndLaunch( + CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, + std::vector &RawEvents, RT::PiEvent &Event) { + const detail::plugin &Plugin = MQueue->getPlugin(); + for (ArgDesc &Arg : ExecKernel->MArgs) { + switch (Arg.MType) { + case kernel_param_kind_t::kind_accessor: { + Requirement *Req = (Requirement *)(Arg.MPtr); + AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); + RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation(); + if (Plugin.getBackend() == backend::opencl) { + Plugin.call(Kernel, Arg.MIndex, + sizeof(RT::PiMem), &MemArg); + } else { + Plugin.call(Kernel, Arg.MIndex, + &MemArg); + } + break; + } + case kernel_param_kind_t::kind_std_layout: { + Plugin.call(Kernel, Arg.MIndex, Arg.MSize, + Arg.MPtr); + break; + } + case kernel_param_kind_t::kind_sampler: { + sampler *SamplerPtr = (sampler *)Arg.MPtr; + RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr) + ->getOrCreateSampler(MQueue->get_context()); + Plugin.call(Kernel, Arg.MIndex, + sizeof(cl_sampler), &Sampler); + break; + } + case kernel_param_kind_t::kind_pointer: { + Plugin.call(Kernel, Arg.MIndex, + Arg.MSize, Arg.MPtr); + break; + } + } + } + + adjustNDRangePerKernel(NDRDesc, Kernel, + *(detail::getSyclObjImpl(MQueue->get_device()))); + + // Some PI Plugins (like OpenCL) require this call to enable USM + // For others, PI will turn this into a NOP. + Plugin.call(Kernel, PI_USM_INDIRECT_ACCESS, + sizeof(pi_bool), &PI_TRUE); + + // Remember this information before the range dimensions are reversed + const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); + + ReverseRangeDimensionsForKernel(NDRDesc); + pi_result Error = Plugin.call_nocheck( + MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], + &NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr, + RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event); + return Error; +} + // The function initialize accessors and calls lambda. // The function is used as argument to piEnqueueNativeKernel which requires // that the passed function takes one void* argument. @@ -1809,71 +1868,42 @@ cl_int ExecCGCommand::enqueueImp() { // Run OpenCL kernel sycl::context Context = MQueue->get_context(); - const detail::plugin &Plugin = MQueue->getPlugin(); RT::PiKernel Kernel = nullptr; + std::mutex *KernelMutex = nullptr; if (nullptr != ExecKernel->MSyclKernel) { assert(ExecKernel->MSyclKernel->get_info() == Context); Kernel = ExecKernel->MSyclKernel->getHandleRef(); - } else - Kernel = detail::ProgramManager::getInstance().getOrCreateKernel( - ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName, - nullptr); - for (ArgDesc &Arg : ExecKernel->MArgs) { - switch (Arg.MType) { - case kernel_param_kind_t::kind_accessor: { - Requirement *Req = (Requirement *)(Arg.MPtr); - AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); - RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation(); - if (Plugin.getBackend() == backend::opencl) { - Plugin.call(Kernel, Arg.MIndex, - sizeof(RT::PiMem), &MemArg); - } else { - Plugin.call(Kernel, Arg.MIndex, - &MemArg); - } - break; - } - case kernel_param_kind_t::kind_std_layout: { - Plugin.call(Kernel, Arg.MIndex, Arg.MSize, - Arg.MPtr); - break; - } - case kernel_param_kind_t::kind_sampler: { - sampler *SamplerPtr = (sampler *)Arg.MPtr; - RT::PiSampler Sampler = - detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context); - Plugin.call(Kernel, Arg.MIndex, - sizeof(cl_sampler), &Sampler); - break; - } - case kernel_param_kind_t::kind_pointer: { - Plugin.call(Kernel, Arg.MIndex, - Arg.MSize, Arg.MPtr); - break; - } + auto SyclProg = detail::getSyclObjImpl( + ExecKernel->MSyclKernel->get_info()); + if (SyclProg->is_cacheable()) { + RT::PiKernel FoundKernel = nullptr; + std::tie(FoundKernel, KernelMutex) = + detail::ProgramManager::getInstance().getOrCreateKernel( + ExecKernel->MOSModuleHandle, + ExecKernel->MSyclKernel->get_info(), + ExecKernel->MKernelName, SyclProg.get()); + assert(FoundKernel == Kernel); } + } else { + std::tie(Kernel, KernelMutex) = + detail::ProgramManager::getInstance().getOrCreateKernel( + ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName, + nullptr); } - adjustNDRangePerKernel(NDRDesc, Kernel, - *(detail::getSyclObjImpl(MQueue->get_device()))); - - // Some PI Plugins (like OpenCL) require this call to enable USM - // For others, PI will turn this into a NOP. - Plugin.call(Kernel, PI_USM_INDIRECT_ACCESS, - sizeof(pi_bool), &PI_TRUE); - - // Remember this information before the range dimensions are reversed - const bool HasLocalSize = (NDRDesc.LocalSize[0] != 0); - - ReverseRangeDimensionsForKernel(NDRDesc); - - pi_result Error = Plugin.call_nocheck( - MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], - &NDRDesc.GlobalSize[0], HasLocalSize ? &NDRDesc.LocalSize[0] : nullptr, - RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], &Event); + pi_result Error = PI_SUCCESS; + if (KernelMutex != nullptr) { + // For cacheable kernels, we use per-kernel mutex + std::lock_guard Lock(*KernelMutex); + Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, + Event); + } else { + Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, + Event); + } if (PI_SUCCESS != Error) { // If we have got non-success error code, let's analyze it to emit nice diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index b809c1503a397..76542bf4d1fa6 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -482,6 +482,11 @@ class ExecCGCommand : public Command { AllocaCommandBase *getAllocaForReq(Requirement *Req); + pi_result SetKernelParamsAndLaunch(CGExecKernel *ExecKernel, + RT::PiKernel Kernel, NDRDescT &NDRDesc, + std::vector &RawEvents, + RT::PiEvent &Event); + std::unique_ptr MCommandGroup; friend class Command;