@@ -1629,6 +1629,65 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) {
16291629 }
16301630}
16311631
1632+ pi_result ExecCGCommand::SetKernelParamsAndLaunch (
1633+ CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc,
1634+ std::vector<RT::PiEvent> &RawEvents, RT::PiEvent &Event) {
1635+ const detail::plugin &Plugin = MQueue->getPlugin ();
1636+ for (ArgDesc &Arg : ExecKernel->MArgs ) {
1637+ switch (Arg.MType ) {
1638+ case kernel_param_kind_t ::kind_accessor: {
1639+ Requirement *Req = (Requirement *)(Arg.MPtr );
1640+ AllocaCommandBase *AllocaCmd = getAllocaForReq (Req);
1641+ RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation ();
1642+ if (Plugin.getBackend () == backend::opencl) {
1643+ Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex ,
1644+ sizeof (RT::PiMem), &MemArg);
1645+ } else {
1646+ Plugin.call <PiApiKind::piextKernelSetArgMemObj>(Kernel, Arg.MIndex ,
1647+ &MemArg);
1648+ }
1649+ break ;
1650+ }
1651+ case kernel_param_kind_t ::kind_std_layout: {
1652+ Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex , Arg.MSize ,
1653+ Arg.MPtr );
1654+ break ;
1655+ }
1656+ case kernel_param_kind_t ::kind_sampler: {
1657+ sampler *SamplerPtr = (sampler *)Arg.MPtr ;
1658+ RT::PiSampler Sampler = detail::getSyclObjImpl (*SamplerPtr)
1659+ ->getOrCreateSampler (MQueue->get_context ());
1660+ Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex ,
1661+ sizeof (cl_sampler), &Sampler);
1662+ break ;
1663+ }
1664+ case kernel_param_kind_t ::kind_pointer: {
1665+ Plugin.call <PiApiKind::piextKernelSetArgPointer>(Kernel, Arg.MIndex ,
1666+ Arg.MSize , Arg.MPtr );
1667+ break ;
1668+ }
1669+ }
1670+ }
1671+
1672+ adjustNDRangePerKernel (NDRDesc, Kernel,
1673+ *(detail::getSyclObjImpl (MQueue->get_device ())));
1674+
1675+ // Some PI Plugins (like OpenCL) require this call to enable USM
1676+ // For others, PI will turn this into a NOP.
1677+ Plugin.call <PiApiKind::piKernelSetExecInfo>(Kernel, PI_USM_INDIRECT_ACCESS,
1678+ sizeof (pi_bool), &PI_TRUE);
1679+
1680+ // Remember this information before the range dimensions are reversed
1681+ const bool HasLocalSize = (NDRDesc.LocalSize [0 ] != 0 );
1682+
1683+ ReverseRangeDimensionsForKernel (NDRDesc);
1684+ pi_result Error = Plugin.call_nocheck <PiApiKind::piEnqueueKernelLaunch>(
1685+ MQueue->getHandleRef (), Kernel, NDRDesc.Dims , &NDRDesc.GlobalOffset [0 ],
1686+ &NDRDesc.GlobalSize [0 ], HasLocalSize ? &NDRDesc.LocalSize [0 ] : nullptr ,
1687+ RawEvents.size (), RawEvents.empty () ? nullptr : &RawEvents[0 ], &Event);
1688+ return Error;
1689+ }
1690+
16321691// The function initialize accessors and calls lambda.
16331692// The function is used as argument to piEnqueueNativeKernel which requires
16341693// that the passed function takes one void* argument.
@@ -1809,71 +1868,42 @@ cl_int ExecCGCommand::enqueueImp() {
18091868
18101869 // Run OpenCL kernel
18111870 sycl::context Context = MQueue->get_context ();
1812- const detail::plugin &Plugin = MQueue->getPlugin ();
18131871 RT::PiKernel Kernel = nullptr ;
1872+ std::mutex *KernelMutex = nullptr ;
18141873
18151874 if (nullptr != ExecKernel->MSyclKernel ) {
18161875 assert (ExecKernel->MSyclKernel ->get_info <info::kernel::context>() ==
18171876 Context);
18181877 Kernel = ExecKernel->MSyclKernel ->getHandleRef ();
1819- } else
1820- Kernel = detail::ProgramManager::getInstance ().getOrCreateKernel (
1821- ExecKernel->MOSModuleHandle , Context, ExecKernel->MKernelName ,
1822- nullptr );
18231878
1824- for (ArgDesc &Arg : ExecKernel->MArgs ) {
1825- switch (Arg.MType ) {
1826- case kernel_param_kind_t ::kind_accessor: {
1827- Requirement *Req = (Requirement *)(Arg.MPtr );
1828- AllocaCommandBase *AllocaCmd = getAllocaForReq (Req);
1829- RT::PiMem MemArg = (RT::PiMem)AllocaCmd->getMemAllocation ();
1830- if (Plugin.getBackend () == backend::opencl) {
1831- Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex ,
1832- sizeof (RT::PiMem), &MemArg);
1833- } else {
1834- Plugin.call <PiApiKind::piextKernelSetArgMemObj>(Kernel, Arg.MIndex ,
1835- &MemArg);
1836- }
1837- break ;
1838- }
1839- case kernel_param_kind_t ::kind_std_layout: {
1840- Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex , Arg.MSize ,
1841- Arg.MPtr );
1842- break ;
1843- }
1844- case kernel_param_kind_t ::kind_sampler: {
1845- sampler *SamplerPtr = (sampler *)Arg.MPtr ;
1846- RT::PiSampler Sampler =
1847- detail::getSyclObjImpl (*SamplerPtr)->getOrCreateSampler (Context);
1848- Plugin.call <PiApiKind::piKernelSetArg>(Kernel, Arg.MIndex ,
1849- sizeof (cl_sampler), &Sampler);
1850- break ;
1851- }
1852- case kernel_param_kind_t ::kind_pointer: {
1853- Plugin.call <PiApiKind::piextKernelSetArgPointer>(Kernel, Arg.MIndex ,
1854- Arg.MSize , Arg.MPtr );
1855- break ;
1856- }
1879+ auto SyclProg = detail::getSyclObjImpl (
1880+ ExecKernel->MSyclKernel ->get_info <info::kernel::program>());
1881+ if (SyclProg->is_cacheable ()) {
1882+ RT::PiKernel FoundKernel = nullptr ;
1883+ std::tie (FoundKernel, KernelMutex) =
1884+ detail::ProgramManager::getInstance ().getOrCreateKernel (
1885+ ExecKernel->MOSModuleHandle ,
1886+ ExecKernel->MSyclKernel ->get_info <info::kernel::context>(),
1887+ ExecKernel->MKernelName , SyclProg.get ());
1888+ assert (FoundKernel == Kernel);
18571889 }
1890+ } else {
1891+ std::tie (Kernel, KernelMutex) =
1892+ detail::ProgramManager::getInstance ().getOrCreateKernel (
1893+ ExecKernel->MOSModuleHandle , Context, ExecKernel->MKernelName ,
1894+ nullptr );
18581895 }
18591896
1860- adjustNDRangePerKernel (NDRDesc, Kernel,
1861- *(detail::getSyclObjImpl (MQueue->get_device ())));
1862-
1863- // Some PI Plugins (like OpenCL) require this call to enable USM
1864- // For others, PI will turn this into a NOP.
1865- Plugin.call <PiApiKind::piKernelSetExecInfo>(Kernel, PI_USM_INDIRECT_ACCESS,
1866- sizeof (pi_bool), &PI_TRUE);
1867-
1868- // Remember this information before the range dimensions are reversed
1869- const bool HasLocalSize = (NDRDesc.LocalSize [0 ] != 0 );
1870-
1871- ReverseRangeDimensionsForKernel (NDRDesc);
1872-
1873- pi_result Error = Plugin.call_nocheck <PiApiKind::piEnqueueKernelLaunch>(
1874- MQueue->getHandleRef (), Kernel, NDRDesc.Dims , &NDRDesc.GlobalOffset [0 ],
1875- &NDRDesc.GlobalSize [0 ], HasLocalSize ? &NDRDesc.LocalSize [0 ] : nullptr ,
1876- RawEvents.size (), RawEvents.empty () ? nullptr : &RawEvents[0 ], &Event);
1897+ pi_result Error = PI_SUCCESS;
1898+ if (KernelMutex != nullptr ) {
1899+ // For cacheable kernels, we use per-kernel mutex
1900+ std::lock_guard<std::mutex> Lock (*KernelMutex);
1901+ Error = SetKernelParamsAndLaunch (ExecKernel, Kernel, NDRDesc, RawEvents,
1902+ Event);
1903+ } else {
1904+ Error = SetKernelParamsAndLaunch (ExecKernel, Kernel, NDRDesc, RawEvents,
1905+ Event);
1906+ }
18771907
18781908 if (PI_SUCCESS != Error) {
18791909 // If we have got non-success error code, let's analyze it to emit nice
0 commit comments