Skip to content

Commit e6fd911

Browse files
authored
[SYCL] Fix hang in ProgramManager class (#2274)
This patch fixes sporadic hangs caused by a data race on the NativePrograms variable of the ProgramManager class. Guarding it with the mutex lock from the program cache is not thread-safe. When multiple threads use their own queues, they use different contexts and therefore their own program caches. Since multiple threads can enqueue their own kernel execution commands in parallel, each thread locks only its own program cache mutex, so access to the NativePrograms variable remains a race between the threads. Signed-off-by: Alexander Flegontov <[email protected]>
1 parent 3323c74 commit e6fd911

File tree

2 files changed

+5
-4
lines changed

2 files changed

+5
-4
lines changed

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ RT::PiProgram ProgramManager::createPIProgram(const RTDeviceBinaryImage &Img,
331331
: createBinaryProgram(Ctx, RawImg.BinaryStart, ImgSize);
332332

333333
{
334-
auto LockGuard = Ctx->getKernelProgramCache().acquireCachedPrograms();
334+
std::lock_guard<std::mutex> Lock(MNativeProgramsMutex);
335335
// associate the PI program with the image it was created for
336336
NativePrograms[Res] = &Img;
337337
}
@@ -984,8 +984,7 @@ void ProgramManager::flushSpecConstants(const program_impl &Prg,
984984
if (!Img) {
985985
// caller hasn't provided the image object - find it
986986
{ // make sure NativePrograms map access is synchronized
987-
ContextImplPtr Ctx = getSyclObjImpl(Prg.get_context());
988-
auto LockGuard = Ctx->getKernelProgramCache().acquireCachedPrograms();
987+
std::lock_guard<std::mutex> Lock(MNativeProgramsMutex);
989988
auto It = NativePrograms.find(NativePrg);
990989
if (It == NativePrograms.end())
991990
throw sycl::experimental::spec_const_error(

sycl/source/detail/program_manager/program_manager.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,11 @@ class ProgramManager {
181181
// NOTE: keys in the map can be invalid (reference count went to zero and
182182
// the underlying program disposed of), so the map can't be used in any way
183183
// other than binary image lookup with known live PiProgram as the key.
184-
// NOTE: access is synchronized via the same lock as program cache
184+
// NOTE: access is synchronized via the MNativeProgramsMutex
185185
std::unordered_map<pi::PiProgram, const RTDeviceBinaryImage *> NativePrograms;
186186

187+
/// Protects NativePrograms, which can be modified by this class's methods.
188+
std::mutex MNativeProgramsMutex;
187189
/// True iff a SPIRV file has been specified with an environment variable
188190
bool m_UseSpvFile = false;
189191
};

0 commit comments

Comments
 (0)