From bc4312b378ca7b896054851f7d6a5f332378ab6a Mon Sep 17 00:00:00 2001 From: aarongreig Date: Mon, 23 Dec 2024 18:26:58 +0000 Subject: [PATCH 01/14] Merge pull request #2498 from Bensuo/fabio/fix_l0_old_loader_no_translate Update usage of zeCommandListImmediateAppendCommandListsExp to use dlsym --- source/adapters/level_zero/command_buffer.cpp | 25 +++++++++---------- source/adapters/level_zero/platform.cpp | 24 ++++++++++++++++++ source/adapters/level_zero/platform.hpp | 9 ++++++- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 32eff7e141..a03a0a7f70 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -26,14 +26,9 @@ namespace { // given Context and Device. bool checkImmediateAppendSupport(ur_context_handle_t Context, ur_device_handle_t Device) { - // TODO The L0 driver is not reporting this extension yet. Once it does, - // switch to using the variable zeDriverImmediateCommandListAppendFound. - // Minimum version that supports zeCommandListImmediateAppendCommandListsExp. - constexpr uint32_t MinDriverVersion = 30898; bool DriverSupportsImmediateAppend = - Context->getPlatform()->isDriverVersionNewerOrSimilar(1, 3, - MinDriverVersion); + Context->getPlatform()->ZeCommandListImmediateAppendExt.Supported; // If this environment variable is: // - Set to 1: the immediate append path will always be enabled as long the @@ -58,10 +53,8 @@ bool checkImmediateAppendSupport(ur_context_handle_t Context, if (EnableAppendPath && !DriverSupportsImmediateAppend) { logger::error("{} is set but " "the current driver does not support the " - "zeCommandListImmediateAppendCommandListsExp entrypoint. 
A " - "driver version of at least {} is required to use the " - "immediate append path.", - AppendEnvVarName, MinDriverVersion); + "zeCommandListImmediateAppendCommandListsExp entrypoint.", + AppendEnvVarName); std::abort(); } @@ -1568,7 +1561,10 @@ ur_result_t enqueueImmediateAppendPath( ur_event_handle_t *Event, ur_command_list_ptr_t CommandListHelper, bool DoProfiling) { + ur_platform_handle_t Platform = CommandBuffer->Context->getPlatform(); + assert(CommandListHelper->second.IsImmediate); + assert(Platform->ZeCommandListImmediateAppendExt.Supported); _ur_ze_event_list_t UrZeEventList; if (NumEventsInWaitList) { @@ -1586,7 +1582,8 @@ ur_result_t enqueueImmediateAppendPath( nullptr /*ForcedCmdQueue*/)); assert(ZeCopyEngineImmediateListHelper->second.IsImmediate); - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + ZE2UR_CALL(Platform->ZeCommandListImmediateAppendExt + .zeCommandListImmediateAppendCommandListsExp, (ZeCopyEngineImmediateListHelper->first, 1, &CommandBuffer->ZeCopyCommandList, nullptr, UrZeEventList.Length, UrZeEventList.ZeEventList)); @@ -1598,7 +1595,8 @@ ur_result_t enqueueImmediateAppendPath( ze_event_handle_t &EventToSignal = DoProfiling ? 
CommandBuffer->ComputeFinishedEvent->ZeEvent : (*Event)->ZeEvent; - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + ZE2UR_CALL(Platform->ZeCommandListImmediateAppendExt + .zeCommandListImmediateAppendCommandListsExp, (CommandListHelper->first, 1, &CommandBuffer->ZeComputeCommandList, EventToSignal, WaitList.Length, WaitList.ZeEventList)); @@ -1615,7 +1613,8 @@ ur_result_t enqueueImmediateAppendPath( (CommandListHelper->first, CommandBuffer->ExecutionFinishedEvent->ZeEvent, 0, nullptr)); - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + ZE2UR_CALL(Platform->ZeCommandListImmediateAppendExt + .zeCommandListImmediateAppendCommandListsExp, (CommandListHelper->first, 1, &CommandBuffer->ZeCommandListResetEvents, nullptr, 0, nullptr)); } diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp index 2bfc9302db..e383e77294 100644 --- a/source/adapters/level_zero/platform.cpp +++ b/source/adapters/level_zero/platform.cpp @@ -222,6 +222,7 @@ ur_result_t ur_platform_handle_t_::initialize() { bool MutableCommandListSpecExtensionSupported = false; bool ZeIntelExternalSemaphoreExtensionSupported = false; + bool ZeImmediateCommandListAppendExtensionFound = false; for (auto &extension : ZeExtensions) { // Check if global offset extension is available if (strncmp(extension.name, ZE_GLOBAL_OFFSET_EXP_NAME, @@ -246,6 +247,14 @@ ur_result_t ur_platform_handle_t_::initialize() { ZeDriverEventPoolCountingEventsExtensionFound = true; } } + // Check if the ImmediateAppendCommandLists extension is available. + if (strncmp(extension.name, ZE_IMMEDIATE_COMMAND_LIST_APPEND_EXP_NAME, + strlen(ZE_IMMEDIATE_COMMAND_LIST_APPEND_EXP_NAME) + 1) == 0) { + if (extension.version == + ZE_IMMEDIATE_COMMAND_LIST_APPEND_EXP_VERSION_CURRENT) { + ZeImmediateCommandListAppendExtensionFound = true; + } + } // Check if extension is available for Mutable Command List v1.1. 
if (strncmp(extension.name, ZE_MUTABLE_COMMAND_LIST_EXP_NAME, strlen(ZE_MUTABLE_COMMAND_LIST_EXP_NAME) + 1) == 0) { @@ -425,6 +434,21 @@ ur_result_t ur_platform_handle_t_::initialize() { &ZeMutableCmdListExt .zexCommandListGetNextCommandIdWithKernelsExp))) == 0); } + + // Check if ImmediateAppendCommandList is supported and initialize the + // function pointer. + if (ZeImmediateCommandListAppendExtensionFound) { + ZeCommandListImmediateAppendExt + .zeCommandListImmediateAppendCommandListsExp = + (ze_pfnCommandListImmediateAppendCommandListsExp_t) + ur_loader::LibLoader::getFunctionPtr( + GlobalAdapter->processHandle, + "zeCommandListImmediateAppendCommandListsExp"); + ZeCommandListImmediateAppendExt.Supported = + ZeCommandListImmediateAppendExt + .zeCommandListImmediateAppendCommandListsExp != nullptr; + } + return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp index 748460158c..0faa122651 100644 --- a/source/adapters/level_zero/platform.hpp +++ b/source/adapters/level_zero/platform.hpp @@ -134,4 +134,11 @@ struct ur_platform_handle_t_ : public _ur_platform { ze_result_t (*zexDeviceReleaseExternalSemaphoreExp)( ze_intel_external_semaphore_exp_handle_t); } ZeExternalSemaphoreExt; -}; \ No newline at end of file + + struct ZeCommandListImmediateAppendExtension { + bool Supported = false; + ze_result_t (*zeCommandListImmediateAppendCommandListsExp)( + ze_command_list_handle_t, uint32_t, ze_command_list_handle_t *, + ze_event_handle_t, uint32_t, ze_event_handle_t *); + } ZeCommandListImmediateAppendExt; +}; From e298ddaf098af03610305da8818c33704c5dc08a Mon Sep 17 00:00:00 2001 From: aarongreig Date: Thu, 26 Dec 2024 16:52:33 +0000 Subject: [PATCH 02/14] Merge pull request #2402 from yingcong-wu/yc/1202-mmap-failure [DevASAN][CPU] bugfix for mmap return value check. 
--- .../sanitizer/sanitizer_common/linux/sanitizer_utils.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/loader/layers/sanitizer/sanitizer_common/linux/sanitizer_utils.cpp b/source/loader/layers/sanitizer/sanitizer_common/linux/sanitizer_utils.cpp index df64a72ed7..27fe223a5d 100644 --- a/source/loader/layers/sanitizer/sanitizer_common/linux/sanitizer_utils.cpp +++ b/source/loader/layers/sanitizer/sanitizer_common/linux/sanitizer_utils.cpp @@ -40,6 +40,9 @@ uptr MmapNoReserve(uptr Addr, uptr Size) { Addr = RoundDownTo(Addr, EXEC_PAGESIZE); void *P = mmap((void *)Addr, Size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_NORESERVE | MAP_ANONYMOUS, -1, 0); + if (P == MAP_FAILED) { + return 0; + } return (uptr)P; } From 329d4492d3b5425b7e6b2330bf85b92cf30a9eaf Mon Sep 17 00:00:00 2001 From: aarongreig Date: Fri, 27 Dec 2024 17:39:52 +0000 Subject: [PATCH 03/14] Merge pull request #2507 from AllanZyne/review/yang/fix_msan_shadow [DeviceMSAN] Fix MemToShadow algorithm and VA reservation --- .../layers/sanitizer/asan/asan_shadow.cpp | 8 +++- .../layers/sanitizer/msan/msan_shadow.cpp | 39 ++++++++++++------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_shadow.cpp b/source/loader/layers/sanitizer/asan/asan_shadow.cpp index de0679687b..145fd232c1 100644 --- a/source/loader/layers/sanitizer/asan/asan_shadow.cpp +++ b/source/loader/layers/sanitizer/asan/asan_shadow.cpp @@ -104,10 +104,14 @@ ur_result_t ShadowMemoryGPU::Setup() { // shadow memory for each contexts, this will cause out-of-resource error when user uses // multiple contexts. Therefore, we just create one shadow memory here. static ur_result_t Result = [this]() { - size_t ShadowSize = GetShadowSize(); + const size_t ShadowSize = GetShadowSize(); + // To reserve a very large amount of GPU virtual memory, the pStart param should be beyond + // the SVM range, so that the GFX driver will automatically switch to reservation on the GPU + // heap. 
+ const void *StartAddress = (void *)(0x100'0000'0000'0000ULL); // TODO: Protect Bad Zone auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve( - Context, nullptr, ShadowSize, (void **)&ShadowBegin); + Context, StartAddress, ShadowSize, (void **)&ShadowBegin); if (Result != UR_RESULT_SUCCESS) { getContext()->logger.error( "Shadow memory reserved failed with size {}: {}", diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.cpp b/source/loader/layers/sanitizer/msan/msan_shadow.cpp index add9813db6..75866203f3 100644 --- a/source/loader/layers/sanitizer/msan/msan_shadow.cpp +++ b/source/loader/layers/sanitizer/msan/msan_shadow.cpp @@ -134,18 +134,23 @@ ur_result_t MsanShadowMemoryGPU::Setup() { // shadow memory for each contexts, this will cause out-of-resource error when user uses // multiple contexts. Therefore, we just create one shadow memory here. static ur_result_t Result = [this]() { - size_t ShadowSize = GetShadowSize(); + const size_t ShadowSize = GetShadowSize(); + // To reserve a very large amount of GPU virtual memory, the pStart param should be beyond + // the SVM range, so that the GFX driver will automatically switch to reservation on the GPU + // heap. 
+ const void *StartAddress = (void *)(0x100'0000'0000'0000ULL); // TODO: Protect Bad Zone auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve( - Context, nullptr, ShadowSize, (void **)&ShadowBegin); - if (Result == UR_RESULT_SUCCESS) { - ShadowEnd = ShadowBegin + ShadowSize; - // Retain the context which reserves shadow memory - getContext()->urDdiTable.Context.pfnRetain(Context); + Context, StartAddress, ShadowSize, (void **)&ShadowBegin); + if (Result != UR_RESULT_SUCCESS) { + getContext()->logger.error( + "Shadow memory reserved failed with size {}: {}", + (void *)ShadowSize, Result); + return Result; } - - // Set shadow memory for null pointer - ManagedQueue Queue(Context, Device); + ShadowEnd = ShadowBegin + ShadowSize; + // Retain the context which reserves shadow memory + getContext()->urDdiTable.Context.pfnRetain(Context); return UR_RESULT_SUCCESS; }(); return Result; @@ -278,13 +283,21 @@ MsanShadowMemoryGPU::ReleaseShadow(std::shared_ptr AI) { } uptr MsanShadowMemoryPVC::MemToShadow(uptr Ptr) { - assert(Ptr & 0xFF00000000000000ULL && "Ptr must be device USM"); - return ShadowBegin + (Ptr & 0x3FFF'FFFF'FFFFULL); + assert(Ptr & 0xff00'0000'0000'0000ULL && "Ptr must be device USM"); + if (Ptr < ShadowBegin) { + return Ptr + (ShadowBegin - 0xff00'0000'0000'0000ULL); + } else { + return Ptr - (0xff00'ffff'ffff'ffffULL - ShadowEnd); + } } uptr MsanShadowMemoryDG2::MemToShadow(uptr Ptr) { - assert(Ptr & 0xFFFF000000000000ULL && "Ptr must be device USM"); - return ShadowBegin + (Ptr & 0x3FFF'FFFF'FFFFULL); + assert(Ptr & 0xffff'0000'0000'0000ULL && "Ptr must be device USM"); + if (Ptr < ShadowBegin) { + return Ptr + (ShadowBegin - 0xffff'8000'0000'0000ULL); + } else { + return Ptr - (0xffff'ffff'ffff'ffffULL - ShadowEnd); + } } } // namespace msan From ff6b9a740dfaee462e63546be95626516fac2321 Mon Sep 17 00:00:00 2001 From: aarongreig Date: Mon, 30 Dec 2024 15:14:19 +0000 Subject: [PATCH 04/14] Merge pull request #2484 from 
zhaomaosu/move-clean-shadow-to-launchinfo [DevMSAN] Move clean shadow into launch info --- source/loader/layers/sanitizer/msan/msan_libdevice.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/loader/layers/sanitizer/msan/msan_libdevice.hpp b/source/loader/layers/sanitizer/msan/msan_libdevice.hpp index cd05cfa38c..32e8f36552 100644 --- a/source/loader/layers/sanitizer/msan/msan_libdevice.hpp +++ b/source/loader/layers/sanitizer/msan/msan_libdevice.hpp @@ -52,6 +52,8 @@ struct MsanLaunchInfo { uint32_t IsRecover = 0; MsanErrorReport Report; + + uint8_t CleanShadow[128] = {}; }; // Based on the observation, only the last 24 bits of the address of the private From fe5a3c652543d6216e02d39b025d2627c61a979f Mon Sep 17 00:00:00 2001 From: Martin Grant Date: Fri, 3 Jan 2025 10:08:48 +0000 Subject: [PATCH 05/14] Merge pull request #2501 from winstonzhang-intel/interrupt-based-patch [L0] Fixes potential overwrite in ZeEventPoolDesc.pNext --- source/adapters/level_zero/context.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 169c8ec097..08f4762b6d 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -533,6 +533,13 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( if (*ZePool == nullptr) { ze_event_pool_counter_based_exp_desc_t counterBasedExt = { ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC, nullptr, 0}; + + ze_intel_event_sync_mode_exp_desc_t eventSyncMode = { + ZE_INTEL_STRUCTURE_TYPE_EVENT_SYNC_MODE_EXP_DESC, nullptr, 0}; + eventSyncMode.syncModeFlags = + ZE_INTEL_EVENT_SYNC_MODE_EXP_FLAG_LOW_POWER_WAIT | + ZE_INTEL_EVENT_SYNC_MODE_EXP_FLAG_SIGNAL_INTERRUPT; + ZeStruct ZeEventPoolDesc; ZeEventPoolDesc.count = MaxNumEventsPerPool; ZeEventPoolDesc.flags = 0; @@ -552,14 +559,11 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( } 
logger::debug("ze_event_pool_desc_t counter based flags set to: {}", counterBasedExt.flags); + if (InterruptBasedEventEnabled) { + counterBasedExt.pNext = &eventSyncMode; + } ZeEventPoolDesc.pNext = &counterBasedExt; - } - if (InterruptBasedEventEnabled) { - ze_intel_event_sync_mode_exp_desc_t eventSyncMode = { - ZE_INTEL_STRUCTURE_TYPE_EVENT_SYNC_MODE_EXP_DESC, nullptr, 0}; - eventSyncMode.syncModeFlags = - ZE_INTEL_EVENT_SYNC_MODE_EXP_FLAG_LOW_POWER_WAIT | - ZE_INTEL_EVENT_SYNC_MODE_EXP_FLAG_SIGNAL_INTERRUPT; + } else if (InterruptBasedEventEnabled) { ZeEventPoolDesc.pNext = &eventSyncMode; } From 26f30b49bc4d30b294ed654a40fb18b63aa2f8c8 Mon Sep 17 00:00:00 2001 From: Martin Grant Date: Fri, 3 Jan 2025 10:08:59 +0000 Subject: [PATCH 06/14] Merge pull request #2502 from ldrumm/luke/tensormap-version CUTensorMap is only in CUDA v12 --- source/adapters/cuda/tensor_map.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/source/adapters/cuda/tensor_map.cpp b/source/adapters/cuda/tensor_map.cpp index da8e4f8f8c..1730b79d41 100644 --- a/source/adapters/cuda/tensor_map.cpp +++ b/source/adapters/cuda/tensor_map.cpp @@ -13,6 +13,24 @@ #include "context.hpp" +#if CUDA_VERSION < 12000 +UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeIm2ColExp( + ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, + const uint64_t *, const uint64_t *, const int *, const int *, uint32_t, + uint32_t, const uint32_t *, ur_exp_tensor_map_interleave_flags_t, + ur_exp_tensor_map_swizzle_flags_t, ur_exp_tensor_map_l2_promotion_flags_t, + ur_exp_tensor_map_oob_fill_flags_t, ur_exp_tensor_map_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} +UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( + ur_device_handle_t, ur_exp_tensor_map_data_type_flags_t, uint32_t, void *, + const uint64_t *, const uint64_t *, const uint32_t *, const uint32_t *, + ur_exp_tensor_map_interleave_flags_t, ur_exp_tensor_map_swizzle_flags_t, + 
ur_exp_tensor_map_l2_promotion_flags_t, ur_exp_tensor_map_oob_fill_flags_t, + ur_exp_tensor_map_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} +#else struct ur_exp_tensor_map_handle_t_ { CUtensorMap Map; }; @@ -140,3 +158,4 @@ UR_APIEXPORT ur_result_t UR_APICALL urTensorMapEncodeTiledExp( } return UR_RESULT_SUCCESS; } +#endif From 6e2dd4d4469cef1fb8c423063b8523515928a747 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Mon, 6 Jan 2025 10:59:11 +0000 Subject: [PATCH 07/14] Merge pull request #2478 from yingcong-wu/yc/1218-libcxx-mess [DeviceSanitizer] When link with libc++, link the gcc_s first. --- source/loader/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt index a10e99f422..931c9dd3ed 100644 --- a/source/loader/CMakeLists.txt +++ b/source/loader/CMakeLists.txt @@ -208,7 +208,8 @@ if(UR_ENABLE_SANITIZER) if(NOT EXISTS ${LIBCXX_PATH} OR NOT EXISTS ${LIBCXX_ABI_PATH}) message(FATAL_ERROR "libc++ is required but can't find the libraries") endif() - target_link_libraries(ur_loader PRIVATE ${LIBCXX_PATH} ${LIBCXX_ABI_PATH}) + # Link with gcc_s first so that symbols do not resolve to the libc++/libc++abi/libunwind versions + target_link_libraries(ur_loader PRIVATE gcc_s ${LIBCXX_PATH} ${LIBCXX_ABI_PATH}) endif() endif() From 29a27404c5b2764e6195b5a1d5c7fb172a142b05 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Mon, 6 Jan 2025 11:20:30 +0000 Subject: [PATCH 08/14] Merge pull request #2506 from AllanZyne/review/yang/fix_kernel_native [DeviceASAN] Fix urKernelCreateWithNativeHandle segfault --- .../loader/layers/sanitizer/asan/asan_ddi.cpp | 57 +++++------------ .../sanitizer/asan/asan_interceptor.cpp | 37 ++++++----- .../sanitizer/asan/asan_interceptor.hpp | 12 +--- .../loader/layers/sanitizer/msan/msan_ddi.cpp | 64 ++++--------------- .../sanitizer/msan/msan_interceptor.cpp | 34 +++++++--- .../sanitizer/msan/msan_interceptor.hpp | 18 ++---- 
6 files changed, 85 insertions(+), 137 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index bf4dff157a..67dd96c39e 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -1335,28 +1335,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( return UR_RESULT_SUCCESS; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urKernelCreate -__urdlllocal ur_result_t UR_APICALL urKernelCreate( - ur_program_handle_t hProgram, ///< [in] handle of the program instance - const char *pKernelName, ///< [in] pointer to null-terminated string. - ur_kernel_handle_t - *phKernel ///< [out] pointer to handle of kernel object created. -) { - auto pfnCreate = getContext()->urDdiTable.Kernel.pfnCreate; - - if (nullptr == pfnCreate) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - - getContext()->logger.debug("==== urKernelCreate"); - - UR_CALL(pfnCreate(hProgram, pKernelName, phKernel)); - UR_CALL(getAsanInterceptor()->insertKernel(*phKernel)); - - return UR_RESULT_SUCCESS; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelRetain __urdlllocal ur_result_t UR_APICALL urKernelRetain( @@ -1372,8 +1350,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelRetain( UR_CALL(pfnRetain(hKernel)); - auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); - KernelInfo->RefCount++; + auto &KernelInfo = getAsanInterceptor()->getOrCreateKernelInfo(hKernel); + KernelInfo.RefCount++; return UR_RESULT_SUCCESS; } @@ -1392,9 +1370,9 @@ __urdlllocal ur_result_t urKernelRelease( getContext()->logger.debug("==== urKernelRelease"); UR_CALL(pfnRelease(hKernel)); - auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); - if (--KernelInfo->RefCount == 0) { - UR_CALL(getAsanInterceptor()->eraseKernel(hKernel)); + 
auto &KernelInfo = getAsanInterceptor()->getOrCreateKernelInfo(hKernel); + if (--KernelInfo.RefCount == 0) { + UR_CALL(getAsanInterceptor()->eraseKernelInfo(hKernel)); } return UR_RESULT_SUCCESS; @@ -1423,9 +1401,9 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( if (argSize == sizeof(ur_mem_handle_t) && (MemBuffer = getAsanInterceptor()->getMemBuffer( *ur_cast(pArgValue)))) { - auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); - std::scoped_lock Guard(KernelInfo->Mutex); - KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer); + auto &KernelInfo = getAsanInterceptor()->getOrCreateKernelInfo(hKernel); + std::scoped_lock Guard(KernelInfo.Mutex); + KernelInfo.BufferArgs[argIndex] = std::move(MemBuffer); } else { UR_CALL( pfnSetArgValue(hKernel, argIndex, argSize, pProperties, pArgValue)); @@ -1453,9 +1431,9 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj( std::shared_ptr MemBuffer; if ((MemBuffer = getAsanInterceptor()->getMemBuffer(hArgValue))) { - auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); - std::scoped_lock Guard(KernelInfo->Mutex); - KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer); + auto &KernelInfo = getAsanInterceptor()->getOrCreateKernelInfo(hKernel); + std::scoped_lock Guard(KernelInfo.Mutex); + KernelInfo.BufferArgs[argIndex] = std::move(MemBuffer); } else { UR_CALL(pfnSetArgMemObj(hKernel, argIndex, pProperties, hArgValue)); } @@ -1484,12 +1462,12 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( argSize); { - auto KI = getAsanInterceptor()->getKernelInfo(hKernel); - std::scoped_lock Guard(KI->Mutex); + auto &KI = getAsanInterceptor()->getOrCreateKernelInfo(hKernel); + std::scoped_lock Guard(KI.Mutex); // TODO: get local variable alignment auto argSizeWithRZ = GetSizeAndRedzoneSizeForLocal( argSize, ASAN_SHADOW_GRANULARITY, ASAN_SHADOW_GRANULARITY); - KI->LocalArgs[argIndex] = LocalArgsInfo{argSize, argSizeWithRZ}; + KI.LocalArgs[argIndex] = LocalArgsInfo{argSize, argSizeWithRZ}; 
argSize = argSizeWithRZ; } @@ -1522,9 +1500,9 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer( std::shared_ptr KI; if (getAsanInterceptor()->getOptions().DetectKernelArguments) { - auto KI = getAsanInterceptor()->getKernelInfo(hKernel); - std::scoped_lock Guard(KI->Mutex); - KI->PointerArgs[argIndex] = {pArgValue, GetCurrentBacktrace()}; + auto &KI = getAsanInterceptor()->getOrCreateKernelInfo(hKernel); + std::scoped_lock Guard(KI.Mutex); + KI.PointerArgs[argIndex] = {pArgValue, GetCurrentBacktrace()}; } ur_result_t result = @@ -1708,7 +1686,6 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; - pDdiTable->pfnCreate = ur_sanitizer_layer::asan::urKernelCreate; pDdiTable->pfnRetain = ur_sanitizer_layer::asan::urKernelRetain; pDdiTable->pfnRelease = ur_sanitizer_layer::asan::urKernelRelease; pDdiTable->pfnSetArgValue = ur_sanitizer_layer::asan::urKernelSetArgValue; diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 31c35201de..72677bff67 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -639,22 +639,26 @@ ur_result_t AsanInterceptor::eraseProgram(ur_program_handle_t Program) { return UR_RESULT_SUCCESS; } -ur_result_t AsanInterceptor::insertKernel(ur_kernel_handle_t Kernel) { - std::scoped_lock Guard(m_KernelMapMutex); - if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { - return UR_RESULT_SUCCESS; +KernelInfo &AsanInterceptor::getOrCreateKernelInfo(ur_kernel_handle_t Kernel) { + { + std::shared_lock Guard(m_KernelMapMutex); + if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { + return *m_KernelMap[Kernel].get(); + } } - auto hProgram = GetProgram(Kernel); - auto PI = getAsanInterceptor()->getProgramInfo(hProgram); + // Create new KernelInfo + auto Program = GetProgram(Kernel); + auto PI = getProgramInfo(Program); bool 
IsInstrumented = PI->isKernelInstrumented(Kernel); + std::scoped_lock Guard(m_KernelMapMutex); m_KernelMap.emplace(Kernel, - std::make_shared(Kernel, IsInstrumented)); - return UR_RESULT_SUCCESS; + std::make_unique(Kernel, IsInstrumented)); + return *m_KernelMap[Kernel].get(); } -ur_result_t AsanInterceptor::eraseKernel(ur_kernel_handle_t Kernel) { +ur_result_t AsanInterceptor::eraseKernelInfo(ur_kernel_handle_t Kernel) { std::scoped_lock Guard(m_KernelMapMutex); assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); m_KernelMap.erase(Kernel); @@ -691,7 +695,8 @@ ur_result_t AsanInterceptor::prepareLaunch( std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo) { - auto KernelInfo = getKernelInfo(Kernel); + auto &KernelInfo = getOrCreateKernelInfo(Kernel); + std::shared_lock Guard(KernelInfo.Mutex); auto ArgNums = GetKernelNumArgs(Kernel); auto LocalMemoryUsage = @@ -703,11 +708,11 @@ ur_result_t AsanInterceptor::prepareLaunch( "KernelInfo {} (Name={}, ArgNums={}, IsInstrumented={}, " "LocalMemory={}, PrivateMemory={})", (void *)Kernel, GetKernelName(Kernel), ArgNums, - KernelInfo->IsInstrumented, LocalMemoryUsage, PrivateMemoryUsage); + KernelInfo.IsInstrumented, LocalMemoryUsage, PrivateMemoryUsage); // Validate pointer arguments if (getOptions().DetectKernelArguments) { - for (const auto &[ArgIndex, PtrPair] : KernelInfo->PointerArgs) { + for (const auto &[ArgIndex, PtrPair] : KernelInfo.PointerArgs) { auto Ptr = PtrPair.first; if (Ptr == nullptr) { continue; @@ -722,7 +727,7 @@ ur_result_t AsanInterceptor::prepareLaunch( } // Set membuffer arguments - for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) { + for (const auto &[ArgIndex, MemBuffer] : KernelInfo.BufferArgs) { char *ArgPointer = nullptr; UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer)); ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( @@ -735,7 +740,7 @@ ur_result_t 
AsanInterceptor::prepareLaunch( } } - if (!KernelInfo->IsInstrumented) { + if (!KernelInfo.IsInstrumented) { return UR_RESULT_SUCCESS; } @@ -830,9 +835,9 @@ ur_result_t AsanInterceptor::prepareLaunch( } // Write local arguments info - if (!KernelInfo->LocalArgs.empty()) { + if (!KernelInfo.LocalArgs.empty()) { std::vector LocalArgsInfo; - for (auto [ArgIndex, ArgInfo] : KernelInfo->LocalArgs) { + for (auto [ArgIndex, ArgInfo] : KernelInfo.LocalArgs) { LocalArgsInfo.push_back(ArgInfo); getContext()->logger.debug( "local_args (argIndex={}, size={}, sizeWithRZ={})", ArgIndex, diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index 27d5e37532..eaa5463692 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -308,9 +308,6 @@ class AsanInterceptor { ur_result_t insertProgram(ur_program_handle_t Program); ur_result_t eraseProgram(ur_program_handle_t Program); - ur_result_t insertKernel(ur_kernel_handle_t Kernel); - ur_result_t eraseKernel(ur_kernel_handle_t Kernel); - ur_result_t insertMemBuffer(std::shared_ptr MemBuffer); ur_result_t eraseMemBuffer(ur_mem_handle_t MemHandle); std::shared_ptr getMemBuffer(ur_mem_handle_t MemHandle); @@ -350,11 +347,8 @@ class AsanInterceptor { return nullptr; } - std::shared_ptr getKernelInfo(ur_kernel_handle_t Kernel) { - std::shared_lock Guard(m_KernelMapMutex); - assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); - return m_KernelMap[Kernel]; - } + KernelInfo &getOrCreateKernelInfo(ur_kernel_handle_t Kernel); + ur_result_t eraseKernelInfo(ur_kernel_handle_t Kernel); const AsanOptions &getOptions() { return m_Options; } @@ -401,7 +395,7 @@ class AsanInterceptor { m_ProgramMap; ur_shared_mutex m_ProgramMapMutex; - std::unordered_map> + std::unordered_map> m_KernelMap; ur_shared_mutex m_KernelMapMutex; diff --git a/source/loader/layers/sanitizer/msan/msan_ddi.cpp 
b/source/loader/layers/sanitizer/msan/msan_ddi.cpp index 87438a1f99..4b66414bff 100644 --- a/source/loader/layers/sanitizer/msan/msan_ddi.cpp +++ b/source/loader/layers/sanitizer/msan/msan_ddi.cpp @@ -50,12 +50,6 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, return UR_RESULT_SUCCESS; } -bool isInstrumentedKernel(ur_kernel_handle_t hKernel) { - auto hProgram = GetProgram(hKernel); - auto PI = getMsanInterceptor()->getProgramInfo(hProgram); - return PI->isKernelInstrumented(hKernel); -} - } // namespace /////////////////////////////////////////////////////////////////////////////// @@ -354,12 +348,6 @@ ur_result_t urEnqueueKernelLaunch( getContext()->logger.debug("==== urEnqueueKernelLaunch"); - if (!isInstrumentedKernel(hKernel)) { - return pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent); - } - USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); @@ -1155,26 +1143,6 @@ ur_result_t urEnqueueMemUnmap( return UR_RESULT_SUCCESS; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urKernelCreate -ur_result_t urKernelCreate( - ur_program_handle_t hProgram, ///< [in] handle of the program instance - const char *pKernelName, ///< [in] pointer to null-terminated string. - ur_kernel_handle_t - *phKernel ///< [out] pointer to handle of kernel object created. 
-) { - auto pfnCreate = getContext()->urDdiTable.Kernel.pfnCreate; - - getContext()->logger.debug("==== urKernelCreate"); - - UR_CALL(pfnCreate(hProgram, pKernelName, phKernel)); - if (isInstrumentedKernel(*phKernel)) { - UR_CALL(getMsanInterceptor()->insertKernel(*phKernel)); - } - - return UR_RESULT_SUCCESS; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelRetain ur_result_t urKernelRetain( @@ -1186,10 +1154,8 @@ ur_result_t urKernelRetain( UR_CALL(pfnRetain(hKernel)); - auto KernelInfo = getMsanInterceptor()->getKernelInfo(hKernel); - if (KernelInfo) { - KernelInfo->RefCount++; - } + auto &KernelInfo = getMsanInterceptor()->getOrCreateKernelInfo(hKernel); + KernelInfo.RefCount++; return UR_RESULT_SUCCESS; } @@ -1204,11 +1170,9 @@ ur_result_t urKernelRelease( getContext()->logger.debug("==== urKernelRelease"); UR_CALL(pfnRelease(hKernel)); - auto KernelInfo = getMsanInterceptor()->getKernelInfo(hKernel); - if (KernelInfo) { - if (--KernelInfo->RefCount == 0) { - UR_CALL(getMsanInterceptor()->eraseKernel(hKernel)); - } + auto &KernelInfo = getMsanInterceptor()->getOrCreateKernelInfo(hKernel); + if (--KernelInfo.RefCount == 0) { + UR_CALL(getMsanInterceptor()->eraseKernelInfo(hKernel)); } return UR_RESULT_SUCCESS; @@ -1230,13 +1194,12 @@ ur_result_t urKernelSetArgValue( getContext()->logger.debug("==== urKernelSetArgValue"); std::shared_ptr MemBuffer; - std::shared_ptr KernelInfo; if (argSize == sizeof(ur_mem_handle_t) && (MemBuffer = getMsanInterceptor()->getMemBuffer( - *ur_cast(pArgValue))) && - (KernelInfo = getMsanInterceptor()->getKernelInfo(hKernel))) { - std::scoped_lock Guard(KernelInfo->Mutex); - KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer); + *ur_cast(pArgValue)))) { + auto &KernelInfo = getMsanInterceptor()->getOrCreateKernelInfo(hKernel); + std::scoped_lock Guard(KernelInfo.Mutex); + KernelInfo.BufferArgs[argIndex] = std::move(MemBuffer); } else { UR_CALL( 
pfnSetArgValue(hKernel, argIndex, argSize, pProperties, pArgValue)); @@ -1260,10 +1223,10 @@ ur_result_t urKernelSetArgMemObj( std::shared_ptr MemBuffer; std::shared_ptr KernelInfo; - if ((MemBuffer = getMsanInterceptor()->getMemBuffer(hArgValue)) && - (KernelInfo = getMsanInterceptor()->getKernelInfo(hKernel))) { - std::scoped_lock Guard(KernelInfo->Mutex); - KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer); + if ((MemBuffer = getMsanInterceptor()->getMemBuffer(hArgValue))) { + auto &KernelInfo = getMsanInterceptor()->getOrCreateKernelInfo(hKernel); + std::scoped_lock Guard(KernelInfo.Mutex); + KernelInfo.BufferArgs[argIndex] = std::move(MemBuffer); } else { UR_CALL(pfnSetArgMemObj(hKernel, argIndex, pProperties, hArgValue)); } @@ -1348,7 +1311,6 @@ ur_result_t urGetKernelProcAddrTable( ) { ur_result_t result = UR_RESULT_SUCCESS; - pDdiTable->pfnCreate = ur_sanitizer_layer::msan::urKernelCreate; pDdiTable->pfnRetain = ur_sanitizer_layer::msan::urKernelRetain; pDdiTable->pfnRelease = ur_sanitizer_layer::msan::urKernelRelease; pDdiTable->pfnSetArgValue = ur_sanitizer_layer::msan::urKernelSetArgValue; diff --git a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp index b9fd9d1ed6..91ea73eeb0 100644 --- a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp +++ b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp @@ -298,16 +298,26 @@ ur_result_t MsanInterceptor::eraseProgram(ur_program_handle_t Program) { return UR_RESULT_SUCCESS; } -ur_result_t MsanInterceptor::insertKernel(ur_kernel_handle_t Kernel) { - std::scoped_lock Guard(m_KernelMapMutex); - if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { - return UR_RESULT_SUCCESS; +KernelInfo &MsanInterceptor::getOrCreateKernelInfo(ur_kernel_handle_t Kernel) { + { + std::shared_lock Guard(m_KernelMapMutex); + if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { + return *m_KernelMap[Kernel].get(); + } } - m_KernelMap.emplace(Kernel, 
std::make_shared(Kernel)); - return UR_RESULT_SUCCESS; + + // Create new KernelInfo + auto Program = GetProgram(Kernel); + auto PI = getProgramInfo(Program); + bool IsInstrumented = PI->isKernelInstrumented(Kernel); + + std::scoped_lock Guard(m_KernelMapMutex); + m_KernelMap.emplace(Kernel, + std::make_unique(Kernel, IsInstrumented)); + return *m_KernelMap[Kernel].get(); } -ur_result_t MsanInterceptor::eraseKernel(ur_kernel_handle_t Kernel) { +ur_result_t MsanInterceptor::eraseKernelInfo(ur_kernel_handle_t Kernel) { std::scoped_lock Guard(m_KernelMapMutex); assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); m_KernelMap.erase(Kernel); @@ -360,10 +370,10 @@ ur_result_t MsanInterceptor::prepareLaunch( }; // Set membuffer arguments - auto KernelInfo = getKernelInfo(Kernel); - assert(KernelInfo && "Kernel must be instrumented"); + auto &KernelInfo = getOrCreateKernelInfo(Kernel); + std::shared_lock Guard(KernelInfo.Mutex); - for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) { + for (const auto &[ArgIndex, MemBuffer] : KernelInfo.BufferArgs) { char *ArgPointer = nullptr; UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer)); ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( @@ -376,6 +386,10 @@ ur_result_t MsanInterceptor::prepareLaunch( } } + if (!KernelInfo.IsInstrumented) { + return UR_RESULT_SUCCESS; + } + // Set LaunchInfo LaunchInfo.Data->GlobalShadowOffset = DeviceInfo->Shadow->ShadowBegin; LaunchInfo.Data->GlobalShadowOffsetEnd = DeviceInfo->Shadow->ShadowEnd; diff --git a/source/loader/layers/sanitizer/msan/msan_interceptor.hpp b/source/loader/layers/sanitizer/msan/msan_interceptor.hpp index 80dbf389a4..579b6355a7 100644 --- a/source/loader/layers/sanitizer/msan/msan_interceptor.hpp +++ b/source/loader/layers/sanitizer/msan/msan_interceptor.hpp @@ -76,11 +76,15 @@ struct KernelInfo { ur_kernel_handle_t Handle; std::atomic RefCount = 1; + // sanitized kernel + bool IsInstrumented = false; + // lock this mutex if 
following fields are accessed ur_shared_mutex Mutex; std::unordered_map> BufferArgs; - explicit KernelInfo(ur_kernel_handle_t Kernel) : Handle(Kernel) { + explicit KernelInfo(ur_kernel_handle_t Kernel, bool IsInstrumented) + : Handle(Kernel), IsInstrumented(IsInstrumented) { [[maybe_unused]] auto Result = getContext()->urDdiTable.Kernel.pfnRetain(Kernel); assert(Result == UR_RESULT_SUCCESS); @@ -203,9 +207,6 @@ class MsanInterceptor { ur_result_t insertProgram(ur_program_handle_t Program); ur_result_t eraseProgram(ur_program_handle_t Program); - ur_result_t insertKernel(ur_kernel_handle_t Kernel); - ur_result_t eraseKernel(ur_kernel_handle_t Kernel); - ur_result_t insertMemBuffer(std::shared_ptr MemBuffer); ur_result_t eraseMemBuffer(ur_mem_handle_t MemHandle); std::shared_ptr getMemBuffer(ur_mem_handle_t MemHandle); @@ -245,13 +246,8 @@ class MsanInterceptor { return m_ProgramMap[Program]; } - std::shared_ptr getKernelInfo(ur_kernel_handle_t Kernel) { - std::shared_lock Guard(m_KernelMapMutex); - if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { - return m_KernelMap[Kernel]; - } - return nullptr; - } + KernelInfo &getOrCreateKernelInfo(ur_kernel_handle_t Kernel); + ur_result_t eraseKernelInfo(ur_kernel_handle_t Kernel); const MsanOptions &getOptions() { return m_Options; } From bf6fc881798723492226fd1807819eb74414ffe5 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Mon, 6 Jan 2025 11:21:29 +0000 Subject: [PATCH 09/14] Merge pull request #2508 from AllanZyne/review/yang/fix_msan_empty_kernel [DeviceMSAN] Fix empty kernel --- source/loader/layers/sanitizer/msan/msan_interceptor.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp index 91ea73eeb0..3fa8caa073 100644 --- a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp +++ b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp @@ -401,8 +401,13 @@ 
ur_result_t MsanInterceptor::prepareLaunch( (void *)LaunchInfo.Data, LaunchInfo.Data->GlobalShadowOffset, ToString(LaunchInfo.Data->DeviceTy), LaunchInfo.Data->Debug); - UR_CALL( - EnqueueWriteGlobal("__MsanLaunchInfo", &LaunchInfo.Data, sizeof(uptr))); + ur_result_t URes = + EnqueueWriteGlobal("__MsanLaunchInfo", &LaunchInfo.Data, sizeof(uptr)); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.info("EnqueueWriteGlobal(__MsanLaunchInfo) " + "failed, maybe empty kernel: {}", + URes); + } return UR_RESULT_SUCCESS; } From ee6e397141dcff0c9f5240e2e7be912f2a29c761 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Mon, 6 Jan 2025 15:24:08 +0000 Subject: [PATCH 10/14] Merge pull request #2513 from AllanZyne/review/yang/fix_msan_usm [DeviceMSAN] Fix "urEnqueueUSM" APIs --- .../loader/layers/sanitizer/msan/msan_ddi.cpp | 258 +++++++++++++++++- .../sanitizer/msan/msan_interceptor.cpp | 63 ++--- .../sanitizer/msan/msan_interceptor.hpp | 20 +- .../layers/sanitizer/msan/msan_shadow.cpp | 185 +++++++------ .../layers/sanitizer/msan/msan_shadow.hpp | 28 +- .../sanitizer_common/sanitizer_utils.cpp | 3 - 6 files changed, 408 insertions(+), 149 deletions(-) diff --git a/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/source/loader/layers/sanitizer/msan/msan_ddi.cpp index 4b66414bff..2dfeadc358 100644 --- a/source/loader/layers/sanitizer/msan/msan_ddi.cpp +++ b/source/loader/layers/sanitizer/msan/msan_ddi.cpp @@ -45,7 +45,6 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, UR_CALL(DI->allocShadowMemory(Context)); } CI->DeviceList.emplace_back(hDevice); - CI->AllocInfosMap[hDevice]; } return UR_RESULT_SUCCESS; } @@ -104,6 +103,17 @@ ur_result_t urUSMDeviceAlloc( pool, size, ppMem); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urUSMFree +__urdlllocal ur_result_t UR_APICALL urUSMFree( + ur_context_handle_t hContext, ///< [in] handle of the context object + void 
*pMem ///< [in] pointer to USM memory object +) { + getContext()->logger.debug("==== urUSMFree"); + + return getMsanInterceptor()->releaseMemory(hContext, pMem); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramCreateWithIL ur_result_t urProgramCreateWithIL( @@ -1234,6 +1244,247 @@ ur_result_t urKernelSetArgMemObj( return UR_RESULT_SUCCESS; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueUSMFill +ur_result_t UR_APICALL urEnqueueUSMFill( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + void *pMem, ///< [in][bounds(0, size)] pointer to USM memory object + size_t + patternSize, ///< [in] the size in bytes of the pattern. Must be a power of 2 and less + ///< than or equal to width. + const void + *pPattern, ///< [in] pointer with the bytes of the pattern to set. + size_t + size, ///< [in] size in bytes to be set. Must be a multiple of patternSize. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. 
+) { + auto pfnUSMFill = getContext()->urDdiTable.Enqueue.pfnUSMFill; + getContext()->logger.debug("==== urEnqueueUSMFill"); + + ur_event_handle_t hEvents[2] = {}; + UR_CALL(pfnUSMFill(hQueue, pMem, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, &hEvents[0])); + + const auto Mem = (uptr)pMem; + auto MemInfoItOp = getMsanInterceptor()->findAllocInfoByAddress(Mem); + if (MemInfoItOp) { + auto MemInfo = (*MemInfoItOp)->second; + + const auto &DeviceInfo = + getMsanInterceptor()->getDeviceInfo(MemInfo->Device); + const auto MemShadow = DeviceInfo->Shadow->MemToShadow(Mem); + + UR_CALL(EnqueueUSMBlockingSet(hQueue, (void *)MemShadow, 0, size, 0, + nullptr, &hEvents[1])); + } + + if (phEvent) { + UR_CALL(getContext()->urDdiTable.Enqueue.pfnEventsWait( + hQueue, 2, hEvents, phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueUSMMemcpy +ur_result_t UR_APICALL urEnqueueUSMMemcpy( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool blocking, ///< [in] blocking or non-blocking copy + void * + pDst, ///< [in][bounds(0, size)] pointer to the destination USM memory object + const void * + pSrc, ///< [in][bounds(0, size)] pointer to the source USM memory object + size_t size, ///< [in] size in bytes to be copied + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. 
If phEventWaitList and phEvent are not NULL, phEvent + ///< must not refer to an element of the phEventWaitList array. +) { + auto pfnUSMMemcpy = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy; + getContext()->logger.debug("==== pfnUSMMemcpy"); + + ur_event_handle_t hEvents[2] = {}; + UR_CALL(pfnUSMMemcpy(hQueue, blocking, pDst, pSrc, size, + numEventsInWaitList, phEventWaitList, &hEvents[0])); + + const auto Src = (uptr)pSrc, Dst = (uptr)pDst; + auto SrcInfoItOp = getMsanInterceptor()->findAllocInfoByAddress(Src); + auto DstInfoItOp = getMsanInterceptor()->findAllocInfoByAddress(Dst); + + if (SrcInfoItOp && DstInfoItOp) { + auto SrcInfo = (*SrcInfoItOp)->second; + auto DstInfo = (*DstInfoItOp)->second; + + const auto &DeviceInfo = + getMsanInterceptor()->getDeviceInfo(SrcInfo->Device); + const auto SrcShadow = DeviceInfo->Shadow->MemToShadow(Src); + const auto DstShadow = DeviceInfo->Shadow->MemToShadow(Dst); + + UR_CALL(pfnUSMMemcpy(hQueue, blocking, (void *)DstShadow, + (void *)SrcShadow, size, 0, nullptr, &hEvents[1])); + } else if (DstInfoItOp) { + auto DstInfo = (*DstInfoItOp)->second; + + const auto &DeviceInfo = + getMsanInterceptor()->getDeviceInfo(DstInfo->Device); + auto DstShadow = DeviceInfo->Shadow->MemToShadow(Dst); + + UR_CALL(EnqueueUSMBlockingSet(hQueue, (void *)DstShadow, 0, size, 0, + nullptr, &hEvents[1])); + } + + if (phEvent) { + UR_CALL(getContext()->urDdiTable.Enqueue.pfnEventsWait( + hQueue, 2, hEvents, phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueUSMFill2D +ur_result_t UR_APICALL urEnqueueUSMFill2D( + ur_queue_handle_t hQueue, ///< [in] handle of the queue to submit to. + void * + pMem, ///< [in][bounds(0, pitch * height)] pointer to memory to be filled. + size_t + pitch, ///< [in] the total width of the destination memory including padding. + size_t + patternSize, ///< [in] the size in bytes of the pattern. 
Must be a power of 2 and less + ///< than or equal to width. + const void + *pPattern, ///< [in] pointer with the bytes of the pattern to set. + size_t + width, ///< [in] the width in bytes of each row to fill. Must be a multiple of + ///< patternSize. + size_t height, ///< [in] the height of the columns to fill. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. +) { + auto pfnUSMFill2D = getContext()->urDdiTable.Enqueue.pfnUSMFill2D; + getContext()->logger.debug("==== urEnqueueUSMFill2D"); + + ur_event_handle_t hEvents[2] = {}; + UR_CALL(pfnUSMFill2D(hQueue, pMem, pitch, patternSize, pPattern, width, + height, numEventsInWaitList, phEventWaitList, + &hEvents[0])); + + const auto Mem = (uptr)pMem; + auto MemInfoItOp = getMsanInterceptor()->findAllocInfoByAddress(Mem); + if (MemInfoItOp) { + auto MemInfo = (*MemInfoItOp)->second; + + const auto &DeviceInfo = + getMsanInterceptor()->getDeviceInfo(MemInfo->Device); + const auto MemShadow = DeviceInfo->Shadow->MemToShadow(Mem); + + const char Pattern = 0; + UR_CALL(pfnUSMFill2D(hQueue, (void *)MemShadow, pitch, 1, &Pattern, + width, height, 0, nullptr, &hEvents[1])); + } + + if (phEvent) { + UR_CALL(getContext()->urDdiTable.Enqueue.pfnEventsWait( + hQueue, 2, hEvents, phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueUSMMemcpy2D +ur_result_t UR_APICALL 
urEnqueueUSMMemcpy2D( + ur_queue_handle_t hQueue, ///< [in] handle of the queue to submit to. + bool blocking, ///< [in] indicates if this operation should block the host. + void * + pDst, ///< [in][bounds(0, dstPitch * height)] pointer to memory where data will + ///< be copied. + size_t + dstPitch, ///< [in] the total width of the source memory including padding. + const void * + pSrc, ///< [in][bounds(0, srcPitch * height)] pointer to memory to be copied. + size_t + srcPitch, ///< [in] the total width of the source memory including padding. + size_t width, ///< [in] the width in bytes of each row to be copied. + size_t height, ///< [in] the height of columns to be copied. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< kernel execution instance. If phEventWaitList and phEvent are not + ///< NULL, phEvent must not refer to an element of the phEventWaitList array. 
+) { + auto pfnUSMMemcpy2D = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy2D; + getContext()->logger.debug("==== pfnUSMMemcpy2D"); + + ur_event_handle_t hEvents[2] = {}; + UR_CALL(pfnUSMMemcpy2D(hQueue, blocking, pDst, dstPitch, pSrc, srcPitch, + width, height, numEventsInWaitList, phEventWaitList, + &hEvents[0])); + + const auto Src = (uptr)pSrc, Dst = (uptr)pDst; + auto SrcInfoItOp = getMsanInterceptor()->findAllocInfoByAddress(Src); + auto DstInfoItOp = getMsanInterceptor()->findAllocInfoByAddress(Dst); + + if (SrcInfoItOp && DstInfoItOp) { + auto SrcInfo = (*SrcInfoItOp)->second; + auto DstInfo = (*DstInfoItOp)->second; + + const auto &DeviceInfo = + getMsanInterceptor()->getDeviceInfo(SrcInfo->Device); + const auto SrcShadow = DeviceInfo->Shadow->MemToShadow(Src); + const auto DstShadow = DeviceInfo->Shadow->MemToShadow(Dst); + + UR_CALL(pfnUSMMemcpy2D(hQueue, blocking, (void *)DstShadow, dstPitch, + (void *)SrcShadow, srcPitch, width, height, 0, + nullptr, &hEvents[1])); + } else if (DstInfoItOp) { + auto DstInfo = (*DstInfoItOp)->second; + + const auto &DeviceInfo = + getMsanInterceptor()->getDeviceInfo(DstInfo->Device); + const auto DstShadow = DeviceInfo->Shadow->MemToShadow(Dst); + + const char Pattern = 0; + UR_CALL(getContext()->urDdiTable.Enqueue.pfnUSMFill2D( + hQueue, (void *)DstShadow, dstPitch, 1, &Pattern, width, height, 0, + nullptr, &hEvents[1])); + } + + if (phEvent) { + UR_CALL(getContext()->urDdiTable.Enqueue.pfnEventsWait( + hQueue, 2, hEvents, phEvent)); + } + + return UR_RESULT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Global table /// with current process' addresses @@ -1391,6 +1642,10 @@ ur_result_t urGetEnqueueProcAddrTable( pDdiTable->pfnMemUnmap = ur_sanitizer_layer::msan::urEnqueueMemUnmap; pDdiTable->pfnKernelLaunch = ur_sanitizer_layer::msan::urEnqueueKernelLaunch; + pDdiTable->pfnUSMFill = 
ur_sanitizer_layer::msan::urEnqueueUSMFill; + pDdiTable->pfnUSMMemcpy = ur_sanitizer_layer::msan::urEnqueueUSMMemcpy; + pDdiTable->pfnUSMFill2D = ur_sanitizer_layer::msan::urEnqueueUSMFill2D; + pDdiTable->pfnUSMMemcpy2D = ur_sanitizer_layer::msan::urEnqueueUSMMemcpy2D; return result; } @@ -1408,6 +1663,7 @@ ur_result_t urGetUSMProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; pDdiTable->pfnDeviceAlloc = ur_sanitizer_layer::msan::urUSMDeviceAlloc; + pDdiTable->pfnFree = ur_sanitizer_layer::msan::urUSMFree; return result; } diff --git a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp index 3fa8caa073..cdaa088297 100644 --- a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp +++ b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp @@ -49,8 +49,7 @@ ur_result_t MsanInterceptor::allocateMemory(ur_context_handle_t Context, size_t Size, void **ResultPtr) { auto ContextInfo = getContextInfo(Context); - std::shared_ptr DeviceInfo = - Device ? 
getDeviceInfo(Device) : nullptr; + std::shared_ptr DeviceInfo = getDeviceInfo(Device); void *Allocated = nullptr; @@ -70,18 +69,32 @@ ur_result_t MsanInterceptor::allocateMemory(ur_context_handle_t Context, AI->print(); - // For updating shadow memory - ContextInfo->insertAllocInfo({Device}, AI); - // For memory release { std::scoped_lock Guard(m_AllocationMapMutex); - m_AllocationMap.emplace(AI->AllocBegin, std::move(AI)); + m_AllocationMap.emplace(AI->AllocBegin, AI); } + ManagedQueue Queue(Context, Device); + DeviceInfo->Shadow->EnqueuePoisonShadow(Queue, AI->AllocBegin, + AI->AllocSize, 0xff); + return UR_RESULT_SUCCESS; } +ur_result_t MsanInterceptor::releaseMemory(ur_context_handle_t Context, + void *Ptr) { + auto Addr = reinterpret_cast(Ptr); + auto AddrInfoItOp = findAllocInfoByAddress(Addr); + + if (AddrInfoItOp) { + std::scoped_lock Guard(m_AllocationMapMutex); + m_AllocationMap.erase(*AddrInfoItOp); + } + + return getContext()->urDdiTable.USM.pfnFree(Context, Ptr); +} + ur_result_t MsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, USMLaunchInfo &LaunchInfo) { @@ -98,8 +111,6 @@ ur_result_t MsanInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, UR_CALL(prepareLaunch(DeviceInfo, InternalQueue, Kernel, LaunchInfo)); - UR_CALL(updateShadowMemory(ContextInfo, DeviceInfo, InternalQueue)); - return UR_RESULT_SUCCESS; } @@ -124,29 +135,6 @@ ur_result_t MsanInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, return Result; } -ur_result_t -MsanInterceptor::enqueueAllocInfo(std::shared_ptr &DeviceInfo, - ur_queue_handle_t Queue, - std::shared_ptr &AI) { - return DeviceInfo->Shadow->EnqueuePoisonShadow(Queue, AI->AllocBegin, - AI->AllocSize, 0xff); -} - -ur_result_t -MsanInterceptor::updateShadowMemory(std::shared_ptr &ContextInfo, - std::shared_ptr &DeviceInfo, - ur_queue_handle_t Queue) { - auto &AllocInfos = ContextInfo->AllocInfosMap[DeviceInfo->Handle]; - std::scoped_lock Guard(AllocInfos.Mutex); - - for (auto &AI : 
AllocInfos.List) { - UR_CALL(enqueueAllocInfo(DeviceInfo, Queue, AI)); - } - AllocInfos.List.clear(); - - return UR_RESULT_SUCCESS; -} - ur_result_t MsanInterceptor::registerProgram(ur_program_handle_t Program) { ur_result_t Result = UR_RESULT_SUCCESS; @@ -417,13 +405,16 @@ MsanInterceptor::findAllocInfoByAddress(uptr Address) { std::shared_lock Guard(m_AllocationMapMutex); auto It = m_AllocationMap.upper_bound(Address); if (It == m_AllocationMap.begin()) { - return std::optional{}; + return std::nullopt; } --It; - // Make sure we got the right MsanAllocInfo - assert(Address >= It->second->AllocBegin && - Address < It->second->AllocBegin + It->second->AllocSize && - "Wrong MsanAllocInfo for the address"); + + // Since we haven't intercepted all USM APIs, we can't make sure the found AllocInfo is correct. + if (Address < It->second->AllocBegin || + Address >= It->second->AllocBegin + It->second->AllocSize) { + return std::nullopt; + } + return It; } diff --git a/source/loader/layers/sanitizer/msan/msan_interceptor.hpp b/source/loader/layers/sanitizer/msan/msan_interceptor.hpp index 579b6355a7..81c237380f 100644 --- a/source/loader/layers/sanitizer/msan/msan_interceptor.hpp +++ b/source/loader/layers/sanitizer/msan/msan_interceptor.hpp @@ -124,7 +124,6 @@ struct ContextInfo { std::atomic RefCount = 1; std::vector DeviceList; - std::unordered_map AllocInfosMap; explicit ContextInfo(ur_context_handle_t Context) : Handle(Context) { [[maybe_unused]] auto Result = @@ -133,15 +132,6 @@ struct ContextInfo { } ~ContextInfo(); - - void insertAllocInfo(const std::vector &Devices, - std::shared_ptr &AI) { - for (auto Device : Devices) { - auto &AllocInfos = AllocInfosMap[Device]; - std::scoped_lock Guard(AllocInfos.Mutex); - AllocInfos.List.emplace_back(AI); - } - } }; struct USMLaunchInfo { @@ -185,6 +175,7 @@ class MsanInterceptor { const ur_usm_desc_t *Properties, ur_usm_pool_handle_t Pool, size_t Size, void **ResultPtr); + ur_result_t releaseMemory(ur_context_handle_t 
Context, void *Ptr); ur_result_t registerProgram(ur_program_handle_t Program); ur_result_t unregisterProgram(ur_program_handle_t Program); @@ -259,15 +250,6 @@ class MsanInterceptor { bool isNormalExit() { return m_NormalExit; } private: - ur_result_t - updateShadowMemory(std::shared_ptr &ContextInfo, - std::shared_ptr &DeviceInfo, - ur_queue_handle_t Queue); - - ur_result_t enqueueAllocInfo(std::shared_ptr &DeviceInfo, - ur_queue_handle_t Queue, - std::shared_ptr &AI); - /// Initialize Global Variables & Kernel Name at first Launch ur_result_t prepareLaunch(std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.cpp b/source/loader/layers/sanitizer/msan/msan_shadow.cpp index 75866203f3..2cdf8600d2 100644 --- a/source/loader/layers/sanitizer/msan/msan_shadow.cpp +++ b/source/loader/layers/sanitizer/msan/msan_shadow.cpp @@ -111,21 +111,25 @@ uptr MsanShadowMemoryCPU::MemToShadow(uptr Ptr) { return Ptr ^ CPU_SHADOW_MASK; } -ur_result_t MsanShadowMemoryCPU::EnqueuePoisonShadow(ur_queue_handle_t, - uptr Ptr, uptr Size, - u8 Value) { - if (Size == 0) { - return UR_RESULT_SUCCESS; +ur_result_t MsanShadowMemoryCPU::EnqueuePoisonShadow( + ur_queue_handle_t Queue, uptr Ptr, uptr Size, u8 Value, uint32_t NumEvents, + const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent) { + + if (Size) { + const uptr ShadowBegin = MemToShadow(Ptr); + const uptr ShadowEnd = MemToShadow(Ptr + Size - 1); + assert(ShadowBegin <= ShadowEnd); + getContext()->logger.debug( + "EnqueuePoisonShadow(addr={}, count={}, value={})", + (void *)ShadowBegin, ShadowEnd - ShadowBegin + 1, + (void *)(size_t)Value); + memset((void *)ShadowBegin, Value, ShadowEnd - ShadowBegin + 1); } - uptr ShadowBegin = MemToShadow(Ptr); - uptr ShadowEnd = MemToShadow(Ptr + Size - 1); - assert(ShadowBegin <= ShadowEnd); - getContext()->logger.debug( - "EnqueuePoisonShadow(addr={}, count={}, value={})", (void *)ShadowBegin, - ShadowEnd - ShadowBegin + 1, 
(void *)(size_t)Value); - memset((void *)ShadowBegin, Value, ShadowEnd - ShadowBegin + 1); - + if (OutEvent) { + UR_CALL(getContext()->urDdiTable.Enqueue.pfnEventsWait( + Queue, NumEvents, EventWaitList, OutEvent)); + } return UR_RESULT_SUCCESS; } @@ -169,88 +173,103 @@ ur_result_t MsanShadowMemoryGPU::Destory() { return Result; } -ur_result_t MsanShadowMemoryGPU::EnqueuePoisonShadow(ur_queue_handle_t Queue, - uptr Ptr, uptr Size, - u8 Value) { - if (Size == 0) { - return UR_RESULT_SUCCESS; - } +ur_result_t MsanShadowMemoryGPU::EnqueueMapShadow( + ur_queue_handle_t Queue, uptr Ptr, uptr Size, + std::vector &EventWaitList, + ur_event_handle_t *OutEvent) { - uptr ShadowBegin = MemToShadow(Ptr); - uptr ShadowEnd = MemToShadow(Ptr + Size - 1); + const size_t PageSize = GetVirtualMemGranularity(Context, Device); + + const uptr ShadowBegin = MemToShadow(Ptr); + const uptr ShadowEnd = MemToShadow(Ptr + Size - 1); assert(ShadowBegin <= ShadowEnd); - { - static const size_t PageSize = - GetVirtualMemGranularity(Context, Device); - - ur_physical_mem_properties_t Desc{ - UR_STRUCTURE_TYPE_PHYSICAL_MEM_PROPERTIES, nullptr, 0}; - - // Make sure [Ptr, Ptr + Size] is mapped to physical memory - for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize); - MappedPtr <= ShadowEnd; MappedPtr += PageSize) { - std::scoped_lock Guard(VirtualMemMapsMutex); - if (VirtualMemMaps.find(MappedPtr) == VirtualMemMaps.end()) { - ur_physical_mem_handle_t PhysicalMem{}; - auto URes = getContext()->urDdiTable.PhysicalMem.pfnCreate( - Context, Device, PageSize, &Desc, &PhysicalMem); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("urPhysicalMemCreate(): {}", - URes); - return URes; - } - - URes = getContext()->urDdiTable.VirtualMem.pfnMap( - Context, (void *)MappedPtr, PageSize, PhysicalMem, 0, - UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("urVirtualMemMap({}, {}): {}", - (void *)MappedPtr, PageSize, - URes); - return URes; - 
} - - getContext()->logger.debug("urVirtualMemMap: {} ~ {}", - (void *)MappedPtr, - (void *)(MappedPtr + PageSize - 1)); - - // Initialize to zero - URes = EnqueueUSMBlockingSet(Queue, (void *)MappedPtr, 0, - PageSize); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("EnqueueUSMBlockingSet(): {}", - URes); - return URes; - } - - VirtualMemMaps[MappedPtr].first = PhysicalMem; + + // Make sure [Ptr, Ptr + Size] is mapped to physical memory + for (auto MappedPtr = RoundDownTo(ShadowBegin, PageSize); + MappedPtr <= ShadowEnd; MappedPtr += PageSize) { + std::scoped_lock Guard(VirtualMemMapsMutex); + if (VirtualMemMaps.find(MappedPtr) == VirtualMemMaps.end()) { + ur_physical_mem_handle_t PhysicalMem{}; + auto URes = getContext()->urDdiTable.PhysicalMem.pfnCreate( + Context, Device, PageSize, nullptr, &PhysicalMem); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("urPhysicalMemCreate(): {}", URes); + return URes; + } + + URes = getContext()->urDdiTable.VirtualMem.pfnMap( + Context, (void *)MappedPtr, PageSize, PhysicalMem, 0, + UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("urVirtualMemMap({}, {}): {}", + (void *)MappedPtr, PageSize, URes); + return URes; } - // We don't need to record virtual memory map for null pointer, - // since it doesn't have an alloc info. 
- if (Ptr == 0) { - continue; + getContext()->logger.debug("urVirtualMemMap: {} ~ {}", + (void *)MappedPtr, + (void *)(MappedPtr + PageSize - 1)); + + // Initialize to zero + URes = EnqueueUSMBlockingSet(Queue, (void *)MappedPtr, 0, PageSize, + EventWaitList.size(), + EventWaitList.data(), OutEvent); + if (URes != UR_RESULT_SUCCESS) { + getContext()->logger.error("EnqueueUSMSet(): {}", URes); + return URes; } - auto AllocInfoIt = - getMsanInterceptor()->findAllocInfoByAddress(Ptr); - assert(AllocInfoIt); - VirtualMemMaps[MappedPtr].second.insert((*AllocInfoIt)->second); + EventWaitList.clear(); + if (OutEvent) { + EventWaitList.push_back(*OutEvent); + } + + VirtualMemMaps[MappedPtr].first = PhysicalMem; + } + + // We don't need to record virtual memory map for null pointer, + // since it doesn't have an alloc info. + if (Ptr == 0) { + continue; + } + + auto AllocInfoIt = getMsanInterceptor()->findAllocInfoByAddress(Ptr); + assert(AllocInfoIt); + VirtualMemMaps[MappedPtr].second.insert((*AllocInfoIt)->second); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t MsanShadowMemoryGPU::EnqueuePoisonShadow( + ur_queue_handle_t Queue, uptr Ptr, uptr Size, u8 Value, uint32_t NumEvents, + const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent) { + if (Size == 0) { + if (OutEvent) { + UR_CALL(getContext()->urDdiTable.Enqueue.pfnEventsWait( + Queue, NumEvents, EventWaitList, OutEvent)); } + return UR_RESULT_SUCCESS; } - auto URes = EnqueueUSMBlockingSet(Queue, (void *)ShadowBegin, Value, - ShadowEnd - ShadowBegin + 1); + std::vector Events(EventWaitList, + EventWaitList + NumEvents); + UR_CALL(EnqueueMapShadow(Queue, Ptr, Size, Events, OutEvent)); + + const uptr ShadowBegin = MemToShadow(Ptr); + const uptr ShadowEnd = MemToShadow(Ptr + Size - 1); + assert(ShadowBegin <= ShadowEnd); + + auto Result = EnqueueUSMBlockingSet(Queue, (void *)ShadowBegin, Value, + ShadowEnd - ShadowBegin + 1, + Events.size(), Events.data(), OutEvent); + getContext()->logger.debug( - 
"EnqueuePoisonShadow (addr={}, count={}, value={}): {}", + "EnqueuePoisonShadow(addr={}, count={}, value={}): {}", (void *)ShadowBegin, ShadowEnd - ShadowBegin + 1, (void *)(size_t)Value, - URes); - if (URes != UR_RESULT_SUCCESS) { - getContext()->logger.error("EnqueueUSMBlockingSet(): {}", URes); - return URes; - } + Result); - return UR_RESULT_SUCCESS; + return Result; } ur_result_t diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.hpp b/source/loader/layers/sanitizer/msan/msan_shadow.hpp index de13683cbc..ca5791385c 100644 --- a/source/loader/layers/sanitizer/msan/msan_shadow.hpp +++ b/source/loader/layers/sanitizer/msan/msan_shadow.hpp @@ -32,8 +32,11 @@ struct MsanShadowMemory { virtual uptr MemToShadow(uptr Ptr) = 0; - virtual ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, - uptr Size, u8 Value) = 0; + virtual ur_result_t + EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size, u8 Value, + uint32_t NumEvents = 0, + const ur_event_handle_t *EventWaitList = nullptr, + ur_event_handle_t *OutEvent = nullptr) = 0; virtual ur_result_t ReleaseShadow(std::shared_ptr) { return UR_RESULT_SUCCESS; @@ -74,8 +77,11 @@ struct MsanShadowMemoryCPU final : public MsanShadowMemory { uptr MemToShadow(uptr Ptr) override; - ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, - uptr Size, u8 Value) override; + ur_result_t + EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size, u8 Value, + uint32_t NumEvents = 0, + const ur_event_handle_t *EventWaitList = nullptr, + ur_event_handle_t *OutEvent = nullptr) override; }; struct MsanShadowMemoryGPU : public MsanShadowMemory { @@ -85,19 +91,27 @@ struct MsanShadowMemoryGPU : public MsanShadowMemory { ur_result_t Setup() override; ur_result_t Destory() override; - ur_result_t EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, - uptr Size, u8 Value) override final; + + ur_result_t + EnqueuePoisonShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size, u8 Value, + uint32_t 
NumEvents = 0, + const ur_event_handle_t *EventWaitList = nullptr, + ur_event_handle_t *OutEvent = nullptr) override final; ur_result_t ReleaseShadow(std::shared_ptr AI) override final; virtual size_t GetShadowSize() = 0; - ur_mutex VirtualMemMapsMutex; + private: + ur_result_t EnqueueMapShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size, + std::vector &EventWaitList, + ur_event_handle_t *OutEvent); std::unordered_map< uptr, std::pair>>> VirtualMemMaps; + ur_mutex VirtualMemMapsMutex; }; /// Shadow Memory layout of GPU PVC device diff --git a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp index 900eae405b..758e81377f 100644 --- a/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp +++ b/source/loader/layers/sanitizer/sanitizer_common/sanitizer_utils.cpp @@ -247,9 +247,6 @@ ur_result_t EnqueueUSMBlockingSet(ur_queue_handle_t Queue, void *Ptr, char Value, size_t Size, uint32_t NumEvents, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent) { - if (Size == 0) { - return UR_RESULT_SUCCESS; - } return getContext()->urDdiTable.Enqueue.pfnUSMFill( Queue, Ptr, 1, &Value, Size, NumEvents, EventWaitList, OutEvent); } From dd094eb9845f85ec77fb3ba2d8b9890acf32c1ed Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Tue, 7 Jan 2025 10:39:26 +0000 Subject: [PATCH 11/14] Merge pull request #2504 from Bensuo/neil-fix [L0] Remove handle Translation in MCL if loader extension used --- source/adapters/level_zero/command_buffer.cpp | 63 +++++++++++++------ source/adapters/level_zero/platform.cpp | 1 + source/adapters/level_zero/platform.hpp | 6 ++ 3 files changed, 50 insertions(+), 20 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index a03a0a7f70..7065c8e167 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -948,41 
+948,53 @@ createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, auto Platform = CommandBuffer->Context->getPlatform(); auto ZeDevice = CommandBuffer->Device->ZeDevice; + ze_command_list_handle_t ZeCommandList = + CommandBuffer->ZeComputeCommandListTranslated; + if (Platform->ZeMutableCmdListExt.LoaderExtension) { + ZeCommandList = CommandBuffer->ZeComputeCommandList; + } if (NumKernelAlternatives > 0) { ZeMutableCommandDesc.flags |= ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_INSTRUCTION; - std::vector TranslatedKernelHandles( - NumKernelAlternatives + 1, nullptr); + std::vector KernelHandles(NumKernelAlternatives + 1, + nullptr); ze_kernel_handle_t ZeMainKernel{}; UR_CALL(getZeKernel(ZeDevice, Kernel, &ZeMainKernel)); - // Translate main kernel first - ZE2UR_CALL(zelLoaderTranslateHandle, - (ZEL_HANDLE_KERNEL, ZeMainKernel, - (void **)&TranslatedKernelHandles[0])); + if (Platform->ZeMutableCmdListExt.LoaderExtension) { + KernelHandles[0] = ZeMainKernel; + } else { + // If the L0 loader is not aware of the MCL extension, the main kernel + // handle needs to be translated. + ZE2UR_CALL(zelLoaderTranslateHandle, + (ZEL_HANDLE_KERNEL, ZeMainKernel, (void **)&KernelHandles[0])); + } for (size_t i = 0; i < NumKernelAlternatives; i++) { ze_kernel_handle_t ZeAltKernel{}; UR_CALL(getZeKernel(ZeDevice, KernelAlternatives[i], &ZeAltKernel)); - ZE2UR_CALL(zelLoaderTranslateHandle, - (ZEL_HANDLE_KERNEL, ZeAltKernel, - (void **)&TranslatedKernelHandles[i + 1])); + if (Platform->ZeMutableCmdListExt.LoaderExtension) { + KernelHandles[i + 1] = ZeAltKernel; + } else { + // If the L0 loader is not aware of the MCL extension, the kernel + // alternatives need to be translated. 
+ ZE2UR_CALL(zelLoaderTranslateHandle, (ZEL_HANDLE_KERNEL, ZeAltKernel, + (void **)&KernelHandles[i + 1])); + } } ZE2UR_CALL(Platform->ZeMutableCmdListExt .zexCommandListGetNextCommandIdWithKernelsExp, - (CommandBuffer->ZeComputeCommandListTranslated, - &ZeMutableCommandDesc, NumKernelAlternatives + 1, - TranslatedKernelHandles.data(), &CommandId)); + (ZeCommandList, &ZeMutableCommandDesc, NumKernelAlternatives + 1, + KernelHandles.data(), &CommandId)); } else { ZE2UR_CALL(Platform->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp, - (CommandBuffer->ZeComputeCommandListTranslated, - &ZeMutableCommandDesc, &CommandId)); + (ZeCommandList, &ZeMutableCommandDesc, &CommandId)); } DEBUG_LOG(CommandId); @@ -1862,17 +1874,22 @@ ur_result_t updateKernelCommand( ur_kernel_handle_t NewKernel = CommandDesc->hNewKernel; if (NewKernel && Command->Kernel != NewKernel) { + ze_kernel_handle_t KernelHandle{}; ze_kernel_handle_t ZeNewKernel{}; UR_CALL(getZeKernel(ZeDevice, NewKernel, &ZeNewKernel)); - ze_kernel_handle_t ZeKernelTranslated = nullptr; - ZE2UR_CALL(zelLoaderTranslateHandle, - (ZEL_HANDLE_KERNEL, ZeNewKernel, (void **)&ZeKernelTranslated)); + ze_command_list_handle_t ZeCommandList = + CommandBuffer->ZeComputeCommandList; + KernelHandle = ZeNewKernel; + if (!Platform->ZeMutableCmdListExt.LoaderExtension) { + ZeCommandList = CommandBuffer->ZeComputeCommandListTranslated; + ZE2UR_CALL(zelLoaderTranslateHandle, + (ZEL_HANDLE_KERNEL, ZeNewKernel, (void **)&KernelHandle)); + } ZE2UR_CALL(Platform->ZeMutableCmdListExt .zexCommandListUpdateMutableCommandKernelsExp, - (CommandBuffer->ZeComputeCommandListTranslated, 1, - &Command->CommandId, &ZeKernelTranslated)); + (ZeCommandList, 1, &Command->CommandId, &KernelHandle)); // Set current kernel to be the new kernel Command->Kernel = NewKernel; } @@ -2078,9 +2095,15 @@ ur_result_t updateKernelCommand( MutableCommandDesc.pNext = NextDesc; MutableCommandDesc.flags = 0; + ze_command_list_handle_t ZeCommandList = + 
CommandBuffer->ZeComputeCommandListTranslated; + if (Platform->ZeMutableCmdListExt.LoaderExtension) { + ZeCommandList = CommandBuffer->ZeComputeCommandList; + } + ZE2UR_CALL( Platform->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp, - (CommandBuffer->ZeComputeCommandListTranslated, &MutableCommandDesc)); + (ZeCommandList, &MutableCommandDesc)); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp index e383e77294..5e093aa646 100644 --- a/source/adapters/level_zero/platform.cpp +++ b/source/adapters/level_zero/platform.cpp @@ -384,6 +384,7 @@ ur_result_t ur_platform_handle_t_::initialize() { ZeMutableCmdListExt.Supported |= ZeMutableCmdListExt.zexCommandListGetNextCommandIdWithKernelsExp != nullptr; + ZeMutableCmdListExt.LoaderExtension = true; } else { ZeMutableCmdListExt.Supported |= (ZE_CALL_NOCHECK( diff --git a/source/adapters/level_zero/platform.hpp b/source/adapters/level_zero/platform.hpp index 0faa122651..1381f51bca 100644 --- a/source/adapters/level_zero/platform.hpp +++ b/source/adapters/level_zero/platform.hpp @@ -96,6 +96,12 @@ struct ur_platform_handle_t_ : public _ur_platform { // associated with particular Level Zero driver, store this extension here. struct ZeMutableCmdListExtension { bool Supported = false; + // If LoaderExtension is true, the L0 loader is aware of the MCL extension. + // If it is false, the extension has to be loaded directly from the driver + // using zeDriverGetExtensionFunctionAddress. If it is loaded directly from + // the driver, any handles passed to it must be translated using + // zelLoaderTranslateHandle. 
+ bool LoaderExtension = false; ze_result_t (*zexCommandListGetNextCommandIdExp)( ze_command_list_handle_t, const ze_mutable_command_id_exp_desc_t *, uint64_t *) = nullptr; From d0dc3f783deb5d0b3fa5cdafe4ef65739a16b7df Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Tue, 7 Jan 2025 10:40:15 +0000 Subject: [PATCH 12/14] Merge pull request #2517 from aarongreig/aaron/fixValgrindWarnInLogger Use std::string in create_logger to avoid valgrind warning in some configs --- source/common/logger/ur_logger.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/source/common/logger/ur_logger.hpp b/source/common/logger/ur_logger.hpp index c4dc655444..786bd32a00 100644 --- a/source/common/logger/ur_logger.hpp +++ b/source/common/logger/ur_logger.hpp @@ -118,16 +118,15 @@ inline Logger create_logger(std::string logger_name, bool skip_prefix, logger::Level default_log_level) { std::transform(logger_name.begin(), logger_name.end(), logger_name.begin(), ::toupper); - std::stringstream env_var_name; const auto default_flush_level = logger::Level::ERR; const std::string default_output = "stderr"; auto level = default_log_level; auto flush_level = default_flush_level; std::unique_ptr sink; - env_var_name << "UR_LOG_" << logger_name; + auto env_var_name = "UR_LOG_" + logger_name; try { - auto map = getenv_to_map(env_var_name.str().c_str()); + auto map = getenv_to_map(env_var_name.c_str()); if (!map.has_value()) { return Logger( default_log_level, @@ -173,7 +172,7 @@ inline Logger create_logger(std::string logger_name, bool skip_prefix, skip_linebreak); } catch (const std::invalid_argument &e) { std::cerr << "Error when creating a logger instance from the '" - << env_var_name.str() << "' environment variable:\n" + << env_var_name << "' environment variable:\n" << e.what() << std::endl; return Logger(default_log_level, std::make_unique( From 6f5023c28d471fbeb89dfb9a06a409e2f5950331 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Tue, 7 
Jan 2025 10:40:35 +0000 Subject: [PATCH 13/14] Merge pull request #2524 from nrspruit/fix_enqueue_wait_out_event [L0]: Fix Out Event in Enqueue Wait Events to prevent reuse --- source/adapters/level_zero/event.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index a854c50fd9..27557919b5 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -145,10 +145,6 @@ ur_result_t urEnqueueEventsWait( std::unique_lock Lock(Queue->Mutex); resetCommandLists(Queue); } - if (OutEvent && (*OutEvent)->Completed) { - UR_CALL(CleanupCompletedEvent((*OutEvent), false, false)); - UR_CALL(urEventReleaseInternal((*OutEvent))); - } return UR_RESULT_SUCCESS; } @@ -795,7 +791,7 @@ urEventWait(uint32_t NumEvents, ///< [in] number of events in the event list // ur_event_handle_t_ *Event = ur_cast(e); if (!Event->hasExternalRefs()) - die("urEventsWait must not be called for an internal event"); + die("urEventWait must not be called for an internal event"); ze_event_handle_t ZeHostVisibleEvent; if (auto Res = Event->getOrCreateHostVisibleEvent(ZeHostVisibleEvent)) From 1026b4734feed02525182cee4ead19b9c3b68ed7 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Tue, 7 Jan 2025 15:23:49 +0000 Subject: [PATCH 14/14] Set version to v0.11.3 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dff8615d10..8bf6108bfe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR) -project(unified-runtime VERSION 0.11.2) +project(unified-runtime VERSION 0.11.3) # Check if unified runtime is built as a standalone project. if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR UR_STANDALONE_BUILD)