diff --git a/.github/intel-llvm-mirror-base-commit b/.github/intel-llvm-mirror-base-commit index 0da183c3c1..aa573797e6 100644 --- a/.github/intel-llvm-mirror-base-commit +++ b/.github/intel-llvm-mirror-base-commit @@ -1 +1 @@ -9debdb88707e85fbf23dcf984e0f2414be5d3172 +77773ad18a8e411af65a089a46d6b44ee0891164 diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 4281f5e280..4d0b1ef5dc 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -70,10 +70,11 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( isInOrder(desc ? desc->isInOrder : false), commandListManager( context, device, - std::forward(commandList), - isInOrder ? v2::EVENT_FLAGS_COUNTER : 0, nullptr, - PoolCacheType::Regular), - context(context), device(device) {} + std::forward(commandList)), + context(context), device(device), + eventPool(context->getEventPoolCache(PoolCacheType::Regular) + .borrow(device->Id.value(), + isInOrder ? 
v2::EVENT_FLAGS_COUNTER : 0)) {} ur_exp_command_buffer_sync_point_t ur_exp_command_buffer_handle_t_::getSyncPoint(ur_event_handle_t event) { @@ -155,7 +156,6 @@ ur_result_t ur_exp_command_buffer_handle_t_::registerExecutionEventUnlocked( } if (nextExecutionEvent) { currentExecution = nextExecutionEvent; - UR_CALL(nextExecutionEvent->retain()); } return UR_RESULT_SUCCESS; } @@ -202,6 +202,21 @@ ur_result_t ur_exp_command_buffer_handle_t_::applyUpdateCommands( return UR_RESULT_SUCCESS; } + +ur_event_handle_t ur_exp_command_buffer_handle_t_::createEventIfRequested( + ur_exp_command_buffer_sync_point_t *retSyncPoint) { + if (retSyncPoint == nullptr) { + return nullptr; + } + + auto event = eventPool->allocate(); + event->setQueue(nullptr); + + *retSyncPoint = getSyncPoint(event); + + return event; +} + namespace ur::level_zero { ur_result_t @@ -292,18 +307,11 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( } auto eventsWaitList = commandBuffer->getWaitListFromSyncPoints( syncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (retSyncPoint != nullptr) { - event = &signalEvent; - } - UR_CALL(commandListLocked->appendKernelLaunch( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numSyncPointsInWaitList, eventsWaitList, event)); - if (retSyncPoint != nullptr) { - *retSyncPoint = commandBuffer->getSyncPoint(signalEvent); - } + UR_CALL(commandListLocked->appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, + nullptr, numSyncPointsInWaitList, eventsWaitList, + commandBuffer->createEventIfRequested(retSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) 
{ @@ -324,17 +332,11 @@ ur_result_t urCommandBufferAppendUSMMemcpyExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } + UR_CALL(commandListLocked->appendUSMMemcpy( - false, pDst, pSrc, size, numSyncPointsInWaitList, eventsWaitList, event)); + false, pDst, pSrc, size, numSyncPointsInWaitList, eventsWaitList, + hCommandBuffer->createEventIfRequested(pSyncPoint))); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -357,18 +359,11 @@ ur_result_t urCommandBufferAppendMemBufferCopyExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } + UR_CALL(commandListLocked->appendMemBufferCopy( hSrcMem, hDstMem, srcOffset, dstOffset, size, numSyncPointsInWaitList, - eventsWaitList, event)); + eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } return UR_RESULT_SUCCESS; } catch (...) 
{ return exceptionToResult(std::current_exception()); @@ -391,18 +386,11 @@ ur_result_t urCommandBufferAppendMemBufferWriteExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } - UR_CALL(commandListLocked->appendMemBufferWrite(hBuffer, false, offset, size, - pSrc, numSyncPointsInWaitList, - eventsWaitList, event)); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } + UR_CALL(commandListLocked->appendMemBufferWrite( + hBuffer, false, offset, size, pSrc, numSyncPointsInWaitList, + eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -423,18 +411,11 @@ ur_result_t urCommandBufferAppendMemBufferReadExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } - UR_CALL(commandListLocked->appendMemBufferRead(hBuffer, false, offset, size, - pDst, numSyncPointsInWaitList, - eventsWaitList, event)); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } + UR_CALL(commandListLocked->appendMemBufferRead( + hBuffer, false, offset, size, pDst, numSyncPointsInWaitList, + eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + return UR_RESULT_SUCCESS; } catch (...) 
{ return exceptionToResult(std::current_exception()); @@ -459,19 +440,12 @@ ur_result_t urCommandBufferAppendMemBufferCopyRectExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } + UR_CALL(commandListLocked->appendMemBufferCopyRect( hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, numSyncPointsInWaitList, - eventsWaitList, event)); + eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -496,19 +470,13 @@ ur_result_t urCommandBufferAppendMemBufferWriteRectExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } + UR_CALL(commandListLocked->appendMemBufferWriteRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numSyncPointsInWaitList, eventsWaitList, event)); + numSyncPointsInWaitList, eventsWaitList, + hCommandBuffer->createEventIfRequested(pSyncPoint))); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } return UR_RESULT_SUCCESS; } catch (...) 
{ return exceptionToResult(std::current_exception()); @@ -533,19 +501,13 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } + UR_CALL(commandListLocked->appendMemBufferReadRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numSyncPointsInWaitList, eventsWaitList, event)); + numSyncPointsInWaitList, eventsWaitList, + hCommandBuffer->createEventIfRequested(pSyncPoint))); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -565,17 +527,10 @@ ur_result_t urCommandBufferAppendUSMFillExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } - UR_CALL(commandListLocked->appendUSMFill(pMemory, patternSize, pPattern, size, - numSyncPointsInWaitList, - eventsWaitList, event)); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } + + UR_CALL(commandListLocked->appendUSMFill( + pMemory, patternSize, pPattern, size, numSyncPointsInWaitList, + eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) 
{ return exceptionToResult(std::current_exception()); @@ -596,17 +551,11 @@ ur_result_t urCommandBufferAppendMemBufferFillExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } + UR_CALL(commandListLocked->appendMemBufferFill( hBuffer, pPattern, patternSize, offset, size, numSyncPointsInWaitList, - eventsWaitList, event)); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } + eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -628,17 +577,11 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } + UR_CALL(commandListLocked->appendUSMPrefetch( - pMemory, size, flags, numSyncPointsInWaitList, eventsWaitList, event)); + pMemory, size, flags, numSyncPointsInWaitList, eventsWaitList, + hCommandBuffer->createEventIfRequested(pSyncPoint))); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } return UR_RESULT_SUCCESS; } catch (...) 
{ return exceptionToResult(std::current_exception()); @@ -658,17 +601,11 @@ ur_result_t urCommandBufferAppendUSMAdviseExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } + UR_CALL(commandListLocked->appendUSMAdvise( - pMemory, size, advice, numSyncPointsInWaitList, eventsWaitList, event)); + pMemory, size, advice, numSyncPointsInWaitList, eventsWaitList, + hCommandBuffer->createEventIfRequested(pSyncPoint))); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -714,23 +651,17 @@ ur_result_t urCommandBufferAppendNativeCommandExp( auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - ur_event_handle_t *event = nullptr; - ur_event_handle_t signalEvent = nullptr; - if (pSyncPoint != nullptr) { - event = &signalEvent; - } - UR_CALL(commandListLocked->appendBarrier(numSyncPointsInWaitList, - eventsWaitList, nullptr)); + + UR_CALL(commandListLocked->appendEventsWaitWithBarrier( + numSyncPointsInWaitList, eventsWaitList, nullptr)); // Call user-defined function immediately pfnNativeCommand(pData); // Barrier on all commands after user defined commands. 
- UR_CALL(commandListLocked->appendBarrier(0, nullptr, event)); + UR_CALL(commandListLocked->appendEventsWaitWithBarrier( + 0, nullptr, hCommandBuffer->createEventIfRequested(pSyncPoint))); - if (pSyncPoint != nullptr) { - *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); - } return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/v2/command_buffer.hpp b/source/adapters/level_zero/v2/command_buffer.hpp index 155c8c3b4a..697a214bba 100644 --- a/source/adapters/level_zero/v2/command_buffer.hpp +++ b/source/adapters/level_zero/v2/command_buffer.hpp @@ -59,6 +59,9 @@ struct ur_exp_command_buffer_handle_t_ : public ur_object { const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t numSyncPointsInWaitList); + ur_event_handle_t + createEventIfRequested(ur_exp_command_buffer_sync_point_t *retSyncPoint); + private: // Stores all sync points that are created by the command buffer. std::vector syncPoints; @@ -77,4 +80,6 @@ struct ur_exp_command_buffer_handle_t_ : public ur_object { bool isFinalized = false; ur_event_handle_t currentExecution = nullptr; + + v2::raii::cache_borrowed_event_pool eventPool; }; diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index 44f60c77a5..0a53b777ff 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -12,39 +12,36 @@ #include "../helpers/kernel_helpers.hpp" #include "../helpers/memory_helpers.hpp" #include "../ur_interface_loader.hpp" +#include "command_buffer.hpp" #include "context.hpp" #include "kernel.hpp" #include "memory.hpp" ur_command_list_manager::ur_command_list_manager( ur_context_handle_t context, ur_device_handle_t device, - v2::raii::command_list_unique_handle &&commandList, v2::event_flags_t flags, - ur_queue_t_ *queue, PoolCacheType listType) - : context(context), device(device), zeCommandList(std::move(commandList)), - queue(queue) { - auto 
&eventPoolTmp = context->getEventPoolCache(listType); - eventPool = eventPoolTmp.borrow(device->Id.value(), flags); + v2::raii::command_list_unique_handle &&commandList) + : hContext(context), hDevice(device), + zeCommandList(std::move(commandList)) { UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); } ur_command_list_manager::~ur_command_list_manager() { - ur::level_zero::urContextRelease(context); - ur::level_zero::urDeviceRelease(device); + ur::level_zero::urContextRelease(hContext); + ur::level_zero::urDeviceRelease(hDevice); } ur_result_t ur_command_list_manager::appendGenericFillUnlocked( ur_mem_buffer_t *dst, size_t offset, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, ur_command_t commandType) { auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pDst = ur_cast(dst->getDevicePtr( - device, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, + hDevice, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, zeCommandList.get(), waitListView)); // PatternSize must be a power of two for zeCommandListAppendMemoryFill. 
@@ -75,18 +72,17 @@ ur_result_t ur_command_list_manager::appendGenericFillUnlocked( ur_result_t ur_command_list_manager::appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, ur_command_t commandType) { auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pSrc = ur_cast(src->getDevicePtr( - device, ur_mem_buffer_t::device_access_mode_t::read_only, srcOffset, size, - zeCommandList.get(), waitListView)); + hDevice, ur_mem_buffer_t::device_access_mode_t::read_only, srcOffset, + size, zeCommandList.get(), waitListView)); auto pDst = ur_cast(dst->getDevicePtr( - device, ur_mem_buffer_t::device_access_mode_t::write_only, dstOffset, + hDevice, ur_mem_buffer_t::device_access_mode_t::write_only, dstOffset, size, zeCommandList.get(), waitListView)); ZE2UR_CALL(zeCommandListAppendMemoryCopy, @@ -105,20 +101,19 @@ ur_result_t ur_command_list_manager::appendRegionCopyUnlocked( ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, ur_command_t commandType) { auto zeParams = ur2zeRegionParams(srcOrigin, dstOrigin, region, srcRowPitch, dstRowPitch, srcSlicePitch, dstSlicePitch); auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pSrc = ur_cast(src->getDevicePtr( - device, ur_mem_buffer_t::device_access_mode_t::read_only, 0, + hDevice, 
ur_mem_buffer_t::device_access_mode_t::read_only, 0, src->getSize(), zeCommandList.get(), waitListView)); auto pDst = ur_cast(dst->getDevicePtr( - device, ur_mem_buffer_t::device_access_mode_t::write_only, 0, + hDevice, ur_mem_buffer_t::device_access_mode_t::write_only, 0, dst->getSize(), zeCommandList.get(), waitListView)); ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, @@ -151,67 +146,108 @@ wait_list_view ur_command_list_manager::getWaitListView( } ze_event_handle_t -ur_command_list_manager::getSignalEvent(ur_event_handle_t *hUserEvent, +ur_command_list_manager::getSignalEvent(ur_event_handle_t hUserEvent, ur_command_t commandType) { if (hUserEvent) { - *hUserEvent = eventPool->allocate(); - (*hUserEvent)->resetQueueAndCommand(queue, commandType); - return (*hUserEvent)->getZeEvent(); + hUserEvent->setCommandType(commandType); + return hUserEvent->getZeEvent(); } else { return nullptr; } } -ur_result_t ur_command_list_manager::appendKernelLaunch( +ur_result_t ur_command_list_manager::appendKernelLaunchUnlocked( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendKernelLaunch"); - + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, + bool cooperative) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(device); + ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice); std::scoped_lock Lock(hKernel->Mutex); ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; uint32_t WG[3]{}; - UR_CALL(calculateKernelWorkDimensions(hZeKernel, device, + 
UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice, zeThreadGroupDimensions, WG, workDim, pGlobalWorkSize, pLocalWorkSize)); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); - UR_CALL(hKernel->prepareForSubmission(context, device, pGlobalWorkOffset, + UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset, workDim, WG[0], WG[1], WG[2], - zeCommandList.get(), waitListView)); + getZeCommandList(), waitListView)); - TRACK_SCOPE_LATENCY( - "ur_command_list_manager::zeCommandListAppendLaunchKernel"); - ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions, - zeSignalEvent, waitListView.num, waitListView.handles)); + if (cooperative) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::" + "zeCommandListAppendLaunchCooperativeKernel"); + ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, + (getZeCommandList(), hZeKernel, &zeThreadGroupDimensions, + zeSignalEvent, waitListView.num, waitListView.handles)); + } else { + TRACK_SCOPE_LATENCY("ur_command_list_manager::" + "zeCommandListAppendLaunchKernel"); + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (getZeCommandList(), hZeKernel, &zeThreadGroupDimensions, + zeSignalEvent, waitListView.num, waitListView.handles)); + } + + recordSubmittedKernel(hKernel); postSubmit(hZeKernel, pGlobalWorkOffset); return UR_RESULT_SUCCESS; } +ur_result_t ur_command_list_manager::appendKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendKernelLaunch"); + + for (uint32_t propIndex = 0; propIndex < 
numPropsInLaunchPropList; + propIndex++) { + if (launchPropList[propIndex].id == + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE && + launchPropList[propIndex].value.cooperative) { + UR_CALL(appendKernelLaunchUnlocked(hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, + phEvent, true /* cooperative */)); + return UR_RESULT_SUCCESS; + } + if (launchPropList[propIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE && + launchPropList[propIndex].id != + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { + // We don't support any other properties. + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + } + + UR_CALL(appendKernelLaunchUnlocked( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent, false /* cooperative */)); + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_command_list_manager::appendUSMMemcpy( bool blocking, void *pDst, const void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy"); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); - auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -229,7 +265,7 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy( ur_result_t ur_command_list_manager::appendMemBufferFill( ur_mem_handle_t hMem, const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferFill"); auto hBuffer = hMem->getBuffer(); @@ -245,10 +281,10 @@ ur_result_t ur_command_list_manager::appendMemBufferFill( ur_result_t ur_command_list_manager::appendUSMFill( void *pMem, 
size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFill"); - ur_usm_handle_t dstHandle(context, size, pMem); + ur_usm_handle_t dstHandle(hContext, size, pMem); return appendGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, phEvent, UR_COMMAND_USM_FILL); @@ -257,11 +293,10 @@ ur_result_t ur_command_list_manager::appendUSMFill( ur_result_t ur_command_list_manager::appendUSMPrefetch( const void *pMem, size_t size, ur_usm_migration_flags_t /*flags*/, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch"); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); - auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -283,13 +318,12 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( ur_result_t ur_command_list_manager::appendUSMAdvise( const void *pMem, size_t size, ur_usm_advice_flags_t advice, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise"); auto zeAdvice = ur_cast(advice); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); - auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -299,7 +333,7 @@ ur_result_t ur_command_list_manager::appendUSMAdvise( } ZE2UR_CALL(zeCommandListAppendMemAdvise, - (zeCommandList.get(), device->ZeDevice, pMem, size, zeAdvice)); + (zeCommandList.get(), hDevice->ZeDevice, pMem, size, zeAdvice)); if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, @@ -308,33 +342,16 @@ ur_result_t 
ur_command_list_manager::appendUSMAdvise( return UR_RESULT_SUCCESS; } -ur_result_t -ur_command_list_manager::appendBarrier(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendBarrier"); - - auto zeSignalEvent = - getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); - - ZE2UR_CALL(zeCommandListAppendBarrier, - (zeCommandList.get(), zeSignalEvent, numWaitEvents, pWaitEvents)); - - return UR_RESULT_SUCCESS; -} - ur_result_t ur_command_list_manager::appendMemBufferRead( ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferRead"); auto hBuffer = hMem->getBuffer(); UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); - ur_usm_handle_t dstHandle(context, size, pDst); + ur_usm_handle_t dstHandle(hContext, size, pDst); std::scoped_lock lock(hBuffer->getMutex()); @@ -346,13 +363,13 @@ ur_result_t ur_command_list_manager::appendMemBufferRead( ur_result_t ur_command_list_manager::appendMemBufferWrite( ur_mem_handle_t hMem, bool blockingWrite, size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferWrite"); auto hBuffer = hMem->getBuffer(); UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); - ur_usm_handle_t srcHandle(context, size, pSrc); + ur_usm_handle_t srcHandle(hContext, size, pSrc); std::scoped_lock lock(hBuffer->getMutex()); 
@@ -364,7 +381,7 @@ ur_result_t ur_command_list_manager::appendMemBufferWrite( ur_result_t ur_command_list_manager::appendMemBufferCopy( ur_mem_handle_t hSrc, ur_mem_handle_t hDst, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferCopy"); auto hBufferSrc = hSrc->getBuffer(); @@ -389,11 +406,11 @@ ur_result_t ur_command_list_manager::appendMemBufferReadRect( ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferReadRect"); auto hBuffer = hMem->getBuffer(); - ur_usm_handle_t dstHandle(context, 0, pDst); + ur_usm_handle_t dstHandle(hContext, 0, pDst); std::scoped_lock lock(hBuffer->getMutex()); @@ -409,11 +426,11 @@ ur_result_t ur_command_list_manager::appendMemBufferWriteRect( ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferWriteRect"); auto hBuffer = hMem->getBuffer(); - ur_usm_handle_t srcHandle(context, 0, pSrc); + ur_usm_handle_t srcHandle(hContext, 0, pSrc); std::scoped_lock lock(hBuffer->getMutex()); @@ -429,7 +446,7 @@ ur_result_t ur_command_list_manager::appendMemBufferCopyRect( ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, 
size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferCopyRect"); auto hBufferSrc = hSrc->getBuffer(); @@ -447,21 +464,515 @@ ur_result_t ur_command_list_manager::appendMemBufferCopyRect( ur_result_t ur_command_list_manager::appendUSMMemcpy2D( bool blocking, void *pDst, size_t dstPitch, const void *pSrc, size_t srcPitch, size_t width, size_t height, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy2D"); ur_rect_offset_t zeroOffset{0, 0, 0}; ur_rect_region_t region{width, height, 0}; - ur_usm_handle_t srcHandle(context, 0, pSrc); - ur_usm_handle_t dstHandle(context, 0, pDst); + ur_usm_handle_t srcHandle(hContext, 0, pSrc); + ur_usm_handle_t dstHandle(hContext, 0, pDst); return appendRegionCopyUnlocked(&srcHandle, &dstHandle, blocking, zeroOffset, zeroOffset, region, srcPitch, 0, dstPitch, 0, numEventsInWaitList, phEventWaitList, phEvent, - UR_COMMAND_MEM_BUFFER_COPY_RECT); + UR_COMMAND_USM_MEMCPY_2D); +} + +ur_result_t ur_command_list_manager::appendTimestampRecordingExp( + bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendTimestampRecordingExp"); + + if (!phEvent) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + phEvent->recordStartTimestamp(); + + auto [timestampPtr, zeSignalEvent] = + (phEvent)->getEventEndTimestampAndHandle(); + + ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, + (getZeCommandList(), timestampPtr, zeSignalEvent, numWaitEvents, + 
pWaitEvents)); + + if (blocking) { + ZE2UR_CALL(zeCommandListHostSynchronize, (getZeCommandList(), UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::appendGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, + ur_event_handle_t additionalWaitEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendGenericCommandListsExp"); + + auto zeSignalEvent = getSignalEvent(phEvent, callerCommand); + auto [pWaitEvents, numWaitEvents] = getWaitListView( + phEventWaitList, numEventsInWaitList, additionalWaitEvent); + + ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + (getZeCommandList(), numCommandLists, phCommandLists, + zeSignalEvent, numWaitEvents, pWaitEvents)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::appendCommandBufferExp( + ur_exp_command_buffer_handle_t hCommandBuffer, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + + auto bufferCommandListLocked = hCommandBuffer->commandListManager.lock(); + ze_command_list_handle_t commandBufferCommandList = + bufferCommandListLocked->zeCommandList.get(); + + assert(phEvent); + + ur_event_handle_t executionEvent = + hCommandBuffer->getExecutionEventUnlocked(); + + if (executionEvent != nullptr) { + ZE2UR_CALL(zeEventHostSynchronize, + (executionEvent->getZeEvent(), UINT64_MAX)); + } + + UR_CALL(appendGenericCommandListsExp( + 1, &commandBufferCommandList, phEvent, numEventsInWaitList, + phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, executionEvent)); + UR_CALL(hCommandBuffer->registerExecutionEventUnlocked(phEvent)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::appendMemImageRead( + ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t origin, + ur_rect_region_t region, size_t 
rowPitch, size_t slicePitch, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemImageRead"); + + auto hImage = hMem->getImage(); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_READ); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto [zeImage, zeRegion] = + hImage->getRWRegion(origin, region, rowPitch, slicePitch); + + ZE2UR_CALL(zeCommandListAppendImageCopyToMemory, + (getZeCommandList(), pDst, zeImage, &zeRegion, zeSignalEvent, + waitListView.num, waitListView.handles)); + + if (blockingRead) { + ZE2UR_CALL(zeCommandListHostSynchronize, (getZeCommandList(), UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::appendMemImageWrite( + ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemImageWrite"); + + auto hImage = hMem->getImage(); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_WRITE); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto [zeImage, zeRegion] = + hImage->getRWRegion(origin, region, rowPitch, slicePitch); + + ZE2UR_CALL(zeCommandListAppendImageCopyFromMemory, + (getZeCommandList(), zeImage, pSrc, &zeRegion, zeSignalEvent, + waitListView.num, waitListView.handles)); + + if (blockingWrite) { + ZE2UR_CALL(zeCommandListHostSynchronize, (getZeCommandList(), UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::appendMemImageCopy( + ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, + uint32_t numEventsInWaitList, const 
ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemImageCopy"); + + auto hImageSrc = hSrc->getImage(); + auto hImageDst = hDst->getImage(); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_COPY); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto desc = ur_mem_image_t::getCopyRegions(*hImageSrc, *hImageDst, srcOrigin, + dstOrigin, region); + + auto [zeImageSrc, zeRegionSrc] = desc.src; + auto [zeImageDst, zeRegionDst] = desc.dst; + + ZE2UR_CALL(zeCommandListAppendImageCopyRegion, + (getZeCommandList(), zeImageDst, zeImageSrc, &zeRegionDst, + &zeRegionSrc, zeSignalEvent, waitListView.num, + waitListView.handles)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::appendMemBufferMap( + ur_mem_handle_t hMem, bool blockingMap, ur_map_flags_t mapFlags, + size_t offset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, + void **ppRetMap) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferMap"); + + auto hBuffer = hMem->getBuffer(); + + std::scoped_lock lock(hBuffer->getMutex()); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto pDst = ur_cast(hBuffer->mapHostPtr( + mapFlags, offset, size, zeCommandList.get(), waitListView)); + *ppRetMap = pDst; + + if (waitListView) { + // If memory was not migrated, we need to wait on the events here. 
+ ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (getZeCommandList(), waitListView.num, waitListView.handles)); + } + + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (getZeCommandList(), zeSignalEvent)); + } + + if (blockingMap) { + ZE2UR_CALL(zeCommandListHostSynchronize, (getZeCommandList(), UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::appendMemUnmap( + ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemUnmap"); + + auto hBuffer = hMem->getBuffer(); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + // TODO: currently unmapHostPtr deallocates memory immediately, + // since the memory might be used by the user, we need to make sure + // all dependencies are completed. + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (getZeCommandList(), waitListView.num, waitListView.handles)); + waitListView.clear(); + + hBuffer->unmapHostPtr(pMappedPtr, zeCommandList.get(), waitListView); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (getZeCommandList(), zeSignalEvent)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::appendUSMFill2D( + void * /*pMem*/, size_t /*pitch*/, size_t /*patternSize*/, + const void * /*pPattern*/, size_t /*width*/, size_t /*height*/, + uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +static void *getGlobalPointerFromModule(ze_module_handle_t hModule, + size_t offset, size_t count, + const char *name) { + // Find global variable pointer + size_t globalVarSize = 0; + void *globalVarPtr = nullptr; + ZE2UR_CALL_THROWS(zeModuleGetGlobalPointer, + (hModule, name, &globalVarSize, 
&globalVarPtr)); + if (globalVarSize < offset + count) { + setErrorMessage("Write device global variable is out of range.", + UR_RESULT_ERROR_INVALID_VALUE, + static_cast(ZE_RESULT_ERROR_INVALID_ARGUMENT)); + throw UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + return globalVarPtr; +} + +ur_result_t ur_command_list_manager::appendDeviceGlobalVariableWrite( + ur_program_handle_t hProgram, const char *name, bool blockingWrite, + size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY( + "ur_command_list_manager::appendDeviceGlobalVariableWrite"); + + // TODO: make getZeModuleHandle thread-safe + ze_module_handle_t zeModule = + hProgram->getZeModuleHandle(this->hDevice->ZeDevice); + + // Find global variable pointer + auto globalVarPtr = getGlobalPointerFromModule(zeModule, offset, count, name); + + // Locking is done inside appendUSMMemcpy + return appendUSMMemcpy(blockingWrite, ur_cast(globalVarPtr) + offset, + pSrc, count, numEventsInWaitList, phEventWaitList, + phEvent); +} + +ur_result_t ur_command_list_manager::appendDeviceGlobalVariableRead( + ur_program_handle_t hProgram, const char *name, bool blockingRead, + size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY( + "ur_command_list_manager::appendDeviceGlobalVariableRead"); + + // TODO: make getZeModuleHandle thread-safe + ze_module_handle_t zeModule = + hProgram->getZeModuleHandle(this->hDevice->ZeDevice); + + // Find global variable pointer + auto globalVarPtr = getGlobalPointerFromModule(zeModule, offset, count, name); + + // Locking is done inside appendUSMMemcpy + return appendUSMMemcpy(blockingRead, pDst, + ur_cast(globalVarPtr) + offset, count, + numEventsInWaitList, phEventWaitList, phEvent); +} + +ur_result_t ur_command_list_manager::appendReadHostPipe( + ur_program_handle_t 
/*hProgram*/, const char * /*pipe_symbol*/, + bool /*blocking*/, void * /*pDst*/, size_t /*size*/, + uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::appendWriteHostPipe( + ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, + bool /*blocking*/, void * /*pSrc*/, size_t /*size*/, + uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::appendUSMAllocHelper( + ur_queue_t_ *Queue, ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, void **ppMem, + ur_event_handle_t phEvent, ur_usm_type_t type) { + if (!pPool) { + pPool = hContext->getAsyncPool(); + } + + auto device = (type == UR_USM_TYPE_HOST) ? 
nullptr : hDevice; + + ur_event_handle_t originAllocEvent = nullptr; + auto asyncAlloc = pPool->allocateEnqueued(hContext, Queue, true, device, + nullptr, type, size); + if (!asyncAlloc) { + auto Ret = pPool->allocate(hContext, device, nullptr, type, size, ppMem); + if (Ret) { + return Ret; + } + } else { + std::tie(*ppMem, originAllocEvent) = *asyncAlloc; + } + + auto waitListView = + getWaitListView(phEventWaitList, numEventsInWaitList, originAllocEvent); + + ur_command_t commandType = UR_COMMAND_FORCE_UINT32; + switch (type) { + case UR_USM_TYPE_HOST: + commandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP; + break; + case UR_USM_TYPE_DEVICE: + commandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP; + break; + case UR_USM_TYPE_SHARED: + commandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP; + break; + default: + UR_LOG(ERR, "enqueueUSMAllocHelper: unsupported USM type"); + throw UR_RESULT_ERROR_INVALID_ARGUMENT; + } + + auto zeSignalEvent = getSignalEvent(phEvent, commandType); + auto [pWaitEvents, numWaitEvents] = waitListView; + + if (numWaitEvents > 0) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (getZeCommandList(), numWaitEvents, pWaitEvents)); + } + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (getZeCommandList(), zeSignalEvent)); + } + if (originAllocEvent) { + originAllocEvent->release(); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::appendUSMFreeExp( + ur_queue_t_ *Queue, ur_usm_pool_handle_t, void *pMem, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFreeExp"); + assert(phEvent); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_ENQUEUE_USM_FREE_EXP); + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + umf_memory_pool_handle_t hPool = umfPoolByPtr(pMem); + if (!hPool) { + return UR_RESULT_ERROR_INVALID_MEM_OBJECT; + } + + 
UsmPool *usmPool = nullptr; + auto ret = umfPoolGetTag(hPool, (void **)&usmPool); + if (ret != UMF_RESULT_SUCCESS || !usmPool) { + // This should never happen + UR_LOG(ERR, "enqueueUSMFreeExp: invalid pool tag"); + return UR_RESULT_ERROR_UNKNOWN; + } + + size_t size = umfPoolMallocUsableSize(hPool, pMem); + + if (numWaitEvents > 0) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (getZeCommandList(), numWaitEvents, pWaitEvents)); + } + + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (getZeCommandList(), zeSignalEvent)); + + // Insert must be done after the signal event is appended. + usmPool->asyncPool.insert(pMem, size, phEvent, Queue); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::bindlessImagesImageCopyExp( + const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, + const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_COPY); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + return bindlessImagesHandleCopyFlags( + pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, + pDstImageFormat, pCopyRegion, imageCopyFlags, getZeCommandList(), + zeSignalEvent, waitListView.num, waitListView.handles); +} + +ur_result_t ur_command_list_manager::bindlessImagesWaitExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t /*hSemaphore*/, bool /*hasWaitValue*/, + uint64_t /*waitValue*/, uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::bindlessImagesSignalExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t 
/*hSemaphore*/, bool /*hasSignalValue*/, + uint64_t /*signalValue*/, uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::appendNativeCommandExp( + ur_exp_enqueue_native_command_function_t, void *, uint32_t, + const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, + uint32_t, const ur_event_handle_t *, ur_event_handle_t) { + UR_LOG_LEGACY( + ERR, logger::LegacyMessage("[UR][L0_v2] {} function not implemented!"), + "{} function not implemented!", __FUNCTION__); + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +void ur_command_list_manager::recordSubmittedKernel( + ur_kernel_handle_t hKernel) { + submittedKernels.push_back(hKernel); + hKernel->RefCount.increment(); } ze_command_list_handle_t ur_command_list_manager::getZeCommandList() { return zeCommandList.get(); } + +ur_result_t ur_command_list_manager::appendEventsWait( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendEventsWait"); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT); + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + if (numWaitEvents > 0) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (zeCommandList.get(), numWaitEvents, pWaitEvents)); + } + + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (zeCommandList.get(), zeSignalEvent)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::appendEventsWaitWithBarrier( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendEventsWaitWithBarrier"); + + auto zeSignalEvent = + getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); + auto [pWaitEvents, 
numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + ZE2UR_CALL(zeCommandListAppendBarrier, + (zeCommandList.get(), zeSignalEvent, numWaitEvents, pWaitEvents)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::releaseSubmittedKernels() { + // Free deferred kernels + for (auto &hKernel : submittedKernels) { + UR_CALL(hKernel->release()); + } + submittedKernels.clear(); + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index b0c052361c..1f3b7594c3 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -37,12 +37,9 @@ struct wait_list_view { }; struct ur_command_list_manager { - ur_command_list_manager(ur_context_handle_t context, ur_device_handle_t device, - v2::raii::command_list_unique_handle &&commandList, - v2::event_flags_t flags, ur_queue_t_ *queue, - PoolCacheType listType); + v2::raii::command_list_unique_handle &&commandList); ur_command_list_manager(const ur_command_list_manager &src) = delete; ur_command_list_manager(ur_command_list_manager &&src) = default; @@ -52,127 +49,234 @@ struct ur_command_list_manager { ~ur_command_list_manager(); - ur_result_t appendKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, - const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); + ze_command_list_handle_t getZeCommandList(); - ur_result_t appendUSMMemcpy(bool blocking, void *pDst, const void *pSrc, - size_t size, uint32_t numEventsInWaitList, + ur_result_t releaseSubmittedKernels(); + + /************ Generic queue methods *************/ + ur_result_t appendEventsWait(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t + 
appendEventsWaitWithBarrier(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); + ur_event_handle_t phEvent); ur_result_t appendMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - + ur_event_handle_t phEvent); ur_result_t appendMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - - ur_result_t appendMemBufferCopy(ur_mem_handle_t hBufferSrc, - ur_mem_handle_t hBufferDst, size_t srcOffset, - size_t dstOffset, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - + ur_event_handle_t phEvent); ur_result_t appendMemBufferReadRect( ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); - + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); ur_result_t appendMemBufferWriteRect( ur_mem_handle_t hBuffer, bool blockingWrite, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - + ur_event_handle_t phEvent); + ur_result_t appendMemBufferCopy(ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, + size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + 
ur_event_handle_t phEvent); ur_result_t appendMemBufferCopyRect( ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); - - ur_result_t appendUSMMemcpy2D(bool blocking, void *pDst, size_t dstPitch, - const void *pSrc, size_t srcPitch, size_t width, - size_t height, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); ur_result_t appendMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - + ur_event_handle_t phEvent); + ur_result_t appendMemImageRead(ur_mem_handle_t hImage, bool blockingRead, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t appendMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t + appendMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t appendMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const 
ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent, void **ppRetMap); + ur_result_t appendMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); ur_result_t appendUSMFill(void *pMem, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - + ur_event_handle_t phEvent); + ur_result_t appendUSMMemcpy(bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t appendUSMFill2D(void *, size_t, size_t, const void *, size_t, + size_t, uint32_t, const ur_event_handle_t *, + ur_event_handle_t); + ur_result_t appendUSMMemcpy2D(bool, void *, size_t, const void *, size_t, + size_t, size_t, uint32_t, + const ur_event_handle_t *, ur_event_handle_t); ur_result_t appendUSMPrefetch(const void *pMem, size_t size, ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - + ur_event_handle_t phEvent); ur_result_t appendUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); + ur_event_handle_t phEvent); + ur_result_t appendDeviceGlobalVariableWrite( + ur_program_handle_t hProgram, const char *name, bool blockingWrite, + size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t appendDeviceGlobalVariableRead( + ur_program_handle_t hProgram, const char *name, bool blockingRead, + size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + ur_result_t 
appendReadHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pDst, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t appendWriteHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t bindlessImagesImageCopyExp( + const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, + const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + ur_result_t bindlessImagesWaitExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, + uint64_t waitValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + ur_result_t bindlessImagesSignalExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, + uint64_t signalValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + ur_result_t appendCooperativeKernelLaunchExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + ur_result_t + appendTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t + appendCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer, + uint32_t numEventsInWaitList, + const ur_event_handle_t 
*phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t appendKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + ur_result_t + appendNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, + uint32_t, const ur_mem_handle_t *, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t, const ur_event_handle_t *, + ur_event_handle_t); - ur_result_t appendBarrier(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); + ur_result_t appendUSMAllocHelper( + ur_queue_t_ *Queue, ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, void **ppMem, + ur_event_handle_t phEvent, ur_usm_type_t type); - ze_command_list_handle_t getZeCommandList(); + ur_result_t appendUSMFreeExp(ur_queue_t_ *Queue, ur_usm_pool_handle_t, + void *pMem, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t phEvent); + +private: + ur_result_t appendGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, + ur_event_handle_t additionalWaitEvent); + + void recordSubmittedKernel(ur_kernel_handle_t hKernel); wait_list_view getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, ur_event_handle_t additionalWaitEvent = nullptr); - ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, + ze_event_handle_t getSignalEvent(ur_event_handle_t hUserEvent, ur_command_t commandType); -private: + 
ur_result_t appendKernelLaunchUnlocked( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, + bool cooperative); + ur_result_t appendGenericFillUnlocked( ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, ur_command_t commandType); ur_result_t appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, ur_command_t commandType); + ur_event_handle_t phEvent, ur_command_t commandType); ur_result_t appendRegionCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, ur_command_t commandType); - // UR context associated with this command-buffer - ur_context_handle_t context; - // Device associated with this command-buffer - ur_device_handle_t device; - v2::raii::cache_borrowed_event_pool eventPool; + + ur_context_handle_t hContext; + ur_device_handle_t hDevice; + + std::vector submittedKernels; v2::raii::command_list_unique_handle zeCommandList; - ur_queue_t_ *queue; std::vector waitList; }; diff --git a/source/adapters/level_zero/v2/event.cpp b/source/adapters/level_zero/v2/event.cpp index 0b054589c6..30816a9fcd 100644 --- 
a/source/adapters/level_zero/v2/event.cpp +++ b/source/adapters/level_zero/v2/event.cpp @@ -109,10 +109,8 @@ ur_event_handle_t_::ur_event_handle_t_( : hContext(hContext), event_pool(pool), hZeEvent(std::move(hZeEvent)), flags(flags), profilingData(getZeEvent()) {} -void ur_event_handle_t_::resetQueueAndCommand(ur_queue_t_ *hQueue, - ur_command_t commandType) { +void ur_event_handle_t_::setQueue(ur_queue_t_ *hQueue) { this->hQueue = hQueue; - this->commandType = commandType; if (hQueue) { UR_CALL_THROWS(hQueue->queueGetInfo(UR_QUEUE_INFO_DEVICE, sizeof(hDevice), @@ -125,6 +123,10 @@ void ur_event_handle_t_::resetQueueAndCommand(ur_queue_t_ *hQueue, profilingData.reset(); } +void ur_event_handle_t_::setCommandType(ur_command_t commandType) { + this->commandType = commandType; +} + void ur_event_handle_t_::recordStartTimestamp() { // queue and device must be set before calling this assert(hQueue); diff --git a/source/adapters/level_zero/v2/event.hpp b/source/adapters/level_zero/v2/event.hpp index 6ed0ebccbc..0e9386578a 100644 --- a/source/adapters/level_zero/v2/event.hpp +++ b/source/adapters/level_zero/v2/event.hpp @@ -66,7 +66,8 @@ struct ur_event_handle_t_ : ur_object { const ur_event_native_properties_t *pProperties); // Set the queue and command that this event is associated with - void resetQueueAndCommand(ur_queue_t_ *hQueue, ur_command_t commandType); + void setQueue(ur_queue_t_ *hQueue); + void setCommandType(ur_command_t commandType); void reset(); ze_event_handle_t getZeEvent() const; @@ -100,7 +101,7 @@ struct ur_event_handle_t_ : ur_object { ur_device_handle_t getDevice() const; // Record the start timestamp of the event, to be obtained by - // urEventGetProfilingInfo. resetQueueAndCommand should be + // urEventGetProfilingInfo. setQueue should be // called before this. 
void recordStartTimestamp(); diff --git a/source/adapters/level_zero/v2/event_pool.cpp b/source/adapters/level_zero/v2/event_pool.cpp index d9639a1a6d..55e8ef0da9 100644 --- a/source/adapters/level_zero/v2/event_pool.cpp +++ b/source/adapters/level_zero/v2/event_pool.cpp @@ -36,7 +36,8 @@ ur_event_handle_t event_pool::allocate() { #ifndef NDEBUG // Set the command type to an invalid value to catch any misuses in tests - event->resetQueueAndCommand(nullptr, UR_COMMAND_FORCE_UINT32); + event->setQueue(nullptr); + event->setCommandType(UR_COMMAND_FORCE_UINT32); #endif return event; diff --git a/source/adapters/level_zero/v2/queue_create.cpp b/source/adapters/level_zero/v2/queue_create.cpp index 60f82bfddb..23259d9c34 100644 --- a/source/adapters/level_zero/v2/queue_create.cpp +++ b/source/adapters/level_zero/v2/queue_create.cpp @@ -15,6 +15,44 @@ #include "queue_handle.hpp" #include "queue_immediate_in_order.hpp" +namespace v2 { + +using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; + +static uint32_t getZeOrdinal(ur_device_handle_t hDevice) { + return hDevice->QueueGroup[queue_group_type::Compute].ZeOrdinal; +} + +static std::optional getZeIndex(const ur_queue_properties_t *pProps) { + if (pProps && pProps->pNext) { + const ur_base_properties_t *extendedDesc = + reinterpret_cast(pProps->pNext); + if (extendedDesc->stype == UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES) { + const ur_queue_index_properties_t *indexProperties = + reinterpret_cast(extendedDesc); + return indexProperties->computeIndex; + } + } + return std::nullopt; +} + +static ze_command_queue_priority_t getZePriority(ur_queue_flags_t flags) { + if ((flags & UR_QUEUE_FLAG_PRIORITY_LOW) != 0) + return ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; + if ((flags & UR_QUEUE_FLAG_PRIORITY_HIGH) != 0) + return ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; + return ZE_COMMAND_QUEUE_PRIORITY_NORMAL; +} + +static event_flags_t eventFlagsFromQueueFlags(ur_queue_flags_t flags) { + event_flags_t eventFlags = 
EVENT_FLAGS_COUNTER; + if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) + eventFlags |= EVENT_FLAGS_PROFILING_ENABLED; + return eventFlags; +} + +} // namespace v2 + namespace ur::level_zero { ur_result_t urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, @@ -24,9 +62,17 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, return UR_RESULT_ERROR_INVALID_DEVICE; } - // TODO: For now, always use immediate, in-order + ur_queue_flags_t flags = 0; + if (pProperties) { + flags = pProperties->flags; + } + + auto zeIndex = v2::getZeIndex(pProperties); + *phQueue = ur_queue_handle_t_::create( - hContext, hDevice, pProperties); + hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), + zeIndex, v2::eventFlagsFromQueueFlags(flags), flags); + return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -55,8 +101,19 @@ ur_result_t urQueueCreateWithNativeHandle( } } + auto commandListHandle = v2::raii::command_list_unique_handle( + reinterpret_cast(hNativeQueue), + [ownNativeHandle](ze_command_list_handle_t hZeCommandList) { + if (ownNativeHandle) { + if (checkL0LoaderTeardown()) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList)); + } + } + }); + *phQueue = ur_queue_handle_t_::create( - hContext, hDevice, hNativeQueue, flags, ownNativeHandle); + hContext, hDevice, std::move(commandListHandle), + v2::eventFlagsFromQueueFlags(flags), flags); return UR_RESULT_SUCCESS; } catch (...) 
{ diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index ad93a60a82..258cd45eb4 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -23,83 +23,30 @@ namespace v2 { -wait_list_view ur_queue_immediate_in_order_t::getWaitListView( - locked &commandList, - const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, - ur_event_handle_t additionalWaitEvent) { - return commandList->getWaitListView(phWaitEvents, numWaitEvents, - additionalWaitEvent); -} - -static uint32_t getZeOrdinal(ur_device_handle_t hDevice) { - return hDevice->QueueGroup[queue_group_type::Compute].ZeOrdinal; -} - -static std::optional getZeIndex(const ur_queue_properties_t *pProps) { - if (pProps && pProps->pNext) { - const ur_base_properties_t *extendedDesc = - reinterpret_cast(pProps->pNext); - if (extendedDesc->stype == UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES) { - const ur_queue_index_properties_t *indexProperties = - reinterpret_cast(extendedDesc); - return indexProperties->computeIndex; - } - } - return std::nullopt; -} - -static ze_command_queue_priority_t getZePriority(ur_queue_flags_t flags) { - if ((flags & UR_QUEUE_FLAG_PRIORITY_LOW) != 0) - return ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; - if ((flags & UR_QUEUE_FLAG_PRIORITY_HIGH) != 0) - return ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; - return ZE_COMMAND_QUEUE_PRIORITY_NORMAL; -} - -static event_flags_t eventFlagsFromQueueFlags(ur_queue_flags_t flags) { - event_flags_t eventFlags = EVENT_FLAGS_COUNTER; - if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) - eventFlags |= EVENT_FLAGS_PROFILING_ENABLED; - return eventFlags; -} - ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps) - : hContext(hContext), hDevice(hDevice), flags(pProps ? 
pProps->flags : 0), + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, std::optional index, + event_flags_t eventFlags, ur_queue_flags_t flags) + : hContext(hContext), hDevice(hDevice), commandListManager( hContext, hDevice, hContext->getCommandListCache().getImmediateCommandList( hDevice->ZeDevice, - {true, getZeOrdinal(hDevice), - true /* always enable copy offload */}, - ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - getZePriority(pProps ? pProps->flags : ur_queue_flags_t{}), - getZeIndex(pProps)), - eventFlagsFromQueueFlags(flags), this, PoolCacheType::Immediate) {} + {true, ordinal, true /* always enable copy offload */}, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority, index)), + flags(flags), + eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate) + .borrow(hDevice->Id.value(), eventFlags)) {} ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_native_handle_t hNativeHandle, ur_queue_flags_t flags, bool ownZeQueue) - : hContext(hContext), hDevice(hDevice), flags(flags), - commandListManager( - hContext, hDevice, - raii::command_list_unique_handle( - reinterpret_cast(hNativeHandle), - [ownZeQueue](ze_command_list_handle_t hZeCommandList) { - if (ownZeQueue) { - if (checkL0LoaderTeardown()) { - ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList)); - } - } - }), - eventFlagsFromQueueFlags(flags), this, PoolCacheType::Immediate) {} - -ze_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent( - locked &commandList, ur_event_handle_t *hUserEvent, - ur_command_t commandType) { - return commandList->getSignalEvent(hUserEvent, commandType); -} + raii::command_list_unique_handle commandListHandle, + event_flags_t eventFlags, ur_queue_flags_t flags) + : hContext(hContext), hDevice(hDevice), + commandListManager(hContext, hDevice, std::move(commandListHandle)), + flags(flags), + 
eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate) + .borrow(hDevice->Id.value(), eventFlags)) {} ur_result_t ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName, @@ -145,19 +92,17 @@ ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName, ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle( ur_queue_native_desc_t * /*pDesc*/, ur_native_handle_t *phNativeQueue) { *phNativeQueue = reinterpret_cast( - this->commandListManager.get_no_lock()->getZeCommandList()); + commandListManager.get_no_lock()->getZeCommandList()); return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::queueFinish() { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish"); - auto commandListLocked = commandListManager.lock(); - // TODO: use zeEventHostSynchronize instead? - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::zeCommandListHostSynchronize"); + auto lockedCommandListManager = commandListManager.lock(); + ZE2UR_CALL(zeCommandListHostSynchronize, - (commandListLocked->getZeCommandList(), UINT64_MAX)); + (lockedCommandListManager->getZeCommandList(), UINT64_MAX)); hContext->getAsyncPool()->cleanupPoolsForQueue(this); hContext->forEachUsmPool([this](ur_usm_pool_handle_t hPool) { @@ -165,21 +110,11 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() { return true; }); - // Free deferred kernels - for (auto &hKernel : submittedKernels) { - UR_CALL(hKernel->release()); - } - submittedKernels.clear(); + UR_CALL(lockedCommandListManager->releaseSubmittedKernels()); return UR_RESULT_SUCCESS; } -void ur_queue_immediate_in_order_t::recordSubmittedKernel( - ur_kernel_handle_t hKernel) { - submittedKernels.push_back(hKernel); - hKernel->RefCount.increment(); -} - ur_result_t ur_queue_immediate_in_order_t::queueFlush() { return UR_RESULT_SUCCESS; } @@ -192,98 +127,11 @@ ur_queue_immediate_in_order_t::~ur_queue_immediate_in_order_t() { } } -ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch( - 
ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueKernelLaunch"); - - for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; - propIndex++) { - if (launchPropList[propIndex].id == - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE && - launchPropList[propIndex].value.cooperative) { - return enqueueCooperativeKernelLaunchHelper( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent); - } - if (launchPropList[propIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE && - launchPropList[propIndex].id != - UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { - // We don't support any other properties. 
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - } - } - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendKernelLaunch( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent)); - - recordSubmittedKernel(hKernel); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueEventsWait"); - - auto commandListLocked = commandListManager.lock(); - if (!numEventsInWaitList && !phEvent) { - // nop - return UR_RESULT_SUCCESS; - } - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_EVENTS_WAIT); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - if (numWaitEvents > 0) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents)); - } - - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListLocked->getZeCommandList(), zeSignalEvent)); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierImpl( +ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY( "ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier"); - - auto commandListLocked = commandListManager.lock(); - if (!numEventsInWaitList && !phEvent) { - // nop - return UR_RESULT_SUCCESS; - } - - auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent, - UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - 
ZE2UR_CALL(zeCommandListAppendBarrier, - (commandListLocked->getZeCommandList(), zeSignalEvent, - numWaitEvents, pWaitEvents)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { // For in-order queue we don't need a real barrier, just wait for // requested events in potentially different queues and add a "barrier" // event signal because it is already guaranteed that previous commands @@ -291,768 +139,12 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( // need to use barrier if profiling is enabled: see // zeCommandListAppendWaitOnEvents if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) { - return enqueueEventsWaitWithBarrierImpl(numEventsInWaitList, - phEventWaitList, phEvent); + return commandListManager.lock()->appendEventsWaitWithBarrier( + numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); } else { - return enqueueEventsWait(numEventsInWaitList, phEventWaitList, phEvent); + return commandListManager.lock()->appendEventsWait( + numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); } } -ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierExt( - const ur_exp_enqueue_ext_properties_t *, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - return enqueueEventsWaitWithBarrier(numEventsInWaitList, phEventWaitList, - phEvent); -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferRead( - ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size, - void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferRead"); - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferRead( - hMem, 
blockingRead, offset, size, pDst, numEventsInWaitList, - phEventWaitList, phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferWrite( - ur_mem_handle_t hMem, bool blockingWrite, size_t offset, size_t size, - const void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferWrite"); - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferWrite( - hMem, blockingWrite, offset, size, pSrc, numEventsInWaitList, - phEventWaitList, phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferReadRect( - ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t bufferOrigin, - ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, - size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, - void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueMemBufferReadRect"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferReadRect( - hMem, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, numEventsInWaitList, - phEventWaitList, phEvent)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferWriteRect( - ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t bufferOrigin, - ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, - size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, - void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - 
"ur_queue_immediate_in_order_t::enqueueMemBufferWriteRect"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferWriteRect( - hMem, blockingWrite, bufferOrigin, hostOrigin, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, numEventsInWaitList, - phEventWaitList, phEvent)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferCopy( - ur_mem_handle_t hSrc, ur_mem_handle_t hDst, size_t srcOffset, - size_t dstOffset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferCopy"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferCopy( - hSrc, hDst, srcOffset, dstOffset, size, numEventsInWaitList, - phEventWaitList, phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferCopyRect( - ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, - ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, - size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueMemBufferCopyRect"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferCopyRect( - hSrc, hDst, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, - dstRowPitch, dstSlicePitch, numEventsInWaitList, phEventWaitList, - phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferFill( - ur_mem_handle_t hMem, const void *pPattern, size_t patternSize, - size_t offset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) 
{ - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferFill"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferFill( - hMem, pPattern, patternSize, offset, size, numEventsInWaitList, - phEventWaitList, phEvent)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageRead( - ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t origin, - ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemImageRead"); - - auto hImage = hMem->getImage(); - - auto commandListLocked = commandListManager.lock(); - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_MEM_IMAGE_READ); - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - auto [zeImage, zeRegion] = - hImage->getRWRegion(origin, region, rowPitch, slicePitch); - - ZE2UR_CALL(zeCommandListAppendImageCopyToMemory, - (commandListLocked->getZeCommandList(), pDst, zeImage, &zeRegion, - zeSignalEvent, waitListView.num, waitListView.handles)); - - if (blockingRead) { - ZE2UR_CALL(zeCommandListHostSynchronize, - (commandListLocked->getZeCommandList(), UINT64_MAX)); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageWrite( - ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t origin, - ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemImageWrite"); - - auto hImage = hMem->getImage(); - - auto commandListLocked = commandListManager.lock(); - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, 
UR_COMMAND_MEM_IMAGE_WRITE); - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - auto [zeImage, zeRegion] = - hImage->getRWRegion(origin, region, rowPitch, slicePitch); - - ZE2UR_CALL(zeCommandListAppendImageCopyFromMemory, - (commandListLocked->getZeCommandList(), zeImage, pSrc, &zeRegion, - zeSignalEvent, waitListView.num, waitListView.handles)); - - if (blockingWrite) { - ZE2UR_CALL(zeCommandListHostSynchronize, - (commandListLocked->getZeCommandList(), UINT64_MAX)); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageCopy( - ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, - ur_rect_offset_t dstOrigin, ur_rect_region_t region, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemImageWrite"); - - auto hImageSrc = hSrc->getImage(); - auto hImageDst = hDst->getImage(); - - auto commandListLocked = commandListManager.lock(); - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_MEM_IMAGE_COPY); - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - auto desc = ur_mem_image_t::getCopyRegions(*hImageSrc, *hImageDst, srcOrigin, - dstOrigin, region); - - auto [zeImageSrc, zeRegionSrc] = desc.src; - auto [zeImageDst, zeRegionDst] = desc.dst; - - ZE2UR_CALL(zeCommandListAppendImageCopyRegion, - (commandListLocked->getZeCommandList(), zeImageDst, zeImageSrc, - &zeRegionDst, &zeRegionSrc, zeSignalEvent, waitListView.num, - waitListView.handles)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( - ur_mem_handle_t hMem, bool blockingMap, ur_map_flags_t mapFlags, - size_t offset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - void **ppRetMap) { - 
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferMap"); - - auto hBuffer = hMem->getBuffer(); - - std::scoped_lock lock(hBuffer->getMutex()); - - auto commandListLocked = commandListManager.lock(); - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_MEM_BUFFER_MAP); - - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - auto pDst = ur_cast( - hBuffer->mapHostPtr(mapFlags, offset, size, - commandListLocked->getZeCommandList(), waitListView)); - *ppRetMap = pDst; - - if (waitListView) { - // If memory was not migrated, we need to wait on the events here. - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (commandListLocked->getZeCommandList(), waitListView.num, - waitListView.handles)); - } - - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListLocked->getZeCommandList(), zeSignalEvent)); - } - - if (blockingMap) { - ZE2UR_CALL(zeCommandListHostSynchronize, - (commandListLocked->getZeCommandList(), UINT64_MAX)); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( - ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemUnmap"); - - auto hBuffer = hMem->getBuffer(); - - auto commandListLocked = commandListManager.lock(); - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_MEM_UNMAP); - - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - // TODO: currently unmapHostPtr deallocates memory immediately, - // since the memory might be used by the user, we need to make sure - // all dependencies are completed. 
- ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (commandListLocked->getZeCommandList(), waitListView.num, - waitListView.handles)); - waitListView.clear(); - - hBuffer->unmapHostPtr(pMappedPtr, commandListLocked->getZeCommandList(), - waitListView); - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListLocked->getZeCommandList(), zeSignalEvent)); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFill( - void *pMem, size_t patternSize, const void *pPattern, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFill"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMFill(pMem, patternSize, pPattern, size, - numEventsInWaitList, phEventWaitList, - phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( - bool blocking, void *pDst, const void *pSrc, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - // TODO: parametrize latency tracking with 'blocking' - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMMemcpy"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMMemcpy(blocking, pDst, pSrc, size, - numEventsInWaitList, - phEventWaitList, phEvent)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( - const void *pMem, size_t size, ur_usm_migration_flags_t flags, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMPrefetch"); - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMPrefetch( - pMem, size, flags, numEventsInWaitList, phEventWaitList, phEvent)); 
- return UR_RESULT_SUCCESS; -} - -ur_result_t -ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, - ur_usm_advice_flags_t advice, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMAdvise"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMAdvise(pMem, size, advice, 0, nullptr, - phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFill2D( - void * /*pMem*/, size_t /*pitch*/, size_t /*patternSize*/, - const void * /*pPattern*/, size_t /*width*/, size_t /*height*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy2D( - bool blocking, void *pDst, size_t dstPitch, const void *pSrc, - size_t srcPitch, size_t width, size_t height, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMMemcpy2D"); - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMMemcpy2D( - blocking, pDst, dstPitch, pSrc, srcPitch, width, height, - numEventsInWaitList, phEventWaitList, phEvent)); - return UR_RESULT_SUCCESS; -} - -static void *getGlobalPointerFromModule(ze_module_handle_t hModule, - size_t offset, size_t count, - const char *name) { - // Find global variable pointer - size_t globalVarSize = 0; - void *globalVarPtr = nullptr; - ZE2UR_CALL_THROWS(zeModuleGetGlobalPointer, - (hModule, name, &globalVarSize, &globalVarPtr)); - if (globalVarSize < offset + count) { - setErrorMessage("Write device global variable is out of range.", - UR_RESULT_ERROR_INVALID_VALUE, - static_cast(ZE_RESULT_ERROR_INVALID_ARGUMENT)); - throw UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - return globalVarPtr; -} - -ur_result_t 
ur_queue_immediate_in_order_t::enqueueDeviceGlobalVariableWrite( - ur_program_handle_t hProgram, const char *name, bool blockingWrite, - size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueDeviceGlobalVariableWrite"); - - // TODO: make getZeModuleHandle thread-safe - ze_module_handle_t zeModule = - hProgram->getZeModuleHandle(this->hDevice->ZeDevice); - - // Find global variable pointer - auto globalVarPtr = getGlobalPointerFromModule(zeModule, offset, count, name); - - // Locking is done inside enqueueUSMMemcpy - return enqueueUSMMemcpy(blockingWrite, ur_cast(globalVarPtr) + offset, - pSrc, count, numEventsInWaitList, phEventWaitList, - phEvent); -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueDeviceGlobalVariableRead( - ur_program_handle_t hProgram, const char *name, bool blockingRead, - size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueDeviceGlobalVariableRead"); - - // TODO: make getZeModuleHandle thread-safe - ze_module_handle_t zeModule = - hProgram->getZeModuleHandle(this->hDevice->ZeDevice); - - // Find global variable pointer - auto globalVarPtr = getGlobalPointerFromModule(zeModule, offset, count, name); - - // Locking is done inside enqueueUSMMemcpy - return enqueueUSMMemcpy(blockingRead, pDst, - ur_cast(globalVarPtr) + offset, count, - numEventsInWaitList, phEventWaitList, phEvent); -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueReadHostPipe( - ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, - bool /*blocking*/, void * /*pDst*/, size_t /*size*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return 
UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueWriteHostPipe( - ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, - bool /*blocking*/, void * /*pSrc*/, size_t /*size*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMAllocHelper( - ur_usm_pool_handle_t pPool, const size_t size, - const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, void **ppMem, - ur_event_handle_t *phEvent, ur_usm_type_t type) { - auto commandListLocked = commandListManager.lock(); - - if (!pPool) { - pPool = hContext->getAsyncPool(); - } - - auto device = (type == UR_USM_TYPE_HOST) ? nullptr : hDevice; - - ur_event_handle_t originAllocEvent = nullptr; - auto asyncAlloc = pPool->allocateEnqueued(hContext, this, true, device, - nullptr, type, size); - if (!asyncAlloc) { - auto Ret = pPool->allocate(hContext, device, nullptr, type, size, ppMem); - if (Ret) { - return Ret; - } - } else { - std::tie(*ppMem, originAllocEvent) = *asyncAlloc; - } - - auto waitListView = getWaitListView(commandListLocked, phEventWaitList, - numEventsInWaitList, originAllocEvent); - - ur_command_t commandType = UR_COMMAND_FORCE_UINT32; - switch (type) { - case UR_USM_TYPE_HOST: - commandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP; - break; - case UR_USM_TYPE_DEVICE: - commandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP; - break; - case UR_USM_TYPE_SHARED: - commandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP; - break; - default: - UR_LOG(ERR, "enqueueUSMAllocHelper: unsupported USM type"); - throw UR_RESULT_ERROR_INVALID_ARGUMENT; - } - - auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent, commandType); - auto [pWaitEvents, numWaitEvents] = waitListView; - - if (numWaitEvents > 0) { - ZE2UR_CALL( - 
zeCommandListAppendWaitOnEvents, - (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents)); - } - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListLocked->getZeCommandList(), zeSignalEvent)); - } - if (originAllocEvent) { - originAllocEvent->release(); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp( - ur_usm_pool_handle_t pPool, const size_t size, - const ur_exp_async_usm_alloc_properties_t *pProperties, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - void **ppMem, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp"); - - return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList, - phEventWaitList, ppMem, phEvent, - UR_USM_TYPE_DEVICE); -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp( - ur_usm_pool_handle_t pPool, const size_t size, - const ur_exp_async_usm_alloc_properties_t *pProperties, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - void **ppMem, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp"); - - return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList, - phEventWaitList, ppMem, phEvent, - UR_USM_TYPE_SHARED); -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp( - ur_usm_pool_handle_t pPool, const size_t size, - const ur_exp_async_usm_alloc_properties_t *pProperties, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - void **ppMem, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp"); - - return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList, - phEventWaitList, ppMem, phEvent, - UR_USM_TYPE_HOST); -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFreeExp( - ur_usm_pool_handle_t, void 
*pMem, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFreeExp"); - auto commandListLocked = commandListManager.lock(); - ur_event_handle_t internalEvent = nullptr; - if (phEvent == nullptr) { - phEvent = &internalEvent; - } - - auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent, - UR_COMMAND_ENQUEUE_USM_FREE_EXP); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - umf_memory_pool_handle_t hPool = umfPoolByPtr(pMem); - if (!hPool) { - return UR_RESULT_ERROR_INVALID_MEM_OBJECT; - } - - UsmPool *usmPool = nullptr; - auto ret = umfPoolGetTag(hPool, (void **)&usmPool); - if (ret != UMF_RESULT_SUCCESS || !usmPool) { - // This should never happen - UR_LOG(ERR, "enqueueUSMFreeExp: invalid pool tag"); - return UR_RESULT_ERROR_UNKNOWN; - } - - size_t size = umfPoolMallocUsableSize(hPool, pMem); - if (internalEvent == nullptr) { - // When the output event is used instead of an internal event, we need to - // increment the refcount. - (*phEvent)->RefCount.increment(); - } - - if (numWaitEvents > 0) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents)); - } - - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListLocked->getZeCommandList(), zeSignalEvent)); - - // Insert must be done after the signal event is appended. 
- usmPool->asyncPool.insert(pMem, size, *phEvent, this); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::bindlessImagesImageCopyExp( - const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, - const ur_image_desc_t *pDstImageDesc, - const ur_image_format_t *pSrcImageFormat, - const ur_image_format_t *pDstImageFormat, - ur_exp_image_copy_region_t *pCopyRegion, - ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - auto commandListMgr = commandListManager.lock(); - - auto zeSignalEvent = - getSignalEvent(commandListMgr, phEvent, UR_COMMAND_MEM_IMAGE_COPY); - auto waitListView = - getWaitListView(commandListMgr, phEventWaitList, numEventsInWaitList); - - return bindlessImagesHandleCopyFlags( - pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, - pDstImageFormat, pCopyRegion, imageCopyFlags, - commandListMgr->getZeCommandList(), zeSignalEvent, waitListView.num, - waitListView.handles); -} - -ur_result_t -ur_queue_immediate_in_order_t::bindlessImagesWaitExternalSemaphoreExp( - ur_exp_external_semaphore_handle_t /*hSemaphore*/, bool /*hasWaitValue*/, - uint64_t /*waitValue*/, uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -ur_queue_immediate_in_order_t::bindlessImagesSignalExternalSemaphoreExp( - ur_exp_external_semaphore_handle_t /*hSemaphore*/, bool /*hasSignalValue*/, - uint64_t /*signalValue*/, uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchHelper( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, 
uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); - - UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - - ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice); - - std::scoped_lock Lock(hKernel->Mutex); - - auto commandListLocked = commandListManager.lock(); - ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; - uint32_t WG[3]{}; - UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice, - zeThreadGroupDimensions, WG, workDim, - pGlobalWorkSize, pLocalWorkSize)); - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_KERNEL_LAUNCH); - - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - UR_CALL(hKernel->prepareForSubmission( - hContext, hDevice, pGlobalWorkOffset, workDim, WG[0], WG[1], WG[2], - commandListLocked->getZeCommandList(), waitListView)); - - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::" - "zeCommandListAppendLaunchCooperativeKernel"); - ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, - (commandListLocked->getZeCommandList(), hZeKernel, - &zeThreadGroupDimensions, zeSignalEvent, waitListView.num, - waitListView.handles)); - - recordSubmittedKernel(hKernel); - - postSubmit(hZeKernel, pGlobalWorkOffset); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( - bool blocking, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp"); - - auto commandListLocked = commandListManager.lock(); - if (!phEvent) { - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - } - getSignalEvent(commandListLocked, phEvent, - 
UR_COMMAND_TIMESTAMP_RECORDING_EXP); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - (*phEvent)->recordStartTimestamp(); - - auto [timestampPtr, zeSignalEvent] = - (*phEvent)->getEventEndTimestampAndHandle(); - - ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, - (commandListLocked->getZeCommandList(), timestampPtr, - zeSignalEvent, numWaitEvents, pWaitEvents)); - - if (blocking) { - ZE2UR_CALL(zeCommandListHostSynchronize, - (commandListLocked->getZeCommandList(), UINT64_MAX)); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( - uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, - ur_event_handle_t additionalWaitEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp"); - - auto commandListLocked = commandListManager.lock(); - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, callerCommand); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList, - additionalWaitEvent); - - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, - (commandListLocked->getZeCommandList(), numCommandLists, - phCommandLists, zeSignalEvent, numWaitEvents, pWaitEvents)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBufferExp( - ur_exp_command_buffer_handle_t hCommandBuffer, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - auto commandListLocked = hCommandBuffer->commandListManager.lock(); - ze_command_list_handle_t commandBufferCommandList = - commandListLocked->getZeCommandList(); - ur_event_handle_t internalEvent = nullptr; - if (phEvent == nullptr) { - phEvent = &internalEvent; - } - 
ur_event_handle_t executionEvent = - hCommandBuffer->getExecutionEventUnlocked(); - - if (executionEvent != nullptr) { - ZE2UR_CALL(zeEventHostSynchronize, - (executionEvent->getZeEvent(), UINT64_MAX)); - } - - UR_CALL(enqueueGenericCommandListsExp( - 1, &commandBufferCommandList, phEvent, numEventsInWaitList, - phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, nullptr)); - UR_CALL(hCommandBuffer->registerExecutionEventUnlocked(*phEvent)); - if (internalEvent != nullptr) { - internalEvent->release(); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueNativeCommandExp( - ur_exp_enqueue_native_command_function_t, void *, uint32_t, - const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { - UR_LOG_LEGACY( - ERR, logger::LegacyMessage("[UR][L0_v2] {} function not implemented!"), - "{} function not implemented!", __FUNCTION__); - - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} } // namespace v2 diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 2e3ae8c59c..d7d879d9df 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -25,64 +25,50 @@ namespace v2 { -using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; - -struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ { +struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { private: ur_context_handle_t hContext; ur_device_handle_t hDevice; - ur_queue_flags_t flags; - lockable commandListManager; - std::vector submittedKernels; - - wait_list_view - getWaitListView(locked &commandList, - const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, - ur_event_handle_t additionalWaitEvent = nullptr); - - ze_event_handle_t getSignalEvent(locked &commandList, - ur_event_handle_t *hUserEvent, - 
ur_command_t commandType); - - ur_result_t enqueueGenericFillUnlocked( - ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType); + ur_queue_flags_t flags; + v2::raii::cache_borrowed_event_pool eventPool; - ur_result_t enqueueGenericCommandListsExp( - uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, - ur_event_handle_t additionalWaitEvent); + // Only create an event when requested by the user. + ur_event_handle_t createEventIfRequested(ur_event_handle_t *phEvent) { + if (phEvent == nullptr) { + return nullptr; + } - ur_result_t - enqueueEventsWaitWithBarrierImpl(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); + (*phEvent) = eventPool->allocate(); + (*phEvent)->setQueue(this); + return (*phEvent); + } - void recordSubmittedKernel(ur_kernel_handle_t hKernel); + // Always creates an event (used in functions that need to store the event + // internally). If event was requested by the user, also increase ref count of + // that event to avoid pre-mature release. 
+ ur_event_handle_t createEventAndRetain(ur_event_handle_t *phEvent) { + auto hEvent = eventPool->allocate(); + hEvent->setQueue(this); - inline ur_result_t enqueueCooperativeKernelLaunchHelper( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + if (phEvent) { + (*phEvent) = hEvent; + hEvent->retain(); + } - ur_result_t - enqueueUSMAllocHelper(ur_usm_pool_handle_t pPool, const size_t size, - const ur_exp_async_usm_alloc_properties_t *pProperties, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, void **ppMem, - ur_event_handle_t *phEvent, ur_usm_type_t Type); + return hEvent; + } public: ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, - const ur_queue_properties_t *); + uint32_t ordinal, + ze_command_queue_priority_t priority, + std::optional index, + event_flags_t eventFlags, + ur_queue_flags_t flags); ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, - ur_native_handle_t, ur_queue_flags_t, - bool ownZeQueue); + raii::command_list_unique_handle, event_flags_t, + ur_queue_flags_t); ~ur_queue_immediate_in_order_t(); @@ -98,158 +84,308 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ { const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_kernel_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, 
numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } ur_result_t enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; - ur_result_t enqueueEventsWaitWithBarrierExt( - const ur_exp_enqueue_ext_properties_t *pProperties, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + + ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendEventsWait( + numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t + enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return enqueueEventsWaitWithBarrier(numEventsInWaitList, phEventWaitList, + phEvent); + } + ur_result_t enqueueMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemBufferRead( + hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemBufferWrite( + hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueMemBufferReadRect( ur_mem_handle_t 
hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemBufferReadRect( + hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, + numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueMemBufferWriteRect( ur_mem_handle_t hBuffer, bool blockingWrite, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemBufferWriteRect( + hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, + bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, + numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemBufferCopy( + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueMemBufferCopyRect( ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, 
size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemBufferCopyRect( + hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, + srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemImageRead( + hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, + numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemImageWrite( + hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, + numEventsInWaitList, phEventWaitList, 
createEventIfRequested(phEvent)); + } + ur_result_t enqueueMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemImageCopy( + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, ur_map_flags_t mapFlags, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - void **ppRetMap) override; + void **ppRetMap) override { + return commandListManager.lock()->appendMemBufferMap( + hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent), ppRetMap); + } + ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendMemUnmap( + hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent)); + } + ur_result_t enqueueUSMFill(void *pMem, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendUSMFill( + pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent)); + } + ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t 
*phEvent) override; - ur_result_t enqueueUSMFill2D(void *, size_t, size_t, const void *, size_t, - size_t, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) override; - ur_result_t enqueueUSMMemcpy2D(bool, void *, size_t, const void *, size_t, - size_t, size_t, uint32_t, - const ur_event_handle_t *, - ur_event_handle_t *) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendUSMMemcpy( + blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent)); + } + + ur_result_t enqueueUSMFill2D(void *pMem, size_t pitch, size_t patternSize, + const void *pPattern, size_t width, + size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendUSMFill2D( + pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + + ur_result_t enqueueUSMMemcpy2D(bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, + size_t width, size_t height, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendUSMMemcpy2D( + blocking, pDst, dstPitch, pSrc, srcPitch, width, height, + numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size, ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendUSMPrefetch( + pMem, size, flags, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent)); + } + ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) 
override { + return commandListManager.lock()->appendUSMAdvise( + pMem, size, advice, 0, nullptr, createEventIfRequested(phEvent)); + } + ur_result_t enqueueDeviceGlobalVariableWrite( ur_program_handle_t hProgram, const char *name, bool blockingWrite, size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendDeviceGlobalVariableWrite( + hProgram, name, blockingWrite, count, offset, pSrc, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueDeviceGlobalVariableRead( ur_program_handle_t hProgram, const char *name, bool blockingRead, size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendDeviceGlobalVariableRead( + hProgram, name, blockingRead, count, offset, pDst, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, void *pDst, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendReadHostPipe( + hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendWriteHostPipe( + hProgram, pipe_symbol, blocking, 
pSrc, size, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueUSMDeviceAllocExp( ur_usm_pool_handle_t pPool, const size_t size, const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - void **ppMem, ur_event_handle_t *phEvent) override; + void **ppMem, ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendUSMAllocHelper( + this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, + ppMem, createEventIfRequested(phEvent), UR_USM_TYPE_DEVICE); + } + ur_result_t enqueueUSMSharedAllocExp( ur_usm_pool_handle_t pPool, const size_t size, const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - void **ppMem, ur_event_handle_t *phEvent) override; + void **ppMem, ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendUSMAllocHelper( + this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, + ppMem, createEventIfRequested(phEvent), UR_USM_TYPE_SHARED); + } + ur_result_t enqueueUSMHostAllocExp(ur_usm_pool_handle_t pPool, const size_t size, const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendUSMAllocHelper( + this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, + ppMem, createEventIfRequested(phEvent), UR_USM_TYPE_HOST); + } + ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendUSMFreeExp( + this, pPool, pMem, numEventsInWaitList, phEventWaitList, + 
createEventAndRetain(phEvent)); + } + ur_result_t bindlessImagesImageCopyExp( const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, const ur_image_desc_t *pDstImageDesc, @@ -258,32 +394,62 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ { ur_exp_image_copy_region_t *pCopyRegion, ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->bindlessImagesImageCopyExp( + pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, + pDstImageFormat, pCopyRegion, imageCopyFlags, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t bindlessImagesWaitExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->bindlessImagesWaitExternalSemaphoreExp( + hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t bindlessImagesSignalExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->bindlessImagesSignalExternalSemaphoreExp( + hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, + phEventWaitList, createEventIfRequested(phEvent)); + } + ur_result_t enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return 
commandListManager.lock()->appendTimestampRecordingExp( + blocking, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent)); + } + ur_result_t enqueueCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t - enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, - uint32_t, const ur_mem_handle_t *, - const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendCommandBufferExp( + hCommandBuffer, numEventsInWaitList, phEventWaitList, + createEventAndRetain(phEvent)); + } + + ur_result_t enqueueNativeCommandExp( + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList, + const ur_exp_enqueue_native_command_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->appendNativeCommandExp( + pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, + numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + } }; } // namespace v2 diff --git a/test/adapters/level_zero/v2/CMakeLists.txt b/test/adapters/level_zero/v2/CMakeLists.txt index 956c8fb147..bd57d78459 100644 --- a/test/adapters/level_zero/v2/CMakeLists.txt +++ b/test/adapters/level_zero/v2/CMakeLists.txt @@ -58,7 +58,6 @@ add_l0_v2_devices_test(event_pool ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_provider_counter.cpp ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_provider_normal.cpp ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event.cpp - ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/queue_api.cpp ${PROJECT_SOURCE_DIR}/source/ur/ur.cpp ) diff 
--git a/test/adapters/level_zero/v2/event_pool_test.cpp b/test/adapters/level_zero/v2/event_pool_test.cpp index 8da8afc2f1..2de31b8308 100644 --- a/test/adapters/level_zero/v2/event_pool_test.cpp +++ b/test/adapters/level_zero/v2/event_pool_test.cpp @@ -182,7 +182,8 @@ TEST_P(EventPoolTest, Basic) { auto pool = cache->borrow(device->Id.value(), getParam().flags); first = pool->allocate(); - first->resetQueueAndCommand(&queue->get(), UR_COMMAND_KERNEL_LAUNCH); + first->setQueue(nullptr); + first->setCommandType(UR_COMMAND_KERNEL_LAUNCH); zeFirst = first->getZeEvent(); urEventRelease(first); @@ -193,7 +194,8 @@ TEST_P(EventPoolTest, Basic) { auto pool = cache->borrow(device->Id.value(), getParam().flags); second = pool->allocate(); - first->resetQueueAndCommand(&queue->get(), UR_COMMAND_KERNEL_LAUNCH); + second->setQueue(nullptr); + second->setCommandType(UR_COMMAND_KERNEL_LAUNCH); zeSecond = second->getZeEvent(); urEventRelease(second); @@ -213,8 +215,8 @@ TEST_P(EventPoolTest, Threaded) { std::vector events; for (int i = 0; i < 100; ++i) { events.push_back(pool->allocate()); - events.back()->resetQueueAndCommand(&queue->get(), - UR_COMMAND_KERNEL_LAUNCH); + events.back()->setQueue(nullptr); + events.back()->setCommandType(UR_COMMAND_KERNEL_LAUNCH); } for (int i = 0; i < 100; ++i) { urEventRelease(events[i]); @@ -233,7 +235,8 @@ TEST_P(EventPoolTest, ProviderNormalUseMostFreePool) { std::list events; for (int i = 0; i < 128; ++i) { auto event = pool->allocate(); - event->resetQueueAndCommand(&queue->get(), UR_COMMAND_KERNEL_LAUNCH); + event->setQueue(nullptr); + event->setCommandType(UR_COMMAND_KERNEL_LAUNCH); events.push_back(event); } auto frontZeHandle = events.front()->getZeEvent(); @@ -243,7 +246,8 @@ TEST_P(EventPoolTest, ProviderNormalUseMostFreePool) { } for (int i = 0; i < 8; ++i) { auto e = pool->allocate(); - e->resetQueueAndCommand(&queue->get(), UR_COMMAND_KERNEL_LAUNCH); + e->setQueue(nullptr); + e->setCommandType(UR_COMMAND_KERNEL_LAUNCH); 
events.push_back(e); }