Skip to content

Commit 3e69ce1

Browse files
committed
Add new launch property to support work_group_scratch_memory
#15061 introduces a new property, work_group_scratch_memory, which allows the user to request a given amount of local (work-group) memory for a kernel launch. In order to pass this information to the adapter, the patch adds a new launch property to urEnqueueKernelLaunchCustomExp. The patch also changes the signature of urEnqueueKernelLaunchCustomExp to add a global work offset parameter, so that launches made through this extension keep the same features as urEnqueueKernelLaunch. Signed-off-by: Victor Lomuller <[email protected]>
1 parent dddb238 commit 3e69ce1

File tree

19 files changed

+251
-103
lines changed

19 files changed

+251
-103
lines changed

include/ur_api.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9560,6 +9560,7 @@ typedef enum ur_exp_launch_property_id_t {
95609560
UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect
95619561
UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel
95629562
UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
9563+
UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
95639564
/// @cond
95649565
UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
95659566
/// @endcond
@@ -9573,10 +9574,12 @@ typedef enum ur_exp_launch_property_id_t {
95739574
/// _Analogues_
95749575
/// - **CUlaunchAttributeValue**
95759576
typedef union ur_exp_launch_property_value_t {
9576-
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9577-
///< value must be a divisor of the corresponding global work-size
9578-
///< dimension (in units of work-group).
9579-
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9577+
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9578+
///< value must be a divisor of the corresponding global work-size
9579+
///< dimension (in units of work-group).
9580+
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9581+
size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
9582+
///< allocate in bytes
95809583

95819584
} ur_exp_launch_property_value_t;
95829585

@@ -9617,6 +9620,7 @@ typedef struct ur_exp_launch_property_t {
96179620
/// + NULL == hQueue
96189621
/// + NULL == hKernel
96199622
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
9623+
/// + `NULL == pGlobalWorkOffset`
96209624
/// + `NULL == pGlobalWorkSize`
96219625
/// + `NULL == launchPropList`
96229626
/// + NULL == pGlobalWorkSize
@@ -9645,6 +9649,8 @@ urEnqueueKernelLaunchCustomExp(
96459649
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
96469650
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
96479651
///< work-group work-items
9652+
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
9653+
///< offset used to calculate the global ID of a work-item
96489654
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
96499655
///< number of global work-items in workDim that will execute the kernel
96509656
///< function
@@ -11554,6 +11560,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
1155411560
ur_queue_handle_t *phQueue;
1155511561
ur_kernel_handle_t *phKernel;
1155611562
uint32_t *pworkDim;
11563+
const size_t **ppGlobalWorkOffset;
1155711564
const size_t **ppGlobalWorkSize;
1155811565
const size_t **ppLocalWorkSize;
1155911566
uint32_t *pnumPropsInLaunchPropList;

include/ur_ddi.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,6 +1467,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
14671467
uint32_t,
14681468
const size_t *,
14691469
const size_t *,
1470+
const size_t *,
14701471
uint32_t,
14711472
const ur_exp_launch_property_t *,
14721473
uint32_t,

include/ur_print.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10397,6 +10397,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
1039710397
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
1039810398
os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
1039910399
break;
10400+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10401+
os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
10402+
break;
1040010403
default:
1040110404
os << "unknown enumerator";
1040210405
break;
@@ -10433,6 +10436,13 @@ inline ur_result_t printUnion(
1043310436

1043410437
os << (params.cooperative);
1043510438

10439+
break;
10440+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10441+
10442+
os << ".workgroup_mem_size = ";
10443+
10444+
os << (params.workgroup_mem_size);
10445+
1043610446
break;
1043710447
default:
1043810448
os << "<unknown>";
@@ -15100,6 +15110,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
1510015110

1510115111
os << *(params->pworkDim);
1510215112

15113+
os << ", ";
15114+
os << ".pGlobalWorkOffset = ";
15115+
15116+
ur::details::printPtr(os,
15117+
*(params->ppGlobalWorkOffset));
15118+
1510315119
os << ", ";
1510415120
os << ".pGlobalWorkSize = ";
1510515121

scripts/core/exp-launch-properties.yml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ etors:
2929
desc: "Whether to launch a cooperative kernel"
3030
- name: CLUSTER_DIMENSION
3131
desc: "work-group cluster dimensions"
32+
- name: WORK_GROUP_MEMORY
33+
desc: "Implicit work group memory allocation"
3234
--- #--------------------------------------------------------------------------
3335
type: union
3436
desc: "Specifies a launch property value"
@@ -45,6 +47,10 @@ members:
4547
name: cooperative
4648
desc: "[in] non-zero value indicates a cooperative kernel"
4749
tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
50+
- type: size_t
51+
name: workgroup_mem_size
52+
desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes"
53+
tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
4854
--- #--------------------------------------------------------------------------
4955
type: struct
5056
desc: "Kernel launch property"
@@ -82,6 +88,9 @@ params:
8288
- type: uint32_t
8389
name: workDim
8490
desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
91+
- type: "const size_t*"
92+
name: pGlobalWorkOffset
93+
desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
8594
- type: const size_t*
8695
name: pGlobalWorkSize
8796
desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -97,10 +106,10 @@ params:
97106
- type: uint32_t
98107
name: numEventsInWaitList
99108
desc: "[in] size of the event wait list"
100-
- type: const ur_event_handle_t*
109+
- type: const $x_event_handle_t*
101110
name: phEventWaitList
102111
desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. "
103-
- type: ur_event_handle_t*
112+
- type: $x_event_handle_t*
104113
name: phEvent
105114
desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
106115
returns:

source/adapters/cuda/enqueue.cpp

Lines changed: 70 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -422,11 +422,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
422422
phEventWaitList, phEvent);
423423
}
424424

425-
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
426-
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
427-
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
428-
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
429-
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
425+
static ur_result_t
426+
enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
427+
uint32_t workDim, const size_t *pGlobalWorkOffset,
428+
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
429+
uint32_t numEventsInWaitList,
430+
const ur_event_handle_t *phEventWaitList,
431+
ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
430432
// Preconditions
431433
UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
432434
UR_RESULT_ERROR_INVALID_KERNEL);
@@ -444,6 +446,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
444446
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
445447
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
446448

449+
// Set work group memory so we can compute the whole memory requirement
450+
if (WorkGroupMemory)
451+
hKernel->setWorkGroupMemory(WorkGroupMemory);
447452
uint32_t LocalSize = hKernel->getLocalSize();
448453
CUfunction CuFunc = hKernel->get();
449454

@@ -503,6 +508,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
503508
return UR_RESULT_SUCCESS;
504509
}
505510

511+
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
512+
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
513+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
514+
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
515+
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
516+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
517+
pGlobalWorkSize, pLocalWorkSize,
518+
numEventsInWaitList, phEventWaitList, phEvent,
519+
/*WorkGroupMemory=*/0);
520+
}
521+
506522
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
507523
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
508524
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -513,8 +529,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
513529
coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
514530
coop_prop.value.cooperative = 1;
515531
return urEnqueueKernelLaunchCustomExp(
516-
hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
517-
&coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
532+
hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
533+
pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
534+
phEvent);
518535
}
519536
return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
520537
pGlobalWorkSize, pLocalWorkSize,
@@ -523,16 +540,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
523540

524541
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
525542
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
526-
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
527-
uint32_t numPropsInLaunchPropList,
543+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
544+
const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
528545
const ur_exp_launch_property_t *launchPropList,
529546
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
530547
ur_event_handle_t *phEvent) {
531548

532-
if (numPropsInLaunchPropList == 0) {
533-
urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
534-
pLocalWorkSize, numEventsInWaitList, phEventWaitList,
535-
phEvent);
549+
size_t WorkGroupMemory = [&]() -> size_t {
550+
const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
551+
launchPropList, launchPropList + numPropsInLaunchPropList,
552+
[](const ur_exp_launch_property_t &Prop) {
553+
return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
554+
});
555+
if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
556+
return WorkGroupMemoryProp->value.workgroup_mem_size;
557+
return 0;
558+
}();
559+
560+
if (numPropsInLaunchPropList == 0 ||
561+
(WorkGroupMemory && numPropsInLaunchPropList == 1)) {
562+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
563+
pGlobalWorkSize, pLocalWorkSize,
564+
numEventsInWaitList, phEventWaitList, phEvent,
565+
WorkGroupMemory);
536566
}
537567
#if CUDA_VERSION >= 11080
538568
// Preconditions
@@ -545,7 +575,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
545575
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
546576
}
547577

548-
std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
578+
std::vector<CUlaunchAttribute> launch_attribute;
579+
launch_attribute.reserve(numPropsInLaunchPropList);
549580

550581
// Early exit for zero size kernel
551582
if (*pGlobalWorkSize == 0) {
@@ -558,40 +589,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
558589
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
559590
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
560591

592+
// Set work group memory so we can compute the whole memory requirement
593+
if (WorkGroupMemory)
594+
hKernel->setWorkGroupMemory(WorkGroupMemory);
561595
uint32_t LocalSize = hKernel->getLocalSize();
562596
CUfunction CuFunc = hKernel->get();
563597

564598
for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
565599
switch (launchPropList[i].id) {
566600
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
567-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
601+
auto &attr = launch_attribute.emplace_back();
602+
attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE;
568603
break;
569604
}
570605
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
571-
572-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
606+
auto &attr = launch_attribute.emplace_back();
607+
attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
573608
// Note that cuda orders from right to left wrt SYCL dimensional order.
574609
if (workDim == 3) {
575-
launch_attribute[i].value.clusterDim.x =
576-
launchPropList[i].value.clusterDim[2];
577-
launch_attribute[i].value.clusterDim.y =
578-
launchPropList[i].value.clusterDim[1];
579-
launch_attribute[i].value.clusterDim.z =
580-
launchPropList[i].value.clusterDim[0];
610+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2];
611+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
612+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0];
581613
} else if (workDim == 2) {
582-
launch_attribute[i].value.clusterDim.x =
583-
launchPropList[i].value.clusterDim[1];
584-
launch_attribute[i].value.clusterDim.y =
585-
launchPropList[i].value.clusterDim[0];
586-
launch_attribute[i].value.clusterDim.z =
587-
launchPropList[i].value.clusterDim[2];
614+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1];
615+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0];
616+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
588617
} else {
589-
launch_attribute[i].value.clusterDim.x =
590-
launchPropList[i].value.clusterDim[0];
591-
launch_attribute[i].value.clusterDim.y =
592-
launchPropList[i].value.clusterDim[1];
593-
launch_attribute[i].value.clusterDim.z =
594-
launchPropList[i].value.clusterDim[2];
618+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0];
619+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
620+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
595621
}
596622

597623
UR_CHECK_ERROR(cuFuncSetAttribute(
@@ -600,9 +626,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
600626
break;
601627
}
602628
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
603-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
604-
launch_attribute[i].value.cooperative =
605-
launchPropList[i].value.cooperative;
629+
auto &attr = launch_attribute.emplace_back();
630+
attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
631+
attr.value.cooperative = launchPropList[i].value.cooperative;
632+
break;
633+
}
634+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
606635
break;
607636
}
608637
default: {
@@ -615,8 +644,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
615644
// using the standard UR_CHECK_ERROR
616645
if (ur_result_t Ret =
617646
setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
618-
nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
619-
CuFunc, ThreadsPerBlock, BlocksPerGrid);
647+
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
648+
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
620649
Ret != UR_RESULT_SUCCESS)
621650
return Ret;
622651

@@ -664,7 +693,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
664693
launch_config.sharedMemBytes = LocalSize;
665694
launch_config.hStream = CuStream;
666695
launch_config.attrs = &launch_attribute[0];
667-
launch_config.numAttrs = numPropsInLaunchPropList;
696+
launch_config.numAttrs = launch_attribute.size();
668697

669698
UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
670699
const_cast<void **>(ArgIndices.data()),

0 commit comments

Comments
 (0)