Skip to content

Commit 5acc824

Browse files
authored
Merge pull request intel#2403 from Naghasan/work_group_static_reland
Add new launch property to support work_group_scratch_memory
2 parents e7ee297 + 41ad797 commit 5acc824

File tree

19 files changed

+251
-103
lines changed

19 files changed

+251
-103
lines changed

include/ur_api.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9563,6 +9563,7 @@ typedef enum ur_exp_launch_property_id_t {
95639563
UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect
95649564
UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel
95659565
UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
9566+
UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
95669567
/// @cond
95679568
UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
95689569
/// @endcond
@@ -9576,10 +9577,12 @@ typedef enum ur_exp_launch_property_id_t {
95769577
/// _Analogues_
95779578
/// - **CUlaunchAttributeValue**
95789579
typedef union ur_exp_launch_property_value_t {
9579-
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9580-
///< value must be a divisor of the corresponding global work-size
9581-
///< dimension (in units of work-group).
9582-
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9580+
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9581+
///< value must be a divisor of the corresponding global work-size
9582+
///< dimension (in units of work-group).
9583+
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9584+
size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
9585+
///< allocate in bytes
95839586

95849587
} ur_exp_launch_property_value_t;
95859588

@@ -9620,6 +9623,7 @@ typedef struct ur_exp_launch_property_t {
96209623
/// + NULL == hQueue
96219624
/// + NULL == hKernel
96229625
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
9626+
/// + `NULL == pGlobalWorkOffset`
96239627
/// + `NULL == pGlobalWorkSize`
96249628
/// + `NULL == launchPropList`
96259629
/// + NULL == pGlobalWorkSize
@@ -9648,6 +9652,8 @@ urEnqueueKernelLaunchCustomExp(
96489652
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
96499653
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
96509654
///< work-group work-items
9655+
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
9656+
///< offset used to calculate the global ID of a work-item
96519657
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
96529658
///< number of global work-items in workDim that will execute the kernel
96539659
///< function
@@ -11557,6 +11563,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
1155711563
ur_queue_handle_t *phQueue;
1155811564
ur_kernel_handle_t *phKernel;
1155911565
uint32_t *pworkDim;
11566+
const size_t **ppGlobalWorkOffset;
1156011567
const size_t **ppGlobalWorkSize;
1156111568
const size_t **ppLocalWorkSize;
1156211569
uint32_t *pnumPropsInLaunchPropList;

include/ur_ddi.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,6 +1467,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
14671467
uint32_t,
14681468
const size_t *,
14691469
const size_t *,
1470+
const size_t *,
14701471
uint32_t,
14711472
const ur_exp_launch_property_t *,
14721473
uint32_t,

include/ur_print.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10412,6 +10412,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
1041210412
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
1041310413
os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
1041410414
break;
10415+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10416+
os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
10417+
break;
1041510418
default:
1041610419
os << "unknown enumerator";
1041710420
break;
@@ -10448,6 +10451,13 @@ inline ur_result_t printUnion(
1044810451

1044910452
os << (params.cooperative);
1045010453

10454+
break;
10455+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10456+
10457+
os << ".workgroup_mem_size = ";
10458+
10459+
os << (params.workgroup_mem_size);
10460+
1045110461
break;
1045210462
default:
1045310463
os << "<unknown>";
@@ -15115,6 +15125,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
1511515125

1511615126
os << *(params->pworkDim);
1511715127

15128+
os << ", ";
15129+
os << ".pGlobalWorkOffset = ";
15130+
15131+
ur::details::printPtr(os,
15132+
*(params->ppGlobalWorkOffset));
15133+
1511815134
os << ", ";
1511915135
os << ".pGlobalWorkSize = ";
1512015136

scripts/core/exp-launch-properties.yml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ etors:
2929
desc: "Whether to launch a cooperative kernel"
3030
- name: CLUSTER_DIMENSION
3131
desc: "work-group cluster dimensions"
32+
- name: WORK_GROUP_MEMORY
33+
desc: "Implicit work group memory allocation"
3234
--- #--------------------------------------------------------------------------
3335
type: union
3436
desc: "Specifies a launch property value"
@@ -45,6 +47,10 @@ members:
4547
name: cooperative
4648
desc: "[in] non-zero value indicates a cooperative kernel"
4749
tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
50+
- type: size_t
51+
name: workgroup_mem_size
52+
desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes"
53+
tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
4854
--- #--------------------------------------------------------------------------
4955
type: struct
5056
desc: "Kernel launch property"
@@ -82,6 +88,9 @@ params:
8288
- type: uint32_t
8389
name: workDim
8490
desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
91+
- type: "const size_t*"
92+
name: pGlobalWorkOffset
93+
desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
8594
- type: const size_t*
8695
name: pGlobalWorkSize
8796
desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -97,10 +106,10 @@ params:
97106
- type: uint32_t
98107
name: numEventsInWaitList
99108
desc: "[in] size of the event wait list"
100-
- type: const ur_event_handle_t*
109+
- type: const $x_event_handle_t*
101110
name: phEventWaitList
102111
desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, numEventsInWaitList must be 0, indicating that there are no events to wait on."
103-
- type: ur_event_handle_t*
112+
- type: $x_event_handle_t*
104113
name: phEvent
105114
desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
106115
returns:

source/adapters/cuda/enqueue.cpp

Lines changed: 70 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -422,11 +422,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
422422
phEventWaitList, phEvent);
423423
}
424424

425-
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
426-
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
427-
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
428-
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
429-
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
425+
static ur_result_t
426+
enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
427+
uint32_t workDim, const size_t *pGlobalWorkOffset,
428+
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
429+
uint32_t numEventsInWaitList,
430+
const ur_event_handle_t *phEventWaitList,
431+
ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
430432
// Preconditions
431433
UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
432434
UR_RESULT_ERROR_INVALID_KERNEL);
@@ -444,6 +446,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
444446
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
445447
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
446448

449+
// Set work group memory so we can compute the whole memory requirement
450+
if (WorkGroupMemory)
451+
hKernel->setWorkGroupMemory(WorkGroupMemory);
447452
uint32_t LocalSize = hKernel->getLocalSize();
448453
CUfunction CuFunc = hKernel->get();
449454

@@ -503,6 +508,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
503508
return UR_RESULT_SUCCESS;
504509
}
505510

511+
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
512+
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
513+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
514+
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
515+
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
516+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
517+
pGlobalWorkSize, pLocalWorkSize,
518+
numEventsInWaitList, phEventWaitList, phEvent,
519+
/*WorkGroupMemory=*/0);
520+
}
521+
506522
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
507523
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
508524
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -513,8 +529,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
513529
coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
514530
coop_prop.value.cooperative = 1;
515531
return urEnqueueKernelLaunchCustomExp(
516-
hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
517-
&coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
532+
hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
533+
pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
534+
phEvent);
518535
}
519536
return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
520537
pGlobalWorkSize, pLocalWorkSize,
@@ -523,16 +540,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
523540

524541
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
525542
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
526-
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
527-
uint32_t numPropsInLaunchPropList,
543+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
544+
const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
528545
const ur_exp_launch_property_t *launchPropList,
529546
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
530547
ur_event_handle_t *phEvent) {
531548

532-
if (numPropsInLaunchPropList == 0) {
533-
urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
534-
pLocalWorkSize, numEventsInWaitList, phEventWaitList,
535-
phEvent);
549+
size_t WorkGroupMemory = [&]() -> size_t {
550+
const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
551+
launchPropList, launchPropList + numPropsInLaunchPropList,
552+
[](const ur_exp_launch_property_t &Prop) {
553+
return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
554+
});
555+
if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
556+
return WorkGroupMemoryProp->value.workgroup_mem_size;
557+
return 0;
558+
}();
559+
560+
if (numPropsInLaunchPropList == 0 ||
561+
(WorkGroupMemory && numPropsInLaunchPropList == 1)) {
562+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
563+
pGlobalWorkSize, pLocalWorkSize,
564+
numEventsInWaitList, phEventWaitList, phEvent,
565+
WorkGroupMemory);
536566
}
537567
#if CUDA_VERSION >= 11080
538568
// Preconditions
@@ -545,7 +575,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
545575
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
546576
}
547577

548-
std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
578+
std::vector<CUlaunchAttribute> launch_attribute;
579+
launch_attribute.reserve(numPropsInLaunchPropList);
549580

550581
// Early exit for zero size kernel
551582
if (*pGlobalWorkSize == 0) {
@@ -558,40 +589,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
558589
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
559590
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
560591

592+
// Set work group memory so we can compute the whole memory requirement
593+
if (WorkGroupMemory)
594+
hKernel->setWorkGroupMemory(WorkGroupMemory);
561595
uint32_t LocalSize = hKernel->getLocalSize();
562596
CUfunction CuFunc = hKernel->get();
563597

564598
for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
565599
switch (launchPropList[i].id) {
566600
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
567-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
601+
auto &attr = launch_attribute.emplace_back();
602+
attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE;
568603
break;
569604
}
570605
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
571-
572-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
606+
auto &attr = launch_attribute.emplace_back();
607+
attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
573608
// Note that cuda orders from right to left wrt SYCL dimensional order.
574609
if (workDim == 3) {
575-
launch_attribute[i].value.clusterDim.x =
576-
launchPropList[i].value.clusterDim[2];
577-
launch_attribute[i].value.clusterDim.y =
578-
launchPropList[i].value.clusterDim[1];
579-
launch_attribute[i].value.clusterDim.z =
580-
launchPropList[i].value.clusterDim[0];
610+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2];
611+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
612+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0];
581613
} else if (workDim == 2) {
582-
launch_attribute[i].value.clusterDim.x =
583-
launchPropList[i].value.clusterDim[1];
584-
launch_attribute[i].value.clusterDim.y =
585-
launchPropList[i].value.clusterDim[0];
586-
launch_attribute[i].value.clusterDim.z =
587-
launchPropList[i].value.clusterDim[2];
614+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1];
615+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0];
616+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
588617
} else {
589-
launch_attribute[i].value.clusterDim.x =
590-
launchPropList[i].value.clusterDim[0];
591-
launch_attribute[i].value.clusterDim.y =
592-
launchPropList[i].value.clusterDim[1];
593-
launch_attribute[i].value.clusterDim.z =
594-
launchPropList[i].value.clusterDim[2];
618+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0];
619+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
620+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
595621
}
596622

597623
UR_CHECK_ERROR(cuFuncSetAttribute(
@@ -600,9 +626,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
600626
break;
601627
}
602628
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
603-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
604-
launch_attribute[i].value.cooperative =
605-
launchPropList[i].value.cooperative;
629+
auto &attr = launch_attribute.emplace_back();
630+
attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
631+
attr.value.cooperative = launchPropList[i].value.cooperative;
632+
break;
633+
}
634+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
606635
break;
607636
}
608637
default: {
@@ -615,8 +644,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
615644
// using the standard UR_CHECK_ERROR
616645
if (ur_result_t Ret =
617646
setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
618-
nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
619-
CuFunc, ThreadsPerBlock, BlocksPerGrid);
647+
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
648+
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
620649
Ret != UR_RESULT_SUCCESS)
621650
return Ret;
622651

@@ -664,7 +693,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
664693
launch_config.sharedMemBytes = LocalSize;
665694
launch_config.hStream = CuStream;
666695
launch_config.attrs = &launch_attribute[0];
667-
launch_config.numAttrs = numPropsInLaunchPropList;
696+
launch_config.numAttrs = launch_attribute.size();
668697

669698
UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
670699
const_cast<void **>(ArgIndices.data()),

0 commit comments

Comments
 (0)