Skip to content

Commit 7222f79

Browse files
committed
Add new launch property to support work_group_scratch_memory
#15061 introduces a new property, work_group_scratch_memory, which allows the user to set a given amount of local memory to be used. In order to pass this information to the adapter, the patch adds a new launch property to urEnqueueKernelLaunchCustomExp. The patch also changes the signature of urEnqueueKernelLaunchCustomExp to add a global offset parameter, so that existing features are maintained when using this extension. Signed-off-by: Victor Lomuller <[email protected]>
1 parent 38ee6ce commit 7222f79

19 files changed

+252
-94
lines changed

include/ur_api.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9537,6 +9537,7 @@ typedef enum ur_exp_launch_property_id_t {
95379537
UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect
95389538
UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel
95399539
UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
9540+
UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
95409541
/// @cond
95419542
UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
95429543
/// @endcond
@@ -9550,10 +9551,12 @@ typedef enum ur_exp_launch_property_id_t {
95509551
/// _Analogues_
95519552
/// - **CUlaunchAttributeValue**
95529553
typedef union ur_exp_launch_property_value_t {
9553-
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9554-
///< value must be a divisor of the corresponding global work-size
9555-
///< dimension (in units of work-group).
9556-
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9554+
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9555+
///< value must be a divisor of the corresponding global work-size
9556+
///< dimension (in units of work-group).
9557+
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9558+
size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
9559+
///< allocate in bytes
95579560

95589561
} ur_exp_launch_property_value_t;
95599562

@@ -9594,6 +9597,7 @@ typedef struct ur_exp_launch_property_t {
95949597
/// + NULL == hQueue
95959598
/// + NULL == hKernel
95969599
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
9600+
/// + `NULL == pGlobalWorkOffset`
95979601
/// + `NULL == pGlobalWorkSize`
95989602
/// + `NULL == launchPropList`
95999603
/// + NULL == pGlobalWorkSize
@@ -9622,6 +9626,8 @@ urEnqueueKernelLaunchCustomExp(
96229626
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
96239627
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
96249628
///< work-group work-items
9629+
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
9630+
///< offset used to calculate the global ID of a work-item
96259631
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
96269632
///< number of global work-items in workDim that will execute the kernel
96279633
///< function
@@ -11531,6 +11537,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
1153111537
ur_queue_handle_t *phQueue;
1153211538
ur_kernel_handle_t *phKernel;
1153311539
uint32_t *pworkDim;
11540+
const size_t **ppGlobalWorkOffset;
1153411541
const size_t **ppGlobalWorkSize;
1153511542
const size_t **ppLocalWorkSize;
1153611543
uint32_t *pnumPropsInLaunchPropList;

include/ur_ddi.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,6 +1467,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
14671467
uint32_t,
14681468
const size_t *,
14691469
const size_t *,
1470+
const size_t *,
14701471
uint32_t,
14711472
const ur_exp_launch_property_t *,
14721473
uint32_t,

include/ur_print.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10319,6 +10319,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
1031910319
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
1032010320
os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
1032110321
break;
10322+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10323+
os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
10324+
break;
1032210325
default:
1032310326
os << "unknown enumerator";
1032410327
break;
@@ -10355,6 +10358,13 @@ inline ur_result_t printUnion(
1035510358

1035610359
os << (params.cooperative);
1035710360

10361+
break;
10362+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10363+
10364+
os << ".workgroup_mem_size = ";
10365+
10366+
os << (params.workgroup_mem_size);
10367+
1035810368
break;
1035910369
default:
1036010370
os << "<unknown>";
@@ -15022,6 +15032,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
1502215032

1502315033
os << *(params->pworkDim);
1502415034

15035+
os << ", ";
15036+
os << ".pGlobalWorkOffset = ";
15037+
15038+
ur::details::printPtr(os,
15039+
*(params->ppGlobalWorkOffset));
15040+
1502515041
os << ", ";
1502615042
os << ".pGlobalWorkSize = ";
1502715043

scripts/core/exp-launch-properties.yml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ etors:
2929
desc: "Whether to launch a cooperative kernel"
3030
- name: CLUSTER_DIMENSION
3131
desc: "work-group cluster dimensions"
32+
- name: WORK_GROUP_MEMORY
33+
desc: "Implicit work group memory allocation"
3234
--- #--------------------------------------------------------------------------
3335
type: union
3436
desc: "Specifies a launch property value"
@@ -45,6 +47,10 @@ members:
4547
name: cooperative
4648
desc: "[in] non-zero value indicates a cooperative kernel"
4749
tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
50+
- type: size_t
51+
name: workgroup_mem_size
52+
desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes"
53+
tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
4854
--- #--------------------------------------------------------------------------
4955
type: struct
5056
desc: "Kernel launch property"
@@ -82,6 +88,9 @@ params:
8288
- type: uint32_t
8389
name: workDim
8490
desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
91+
- type: "const size_t*"
92+
name: pGlobalWorkOffset
93+
desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
8594
- type: const size_t*
8695
name: pGlobalWorkSize
8796
desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -97,10 +106,10 @@ params:
97106
- type: uint32_t
98107
name: numEventsInWaitList
99108
desc: "[in] size of the event wait list"
100-
- type: const ur_event_handle_t*
109+
- type: const $x_event_handle_t*
101110
name: phEventWaitList
102111
desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. "
103-
- type: ur_event_handle_t*
112+
- type: $x_event_handle_t*
104113
name: phEvent
105114
desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
106115
returns:

source/adapters/cuda/enqueue.cpp

Lines changed: 70 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -422,11 +422,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
422422
phEventWaitList, phEvent);
423423
}
424424

425-
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
426-
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
427-
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
428-
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
429-
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
425+
static ur_result_t
426+
enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
427+
uint32_t workDim, const size_t *pGlobalWorkOffset,
428+
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
429+
uint32_t numEventsInWaitList,
430+
const ur_event_handle_t *phEventWaitList,
431+
ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
430432
// Preconditions
431433
UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
432434
UR_RESULT_ERROR_INVALID_KERNEL);
@@ -444,6 +446,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
444446
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
445447
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
446448

449+
// Set work group memory so we can compute the whole memory requirement
450+
if (WorkGroupMemory)
451+
hKernel->setWorkGroupMemory(WorkGroupMemory);
447452
uint32_t LocalSize = hKernel->getLocalSize();
448453
CUfunction CuFunc = hKernel->get();
449454

@@ -506,6 +511,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
506511
return UR_RESULT_SUCCESS;
507512
}
508513

514+
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
515+
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
516+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
517+
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
518+
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
519+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
520+
pGlobalWorkSize, pLocalWorkSize,
521+
numEventsInWaitList, phEventWaitList, phEvent,
522+
/*WorkGroupMemory=*/0);
523+
}
524+
509525
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
510526
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
511527
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -516,8 +532,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
516532
coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
517533
coop_prop.value.cooperative = 1;
518534
return urEnqueueKernelLaunchCustomExp(
519-
hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
520-
&coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
535+
hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
536+
pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
537+
phEvent);
521538
}
522539
return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
523540
pGlobalWorkSize, pLocalWorkSize,
@@ -526,16 +543,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
526543

527544
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
528545
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
529-
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
530-
uint32_t numPropsInLaunchPropList,
546+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
547+
const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
531548
const ur_exp_launch_property_t *launchPropList,
532549
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
533550
ur_event_handle_t *phEvent) {
534551

535-
if (numPropsInLaunchPropList == 0) {
536-
urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
537-
pLocalWorkSize, numEventsInWaitList, phEventWaitList,
538-
phEvent);
552+
size_t WorkGroupMemory = [&]() -> size_t {
553+
const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
554+
launchPropList, launchPropList + numPropsInLaunchPropList,
555+
[](const ur_exp_launch_property_t &Prop) {
556+
return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
557+
});
558+
if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
559+
return WorkGroupMemoryProp->value.workgroup_mem_size;
560+
return 0;
561+
}();
562+
563+
if (numPropsInLaunchPropList == 0 ||
564+
(WorkGroupMemory && numPropsInLaunchPropList == 1)) {
565+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
566+
pGlobalWorkSize, pLocalWorkSize,
567+
numEventsInWaitList, phEventWaitList, phEvent,
568+
WorkGroupMemory);
539569
}
540570
#if CUDA_VERSION >= 11080
541571
// Preconditions
@@ -548,7 +578,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
548578
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
549579
}
550580

551-
std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
581+
std::vector<CUlaunchAttribute> launch_attribute;
582+
launch_attribute.reserve(numPropsInLaunchPropList);
552583

553584
// Early exit for zero size kernel
554585
if (*pGlobalWorkSize == 0) {
@@ -561,40 +592,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
561592
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
562593
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
563594

595+
// Set work group memory so we can compute the whole memory requirement
596+
if (WorkGroupMemory)
597+
hKernel->setWorkGroupMemory(WorkGroupMemory);
564598
uint32_t LocalSize = hKernel->getLocalSize();
565599
CUfunction CuFunc = hKernel->get();
566600

567601
for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
568602
switch (launchPropList[i].id) {
569603
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
570-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
604+
auto &attr = launch_attribute.emplace_back();
605+
attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE;
571606
break;
572607
}
573608
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
574-
575-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
609+
auto &attr = launch_attribute.emplace_back();
610+
attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
576611
// Note that cuda orders from right to left wrt SYCL dimensional order.
577612
if (workDim == 3) {
578-
launch_attribute[i].value.clusterDim.x =
579-
launchPropList[i].value.clusterDim[2];
580-
launch_attribute[i].value.clusterDim.y =
581-
launchPropList[i].value.clusterDim[1];
582-
launch_attribute[i].value.clusterDim.z =
583-
launchPropList[i].value.clusterDim[0];
613+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2];
614+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
615+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0];
584616
} else if (workDim == 2) {
585-
launch_attribute[i].value.clusterDim.x =
586-
launchPropList[i].value.clusterDim[1];
587-
launch_attribute[i].value.clusterDim.y =
588-
launchPropList[i].value.clusterDim[0];
589-
launch_attribute[i].value.clusterDim.z =
590-
launchPropList[i].value.clusterDim[2];
617+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1];
618+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0];
619+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
591620
} else {
592-
launch_attribute[i].value.clusterDim.x =
593-
launchPropList[i].value.clusterDim[0];
594-
launch_attribute[i].value.clusterDim.y =
595-
launchPropList[i].value.clusterDim[1];
596-
launch_attribute[i].value.clusterDim.z =
597-
launchPropList[i].value.clusterDim[2];
621+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0];
622+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
623+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
598624
}
599625

600626
UR_CHECK_ERROR(cuFuncSetAttribute(
@@ -603,9 +629,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
603629
break;
604630
}
605631
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
606-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
607-
launch_attribute[i].value.cooperative =
608-
launchPropList[i].value.cooperative;
632+
auto &attr = launch_attribute.emplace_back();
633+
attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
634+
attr.value.cooperative = launchPropList[i].value.cooperative;
635+
break;
636+
}
637+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
609638
break;
610639
}
611640
default: {
@@ -618,8 +647,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
618647
// using the standard UR_CHECK_ERROR
619648
if (ur_result_t Ret =
620649
setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
621-
nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
622-
CuFunc, ThreadsPerBlock, BlocksPerGrid);
650+
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
651+
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
623652
Ret != UR_RESULT_SUCCESS)
624653
return Ret;
625654

@@ -667,7 +696,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
667696
launch_config.sharedMemBytes = LocalSize;
668697
launch_config.hStream = CuStream;
669698
launch_config.attrs = &launch_attribute[0];
670-
launch_config.numAttrs = numPropsInLaunchPropList;
699+
launch_config.numAttrs = launch_attribute.size();
671700

672701
UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
673702
const_cast<void **>(ArgIndices.data()),

0 commit comments

Comments
 (0)