Skip to content

Commit 56583f1

Browse files
authored
Merge pull request #1968 from Naghasan/work_group_static
Add new launch property to support work_group_scratch_memory
2 parents 545781b + 5782497 commit 56583f1

19 files changed

+252
-94
lines changed

include/ur_api.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9560,6 +9560,7 @@ typedef enum ur_exp_launch_property_id_t {
95609560
UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect
95619561
UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel
95629562
UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
9563+
UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
95639564
/// @cond
95649565
UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
95659566
/// @endcond
@@ -9573,10 +9574,12 @@ typedef enum ur_exp_launch_property_id_t {
95739574
/// _Analogues_
95749575
/// - **CUlaunchAttributeValue**
95759576
typedef union ur_exp_launch_property_value_t {
9576-
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9577-
///< value must be a divisor of the corresponding global work-size
9578-
///< dimension (in units of work-group).
9579-
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9577+
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9578+
///< value must be a divisor of the corresponding global work-size
9579+
///< dimension (in units of work-group).
9580+
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9581+
size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
9582+
///< allocate in bytes
95809583

95819584
} ur_exp_launch_property_value_t;
95829585

@@ -9617,6 +9620,7 @@ typedef struct ur_exp_launch_property_t {
96179620
/// + NULL == hQueue
96189621
/// + NULL == hKernel
96199622
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
9623+
/// + `NULL == pGlobalWorkOffset`
96209624
/// + `NULL == pGlobalWorkSize`
96219625
/// + `NULL == launchPropList`
96229626
/// + NULL == pGlobalWorkSize
@@ -9645,6 +9649,8 @@ urEnqueueKernelLaunchCustomExp(
96459649
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
96469650
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
96479651
///< work-group work-items
9652+
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
9653+
///< offset used to calculate the global ID of a work-item
96489654
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
96499655
///< number of global work-items in workDim that will execute the kernel
96509656
///< function
@@ -11554,6 +11560,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
1155411560
ur_queue_handle_t *phQueue;
1155511561
ur_kernel_handle_t *phKernel;
1155611562
uint32_t *pworkDim;
11563+
const size_t **ppGlobalWorkOffset;
1155711564
const size_t **ppGlobalWorkSize;
1155811565
const size_t **ppLocalWorkSize;
1155911566
uint32_t *pnumPropsInLaunchPropList;

include/ur_ddi.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,6 +1467,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
14671467
uint32_t,
14681468
const size_t *,
14691469
const size_t *,
1470+
const size_t *,
14701471
uint32_t,
14711472
const ur_exp_launch_property_t *,
14721473
uint32_t,

include/ur_print.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10397,6 +10397,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
1039710397
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
1039810398
os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
1039910399
break;
10400+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10401+
os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
10402+
break;
1040010403
default:
1040110404
os << "unknown enumerator";
1040210405
break;
@@ -10433,6 +10436,13 @@ inline ur_result_t printUnion(
1043310436

1043410437
os << (params.cooperative);
1043510438

10439+
break;
10440+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10441+
10442+
os << ".workgroup_mem_size = ";
10443+
10444+
os << (params.workgroup_mem_size);
10445+
1043610446
break;
1043710447
default:
1043810448
os << "<unknown>";
@@ -15100,6 +15110,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
1510015110

1510115111
os << *(params->pworkDim);
1510215112

15113+
os << ", ";
15114+
os << ".pGlobalWorkOffset = ";
15115+
15116+
ur::details::printPtr(os,
15117+
*(params->ppGlobalWorkOffset));
15118+
1510315119
os << ", ";
1510415120
os << ".pGlobalWorkSize = ";
1510515121

scripts/core/exp-launch-properties.yml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ etors:
2929
desc: "Whether to launch a cooperative kernel"
3030
- name: CLUSTER_DIMENSION
3131
desc: "work-group cluster dimensions"
32+
- name: WORK_GROUP_MEMORY
33+
desc: "Implicit work group memory allocation"
3234
--- #--------------------------------------------------------------------------
3335
type: union
3436
desc: "Specifies a launch property value"
@@ -45,6 +47,10 @@ members:
4547
name: cooperative
4648
desc: "[in] non-zero value indicates a cooperative kernel"
4749
tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
50+
- type: size_t
51+
name: workgroup_mem_size
52+
desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes"
53+
tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
4854
--- #--------------------------------------------------------------------------
4955
type: struct
5056
desc: "Kernel launch property"
@@ -82,6 +88,9 @@ params:
8288
- type: uint32_t
8389
name: workDim
8490
desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
91+
- type: "const size_t*"
92+
name: pGlobalWorkOffset
93+
desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
8594
- type: const size_t*
8695
name: pGlobalWorkSize
8796
desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -97,10 +106,10 @@ params:
97106
- type: uint32_t
98107
name: numEventsInWaitList
99108
desc: "[in] size of the event wait list"
100-
- type: const ur_event_handle_t*
109+
- type: const $x_event_handle_t*
101110
name: phEventWaitList
102111
desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, numEventsInWaitList must be 0, indicating that there are no wait events."
103-
- type: ur_event_handle_t*
112+
- type: $x_event_handle_t*
104113
name: phEvent
105114
desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
106115
returns:

source/adapters/cuda/enqueue.cpp

Lines changed: 70 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -422,11 +422,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
422422
phEventWaitList, phEvent);
423423
}
424424

425-
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
426-
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
427-
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
428-
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
429-
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
425+
static ur_result_t
426+
enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
427+
uint32_t workDim, const size_t *pGlobalWorkOffset,
428+
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
429+
uint32_t numEventsInWaitList,
430+
const ur_event_handle_t *phEventWaitList,
431+
ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
430432
// Preconditions
431433
UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
432434
UR_RESULT_ERROR_INVALID_KERNEL);
@@ -444,6 +446,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
444446
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
445447
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
446448

449+
// Set work group memory so we can compute the whole memory requirement
450+
if (WorkGroupMemory)
451+
hKernel->setWorkGroupMemory(WorkGroupMemory);
447452
uint32_t LocalSize = hKernel->getLocalSize();
448453
CUfunction CuFunc = hKernel->get();
449454

@@ -506,6 +511,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
506511
return UR_RESULT_SUCCESS;
507512
}
508513

514+
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
515+
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
516+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
517+
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
518+
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
519+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
520+
pGlobalWorkSize, pLocalWorkSize,
521+
numEventsInWaitList, phEventWaitList, phEvent,
522+
/*WorkGroupMemory=*/0);
523+
}
524+
509525
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
510526
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
511527
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -516,8 +532,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
516532
coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
517533
coop_prop.value.cooperative = 1;
518534
return urEnqueueKernelLaunchCustomExp(
519-
hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
520-
&coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
535+
hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
536+
pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
537+
phEvent);
521538
}
522539
return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
523540
pGlobalWorkSize, pLocalWorkSize,
@@ -526,16 +543,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
526543

527544
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
528545
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
529-
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
530-
uint32_t numPropsInLaunchPropList,
546+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
547+
const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
531548
const ur_exp_launch_property_t *launchPropList,
532549
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
533550
ur_event_handle_t *phEvent) {
534551

535-
if (numPropsInLaunchPropList == 0) {
536-
urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
537-
pLocalWorkSize, numEventsInWaitList, phEventWaitList,
538-
phEvent);
552+
size_t WorkGroupMemory = [&]() -> size_t {
553+
const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
554+
launchPropList, launchPropList + numPropsInLaunchPropList,
555+
[](const ur_exp_launch_property_t &Prop) {
556+
return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
557+
});
558+
if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
559+
return WorkGroupMemoryProp->value.workgroup_mem_size;
560+
return 0;
561+
}();
562+
563+
if (numPropsInLaunchPropList == 0 ||
564+
(WorkGroupMemory && numPropsInLaunchPropList == 1)) {
565+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
566+
pGlobalWorkSize, pLocalWorkSize,
567+
numEventsInWaitList, phEventWaitList, phEvent,
568+
WorkGroupMemory);
539569
}
540570
#if CUDA_VERSION >= 11080
541571
// Preconditions
@@ -548,7 +578,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
548578
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
549579
}
550580

551-
std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
581+
std::vector<CUlaunchAttribute> launch_attribute;
582+
launch_attribute.reserve(numPropsInLaunchPropList);
552583

553584
// Early exit for zero size kernel
554585
if (*pGlobalWorkSize == 0) {
@@ -561,40 +592,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
561592
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
562593
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
563594

595+
// Set work group memory so we can compute the whole memory requirement
596+
if (WorkGroupMemory)
597+
hKernel->setWorkGroupMemory(WorkGroupMemory);
564598
uint32_t LocalSize = hKernel->getLocalSize();
565599
CUfunction CuFunc = hKernel->get();
566600

567601
for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
568602
switch (launchPropList[i].id) {
569603
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
570-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
604+
auto &attr = launch_attribute.emplace_back();
605+
attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE;
571606
break;
572607
}
573608
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
574-
575-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
609+
auto &attr = launch_attribute.emplace_back();
610+
attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
576611
// Note that cuda orders from right to left wrt SYCL dimensional order.
577612
if (workDim == 3) {
578-
launch_attribute[i].value.clusterDim.x =
579-
launchPropList[i].value.clusterDim[2];
580-
launch_attribute[i].value.clusterDim.y =
581-
launchPropList[i].value.clusterDim[1];
582-
launch_attribute[i].value.clusterDim.z =
583-
launchPropList[i].value.clusterDim[0];
613+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2];
614+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
615+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0];
584616
} else if (workDim == 2) {
585-
launch_attribute[i].value.clusterDim.x =
586-
launchPropList[i].value.clusterDim[1];
587-
launch_attribute[i].value.clusterDim.y =
588-
launchPropList[i].value.clusterDim[0];
589-
launch_attribute[i].value.clusterDim.z =
590-
launchPropList[i].value.clusterDim[2];
617+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1];
618+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0];
619+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
591620
} else {
592-
launch_attribute[i].value.clusterDim.x =
593-
launchPropList[i].value.clusterDim[0];
594-
launch_attribute[i].value.clusterDim.y =
595-
launchPropList[i].value.clusterDim[1];
596-
launch_attribute[i].value.clusterDim.z =
597-
launchPropList[i].value.clusterDim[2];
621+
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0];
622+
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
623+
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
598624
}
599625

600626
UR_CHECK_ERROR(cuFuncSetAttribute(
@@ -603,9 +629,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
603629
break;
604630
}
605631
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
606-
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
607-
launch_attribute[i].value.cooperative =
608-
launchPropList[i].value.cooperative;
632+
auto &attr = launch_attribute.emplace_back();
633+
attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
634+
attr.value.cooperative = launchPropList[i].value.cooperative;
635+
break;
636+
}
637+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
609638
break;
610639
}
611640
default: {
@@ -618,8 +647,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
618647
// using the standard UR_CHECK_ERROR
619648
if (ur_result_t Ret =
620649
setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
621-
nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
622-
CuFunc, ThreadsPerBlock, BlocksPerGrid);
650+
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
651+
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
623652
Ret != UR_RESULT_SUCCESS)
624653
return Ret;
625654

@@ -667,7 +696,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
667696
launch_config.sharedMemBytes = LocalSize;
668697
launch_config.hStream = CuStream;
669698
launch_config.attrs = &launch_attribute[0];
670-
launch_config.numAttrs = numPropsInLaunchPropList;
699+
launch_config.numAttrs = launch_attribute.size();
671700

672701
UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
673702
const_cast<void **>(ArgIndices.data()),

0 commit comments

Comments
 (0)