Skip to content

Commit 63a46d7

Browse files
authored
Merge pull request #2400 from callumfare/revert-1968
Revert "Add new launch property to support work_group_scratch_memory"
2 parents 56583f1 + 441fe7e commit 63a46d7

19 files changed

+94
-252
lines changed

include/ur_api.h

+4-11
Original file line numberDiff line numberDiff line change
@@ -9560,7 +9560,6 @@ typedef enum ur_exp_launch_property_id_t {
95609560
UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect
95619561
UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel
95629562
UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
9563-
UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
95649563
/// @cond
95659564
UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
95669565
/// @endcond
@@ -9574,12 +9573,10 @@ typedef enum ur_exp_launch_property_id_t {
95749573
/// _Analogues_
95759574
/// - **CUlaunchAttributeValue**
95769575
typedef union ur_exp_launch_property_value_t {
9577-
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9578-
///< value must be a divisor of the corresponding global work-size
9579-
///< dimension (in units of work-group).
9580-
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9581-
size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
9582-
///< allocate in bytes
9576+
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9577+
///< value must be a divisor of the corresponding global work-size
9578+
///< dimension (in units of work-group).
9579+
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
95839580

95849581
} ur_exp_launch_property_value_t;
95859582

@@ -9620,7 +9617,6 @@ typedef struct ur_exp_launch_property_t {
96209617
/// + NULL == hQueue
96219618
/// + NULL == hKernel
96229619
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
9623-
/// + `NULL == pGlobalWorkOffset`
96249620
/// + `NULL == pGlobalWorkSize`
96259621
/// + `NULL == launchPropList`
96269622
/// + NULL == pGlobalWorkSize
@@ -9649,8 +9645,6 @@ urEnqueueKernelLaunchCustomExp(
96499645
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
96509646
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
96519647
///< work-group work-items
9652-
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
9653-
///< offset used to calculate the global ID of a work-item
96549648
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
96559649
///< number of global work-items in workDim that will execute the kernel
96569650
///< function
@@ -11560,7 +11554,6 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
1156011554
ur_queue_handle_t *phQueue;
1156111555
ur_kernel_handle_t *phKernel;
1156211556
uint32_t *pworkDim;
11563-
const size_t **ppGlobalWorkOffset;
1156411557
const size_t **ppGlobalWorkSize;
1156511558
const size_t **ppLocalWorkSize;
1156611559
uint32_t *pnumPropsInLaunchPropList;

include/ur_ddi.h

-1
Original file line numberDiff line numberDiff line change
@@ -1467,7 +1467,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
14671467
uint32_t,
14681468
const size_t *,
14691469
const size_t *,
1470-
const size_t *,
14711470
uint32_t,
14721471
const ur_exp_launch_property_t *,
14731472
uint32_t,

include/ur_print.hpp

-16
Original file line numberDiff line numberDiff line change
@@ -10397,9 +10397,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
1039710397
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
1039810398
os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
1039910399
break;
10400-
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10401-
os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
10402-
break;
1040310400
default:
1040410401
os << "unknown enumerator";
1040510402
break;
@@ -10436,13 +10433,6 @@ inline ur_result_t printUnion(
1043610433

1043710434
os << (params.cooperative);
1043810435

10439-
break;
10440-
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10441-
10442-
os << ".workgroup_mem_size = ";
10443-
10444-
os << (params.workgroup_mem_size);
10445-
1044610436
break;
1044710437
default:
1044810438
os << "<unknown>";
@@ -15110,12 +15100,6 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
1511015100

1511115101
os << *(params->pworkDim);
1511215102

15113-
os << ", ";
15114-
os << ".pGlobalWorkOffset = ";
15115-
15116-
ur::details::printPtr(os,
15117-
*(params->ppGlobalWorkOffset));
15118-
1511915103
os << ", ";
1512015104
os << ".pGlobalWorkSize = ";
1512115105

scripts/core/exp-launch-properties.yml

+2-11
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ etors:
2929
desc: "Whether to launch a cooperative kernel"
3030
- name: CLUSTER_DIMENSION
3131
desc: "work-group cluster dimensions"
32-
- name: WORK_GROUP_MEMORY
33-
desc: "Implicit work group memory allocation"
3432
--- #--------------------------------------------------------------------------
3533
type: union
3634
desc: "Specifies a launch property value"
@@ -47,10 +45,6 @@ members:
4745
name: cooperative
4846
desc: "[in] non-zero value indicates a cooperative kernel"
4947
tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
50-
- type: size_t
51-
name: workgroup_mem_size
52-
desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes"
53-
tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
5448
--- #--------------------------------------------------------------------------
5549
type: struct
5650
desc: "Kernel launch property"
@@ -88,9 +82,6 @@ params:
8882
- type: uint32_t
8983
name: workDim
9084
desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
91-
- type: "const size_t*"
92-
name: pGlobalWorkOffset
93-
desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
9485
- type: const size_t*
9586
name: pGlobalWorkSize
9687
desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -106,10 +97,10 @@ params:
10697
- type: uint32_t
10798
name: numEventsInWaitList
10899
desc: "[in] size of the event wait list"
109-
- type: const $x_event_handle_t*
100+
- type: const ur_event_handle_t*
110101
name: phEventWaitList
111102
desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that there are no wait events."
112-
- type: $x_event_handle_t*
103+
- type: ur_event_handle_t*
113104
name: phEvent
114105
desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
115106
returns:

source/adapters/cuda/enqueue.cpp

+41-70
Original file line numberDiff line numberDiff line change
@@ -422,13 +422,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
422422
phEventWaitList, phEvent);
423423
}
424424

425-
static ur_result_t
426-
enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
427-
uint32_t workDim, const size_t *pGlobalWorkOffset,
428-
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
429-
uint32_t numEventsInWaitList,
430-
const ur_event_handle_t *phEventWaitList,
431-
ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
425+
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
426+
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
427+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
428+
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
429+
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
432430
// Preconditions
433431
UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
434432
UR_RESULT_ERROR_INVALID_KERNEL);
@@ -446,9 +444,6 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
446444
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
447445
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
448446

449-
// Set work group memory so we can compute the whole memory requirement
450-
if (WorkGroupMemory)
451-
hKernel->setWorkGroupMemory(WorkGroupMemory);
452447
uint32_t LocalSize = hKernel->getLocalSize();
453448
CUfunction CuFunc = hKernel->get();
454449

@@ -511,17 +506,6 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
511506
return UR_RESULT_SUCCESS;
512507
}
513508

514-
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
515-
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
516-
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
517-
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
518-
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
519-
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
520-
pGlobalWorkSize, pLocalWorkSize,
521-
numEventsInWaitList, phEventWaitList, phEvent,
522-
/*WorkGroupMemory=*/0);
523-
}
524-
525509
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
526510
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
527511
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -532,9 +516,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
532516
coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
533517
coop_prop.value.cooperative = 1;
534518
return urEnqueueKernelLaunchCustomExp(
535-
hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
536-
pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
537-
phEvent);
519+
hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
520+
&coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
538521
}
539522
return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
540523
pGlobalWorkSize, pLocalWorkSize,
@@ -543,29 +526,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
543526

544527
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
545528
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
546-
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
547-
const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
529+
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
530+
uint32_t numPropsInLaunchPropList,
548531
const ur_exp_launch_property_t *launchPropList,
549532
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
550533
ur_event_handle_t *phEvent) {
551534

552-
size_t WorkGroupMemory = [&]() -> size_t {
553-
const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
554-
launchPropList, launchPropList + numPropsInLaunchPropList,
555-
[](const ur_exp_launch_property_t &Prop) {
556-
return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
557-
});
558-
if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
559-
return WorkGroupMemoryProp->value.workgroup_mem_size;
560-
return 0;
561-
}();
562-
563-
if (numPropsInLaunchPropList == 0 ||
564-
(WorkGroupMemory && numPropsInLaunchPropList == 1)) {
565-
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
566-
pGlobalWorkSize, pLocalWorkSize,
567-
numEventsInWaitList, phEventWaitList, phEvent,
568-
WorkGroupMemory);
535+
if (numPropsInLaunchPropList == 0) {
536+
urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
537+
pLocalWorkSize, numEventsInWaitList, phEventWaitList,
538+
phEvent);
569539
}
570540
#if CUDA_VERSION >= 11080
571541
// Preconditions
@@ -578,8 +548,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
578548
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
579549
}
580550

581-
std::vector<CUlaunchAttribute> launch_attribute;
582-
launch_attribute.reserve(numPropsInLaunchPropList);
551+
std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
583552

584553
// Early exit for zero size kernel
585554
if (*pGlobalWorkSize == 0) {
@@ -592,35 +561,40 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
592561
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
593562
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
594563

595-
// Set work group memory so we can compute the whole memory requirement
596-
if (WorkGroupMemory)
597-
hKernel->setWorkGroupMemory(WorkGroupMemory);
598564
uint32_t LocalSize = hKernel->getLocalSize();
599565
CUfunction CuFunc = hKernel->get();
600566

601567
for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
602568
switch (launchPropList[i].id) {
603569
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
604-
auto &attr = launch_attribute.emplace_back();
605-
attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE;
570+
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
606571
break;
607572
}
608573
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
609-
auto &attr = launch_attribute.emplace_back();
610-
attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
574+
575+
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
611576
// Note that cuda orders from right to left wrt SYCL dimensional order.
612577
if (workDim == 3) {
613-
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[2];
614-
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
615-
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[0];
578+
launch_attribute[i].value.clusterDim.x =
579+
launchPropList[i].value.clusterDim[2];
580+
launch_attribute[i].value.clusterDim.y =
581+
launchPropList[i].value.clusterDim[1];
582+
launch_attribute[i].value.clusterDim.z =
583+
launchPropList[i].value.clusterDim[0];
616584
} else if (workDim == 2) {
617-
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[1];
618-
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[0];
619-
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
585+
launch_attribute[i].value.clusterDim.x =
586+
launchPropList[i].value.clusterDim[1];
587+
launch_attribute[i].value.clusterDim.y =
588+
launchPropList[i].value.clusterDim[0];
589+
launch_attribute[i].value.clusterDim.z =
590+
launchPropList[i].value.clusterDim[2];
620591
} else {
621-
attr.value.clusterDim.x = launchPropList[i].value.clusterDim[0];
622-
attr.value.clusterDim.y = launchPropList[i].value.clusterDim[1];
623-
attr.value.clusterDim.z = launchPropList[i].value.clusterDim[2];
592+
launch_attribute[i].value.clusterDim.x =
593+
launchPropList[i].value.clusterDim[0];
594+
launch_attribute[i].value.clusterDim.y =
595+
launchPropList[i].value.clusterDim[1];
596+
launch_attribute[i].value.clusterDim.z =
597+
launchPropList[i].value.clusterDim[2];
624598
}
625599

626600
UR_CHECK_ERROR(cuFuncSetAttribute(
@@ -629,12 +603,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
629603
break;
630604
}
631605
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
632-
auto &attr = launch_attribute.emplace_back();
633-
attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
634-
attr.value.cooperative = launchPropList[i].value.cooperative;
635-
break;
636-
}
637-
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
606+
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
607+
launch_attribute[i].value.cooperative =
608+
launchPropList[i].value.cooperative;
638609
break;
639610
}
640611
default: {
@@ -647,8 +618,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
647618
// using the standard UR_CHECK_ERROR
648619
if (ur_result_t Ret =
649620
setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
650-
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
651-
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
621+
nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
622+
CuFunc, ThreadsPerBlock, BlocksPerGrid);
652623
Ret != UR_RESULT_SUCCESS)
653624
return Ret;
654625

@@ -696,7 +667,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
696667
launch_config.sharedMemBytes = LocalSize;
697668
launch_config.hStream = CuStream;
698669
launch_config.attrs = &launch_attribute[0];
699-
launch_config.numAttrs = launch_attribute.size();
670+
launch_config.numAttrs = numPropsInLaunchPropList;
700671

701672
UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
702673
const_cast<void **>(ArgIndices.data()),

0 commit comments

Comments
 (0)