Skip to content

Commit 234eebe

Browse files
ascollard authored and alexdeucher committed
drm/amdkfd: APIs to stop/start KFD scheduling
Provide amdgpu_amdkfd_stop_sched() for amdgpu to stop KFD scheduling compute work on HIQ. amdgpu_amdkfd_start_sched() resumes the scheduling. When amdgpu_amdkfd_stop_sched is called, KFD will unmap queues from runlist. If users send ioctls to KFD to create queues, they'll be added but those queues won't be mapped to runlist (so not scheduled) until amdgpu_amdkfd_start_sched is called. v2: fix build (Alex) Signed-off-by: Amber Lin <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent b1f49ff commit 234eebe

File tree

5 files changed

+137
-1
lines changed

5 files changed

+137
-1
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -887,3 +887,21 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
887887

888888
return r;
889889
}
890+
891+
/*
 * Stop scheduling on KFD.
 *
 * Returns 0 without calling into KFD when adev->kfd.init_complete is not
 * set. Otherwise forwards adev->kfd.dev and node_id to kgd2kfd_stop_sched()
 * and returns its result.
 */
892+
int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id)
893+
{
894+
if (!adev->kfd.init_complete)
895+
return 0;
896+
897+
return kgd2kfd_stop_sched(adev->kfd.dev, node_id);
898+
}
899+
900+
/*
 * Start scheduling on KFD.
 *
 * Returns 0 without calling into KFD when adev->kfd.init_complete is not
 * set. Otherwise forwards adev->kfd.dev and node_id to kgd2kfd_start_sched()
 * and returns its result.
 */
901+
int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id)
902+
{
903+
if (!adev->kfd.init_complete)
904+
return 0;
905+
906+
return kgd2kfd_start_sched(adev->kfd.dev, node_id);
907+
}

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,8 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
264264
uint32_t *payload);
265265
int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
266266
u32 inst);
267+
int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id);
268+
int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id);
267269

268270
/* Read user wptr from a specified user address space with page fault
269271
* disabled. The memory must be pinned and mapped to the hardware when
@@ -426,6 +428,8 @@ void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
426428
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask);
427429
int kgd2kfd_check_and_lock_kfd(void);
428430
void kgd2kfd_unlock_kfd(void);
431+
int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
432+
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
429433
#else
430434
static inline int kgd2kfd_init(void)
431435
{
@@ -496,5 +500,15 @@ static inline int kgd2kfd_check_and_lock_kfd(void)
496500
static inline void kgd2kfd_unlock_kfd(void)
497501
{
498502
}
503+
504+
/*
 * Stub variant of kgd2kfd_start_sched() defined in the #else branch of the
 * surrounding preprocessor conditional: ignores both arguments and reports
 * success (0).
 */
static inline int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
505+
{
506+
return 0;
507+
}
508+
509+
/*
 * Stub variant of kgd2kfd_stop_sched() defined in the #else branch of the
 * surrounding preprocessor conditional: ignores both arguments and reports
 * success (0).
 */
static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
510+
{
511+
return 0;
512+
}
499513
#endif
500514
#endif /* AMDGPU_AMDKFD_H_INCLUDED */

drivers/gpu/drm/amd/amdkfd/kfd_device.c

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,6 +1446,45 @@ void kgd2kfd_unlock_kfd(void)
14461446
mutex_unlock(&kfd_processes_mutex);
14471447
}
14481448

1449+
/*
 * kgd2kfd_start_sched - resume scheduling on one KFD node
 * @kfd: KFD device owning the node
 * @node_id: index into kfd->nodes[]
 *
 * Calls the node's device-queue-manager unhalt op.
 *
 * Return: 0 if kfd->init_complete is not set, -EINVAL if node_id is not
 * below kfd->num_nodes, otherwise the value returned by ops.unhalt().
 */
int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
1450+
{
1451+
struct kfd_node *node;
1452+
int ret;
1453+
1454+
/* Nothing to resume when KFD never finished initializing. */
if (!kfd->init_complete)
1455+
return 0;
1456+
1457+
/* Reject indices outside kfd->nodes[] before dereferencing. */
if (node_id >= kfd->num_nodes) {
1458+
dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n",
1459+
node_id, kfd->num_nodes - 1);
1460+
return -EINVAL;
1461+
}
1462+
node = kfd->nodes[node_id];
1463+
1464+
ret = node->dqm->ops.unhalt(node->dqm);
1465+
/* The failure is logged here and still propagated to the caller. */
if (ret)
1466+
dev_err(kfd_device, "Error in starting scheduler\n");
1467+
1468+
return ret;
1469+
}
1470+
1471+
/*
 * kgd2kfd_stop_sched - stop scheduling on one KFD node
 * @kfd: KFD device owning the node
 * @node_id: index into kfd->nodes[]
 *
 * Calls the node's device-queue-manager halt op.
 *
 * Return: 0 if kfd->init_complete is not set, -EINVAL if node_id is not
 * below kfd->num_nodes, otherwise the value returned by ops.halt().
 * Unlike kgd2kfd_start_sched(), a failing op is not logged here.
 */
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
1472+
{
1473+
struct kfd_node *node;
1474+
1475+
/* Nothing to stop when KFD never finished initializing. */
if (!kfd->init_complete)
1476+
return 0;
1477+
1478+
/* Reject indices outside kfd->nodes[] before dereferencing. */
if (node_id >= kfd->num_nodes) {
1479+
dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n",
1480+
node_id, kfd->num_nodes - 1);
1481+
return -EINVAL;
1482+
}
1483+
1484+
node = kfd->nodes[node_id];
1485+
return node->dqm->ops.halt(node->dqm);
1486+
}
1487+
14491488
#if defined(CONFIG_DEBUG_FS)
14501489

14511490
/* This function will send a package to HIQ to hang the HWS

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1679,6 +1679,60 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
16791679
return 0;
16801680
}
16811681

1682+
/* halt_cpsch:
1683+
* Unmap queues so the scheduler doesn't continue remaining jobs in the queue.
1684+
* Then set dqm->sched_halt so queues don't map to runlist until unhalt_cpsch
1685+
* is called.
*
* Runs under dqm_lock(). Returns 0 immediately when dqm->sched_running is
* not set; otherwise returns the result of the unmap/remove call, or 0 when
* that call is skipped because dqm->is_hws_hang is set.
1686+
*/
1687+
static int halt_cpsch(struct device_queue_manager *dqm)
1688+
{
1689+
int ret = 0;
1690+
1691+
dqm_lock(dqm);
1692+
/* Scheduler not running: leave sched_halt untouched and report success. */
if (!dqm->sched_running) {
1693+
dqm_unlock(dqm);
1694+
return 0;
1695+
}
1696+
1697+
/* A repeated halt only warns; the unmap below is still attempted. */
WARN_ONCE(dqm->sched_halt, "Scheduling is already on halt\n");
1698+
1699+
/* The unmap/remove step is skipped entirely when is_hws_hang is set. */
if (!dqm->is_hws_hang) {
1700+
if (!dqm->dev->kfd->shared_resources.enable_mes)
1701+
ret = unmap_queues_cpsch(dqm,
1702+
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
1703+
USE_DEFAULT_GRACE_PERIOD, false);
1704+
else
1705+
ret = remove_all_queues_mes(dqm);
1706+
}
1707+
/*
 * NOTE(review): sched_halt is set even when ret is non-zero, so a failed
 * unmap still leaves the dqm marked as halted - confirm this is intended.
 */
dqm->sched_halt = true;
1708+
dqm_unlock(dqm);
1709+
1710+
return ret;
1711+
}
1712+
1713+
/* unhalt_cpsch
1714+
* Unset dqm->sched_halt and map queues back to runlist
*
* Runs under dqm_lock(). Returns 0 without changing state when
* dqm->sched_running is not set or dqm->sched_halt is already clear;
* otherwise returns the result of execute_queues_cpsch(), or 0 when
* shared_resources.enable_mes is set and that call is skipped.
1715+
*/
1716+
static int unhalt_cpsch(struct device_queue_manager *dqm)
1717+
{
1718+
int ret = 0;
1719+
1720+
dqm_lock(dqm);
1721+
if (!dqm->sched_running || !dqm->sched_halt) {
1722+
/*
 * Warns whenever sched_halt is clear, which includes the case where
 * the scheduler is not running at all.
 */
WARN_ONCE(!dqm->sched_halt, "Scheduling is not on halt.\n");
1723+
dqm_unlock(dqm);
1724+
return 0;
1725+
}
1726+
/* Clear the flag first so map_queues_cpsch() stops bailing out early. */
dqm->sched_halt = false;
1727+
/*
 * NOTE(review): only the non-MES path maps queues again here. With
 * enable_mes set, halt_cpsch() calls remove_all_queues_mes() but nothing
 * in this function re-adds those queues - confirm whether a matching
 * re-add is required.
 */
if (!dqm->dev->kfd->shared_resources.enable_mes)
1728+
ret = execute_queues_cpsch(dqm,
1729+
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
1730+
0, USE_DEFAULT_GRACE_PERIOD);
1731+
dqm_unlock(dqm);
1732+
1733+
return ret;
1734+
}
1735+
16821736
static int start_cpsch(struct device_queue_manager *dqm)
16831737
{
16841738
struct device *dev = dqm->dev->adev->dev;
@@ -1984,7 +2038,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
19842038
struct device *dev = dqm->dev->adev->dev;
19852039
int retval;
19862040

1987-
if (!dqm->sched_running)
2041+
if (!dqm->sched_running || dqm->sched_halt)
19882042
return 0;
19892043
if (dqm->active_queue_count <= 0 || dqm->processes_count <= 0)
19902044
return 0;
@@ -2727,6 +2781,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
27272781
dqm->ops.initialize = initialize_cpsch;
27282782
dqm->ops.start = start_cpsch;
27292783
dqm->ops.stop = stop_cpsch;
2784+
dqm->ops.halt = halt_cpsch;
2785+
dqm->ops.unhalt = unhalt_cpsch;
27302786
dqm->ops.destroy_queue = destroy_queue_cpsch;
27312787
dqm->ops.update_queue = update_queue;
27322788
dqm->ops.register_process = register_process;

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,12 @@ union GRBM_GFX_INDEX_BITS {
106106
* @uninitialize: Destroys all the device queue manager resources allocated in
107107
* initialize routine.
108108
*
109+
* @halt: This routine unmaps queues from runlist and sets halt status to true
110+
* so no more queues will be mapped to runlist until unhalt.
111+
*
112+
* @unhalt: This routine sets halt status to false and maps queues back to
113+
* runlist.
114+
*
109115
* @create_kernel_queue: Creates kernel queue. Used for debug queue.
110116
*
111117
* @destroy_kernel_queue: Destroys kernel queue. Used for debug queue.
@@ -153,6 +159,8 @@ struct device_queue_manager_ops {
153159
int (*start)(struct device_queue_manager *dqm);
154160
int (*stop)(struct device_queue_manager *dqm);
155161
void (*uninitialize)(struct device_queue_manager *dqm);
162+
int (*halt)(struct device_queue_manager *dqm);
163+
int (*unhalt)(struct device_queue_manager *dqm);
156164
int (*create_kernel_queue)(struct device_queue_manager *dqm,
157165
struct kernel_queue *kq,
158166
struct qcm_process_device *qpd);
@@ -264,6 +272,7 @@ struct device_queue_manager {
264272
struct work_struct hw_exception_work;
265273
struct kfd_mem_obj hiq_sdma_mqd;
266274
bool sched_running;
275+
bool sched_halt;
267276

268277
/* used for GFX 9.4.3 only */
269278
uint32_t current_logical_xcc_start;

0 commit comments

Comments
 (0)