Skip to content

Commit 8f25e5a

Browse files
author
Thomas Hellström
committed
drm/xe: Convert existing drm_exec transactions for exhaustive eviction
Convert existing drm_exec transactions, such as GT pagefault validation, the non-LR exec() IOCTL, and the rebind worker, to support exhaustive eviction using xe_validation_guard().

v2:
- Adapt to signature change in xe_validation_guard() (Matt Brost)
- Avoid gotos from within xe_validation_guard() (Matt Brost)
- Check the error return from xe_validation_guard()

v3:
- Rebase on gpu_madvise()

Signed-off-by: Thomas Hellström <[email protected]>
Reviewed-by: Matthew Brost <[email protected]> #v1
Link: https://lore.kernel.org/r/[email protected]
1 parent 1710cd5 commit 8f25e5a

File tree

4 files changed

+75
-106
lines changed

4 files changed

+75
-106
lines changed

drivers/gpu/drm/xe/xe_exec.c

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -120,10 +120,10 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
120120
struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
121121
struct drm_exec *exec = &vm_exec.exec;
122122
u32 i, num_syncs, num_ufence = 0;
123+
struct xe_validation_ctx ctx;
123124
struct xe_sched_job *job;
124125
struct xe_vm *vm;
125126
bool write_locked, skip_retry = false;
126-
ktime_t end = 0;
127127
int err = 0;
128128
struct xe_hw_engine_group *group;
129129
enum xe_hw_engine_group_execution_mode mode, previous_mode;
@@ -251,17 +251,12 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
251251
if (err)
252252
goto err_unlock_list;
253253

254-
vm_exec.vm = &vm->gpuvm;
255-
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
256-
if (xe_vm_in_lr_mode(vm)) {
257-
drm_exec_init(exec, vm_exec.flags, 0);
258-
} else {
259-
err = drm_gpuvm_exec_lock(&vm_exec);
260-
if (err) {
261-
if (xe_vm_validate_should_retry(exec, err, &end))
262-
err = -EAGAIN;
254+
if (!xe_vm_in_lr_mode(vm)) {
255+
vm_exec.vm = &vm->gpuvm;
256+
vm_exec.flags = DRM_EXEC_INTERRUPTIBLE_WAIT;
257+
err = xe_validation_exec_lock(&ctx, &vm_exec, &xe->val);
258+
if (err)
263259
goto err_unlock_list;
264-
}
265260
}
266261

267262
if (xe_vm_is_closed_or_banned(q->vm)) {
@@ -355,7 +350,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
355350
if (err)
356351
xe_sched_job_put(job);
357352
err_exec:
358-
drm_exec_fini(exec);
353+
if (!xe_vm_in_lr_mode(vm))
354+
xe_validation_ctx_fini(&ctx);
359355
err_unlock_list:
360356
up_read(&vm->lock);
361357
if (err == -EAGAIN && !skip_retry)

drivers/gpu/drm/xe/xe_gt_pagefault.c

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,9 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
9696
{
9797
struct xe_vm *vm = xe_vma_vm(vma);
9898
struct xe_tile *tile = gt_to_tile(gt);
99+
struct xe_validation_ctx ctx;
99100
struct drm_exec exec;
100101
struct dma_fence *fence;
101-
ktime_t end = 0;
102102
int err, needs_vram;
103103

104104
lockdep_assert_held_write(&vm->lock);
@@ -127,12 +127,11 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
127127
}
128128

129129
/* Lock VM and BOs dma-resv */
130-
drm_exec_init(&exec, 0, 0);
130+
xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
131131
drm_exec_until_all_locked(&exec) {
132132
err = xe_pf_begin(&exec, vma, needs_vram == 1, tile->mem.vram);
133133
drm_exec_retry_on_contention(&exec);
134-
if (xe_vm_validate_should_retry(&exec, err, &end))
135-
err = -EAGAIN;
134+
xe_validation_retry_on_oom(&ctx, &err);
136135
if (err)
137136
goto unlock_dma_resv;
138137

@@ -143,8 +142,7 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
143142
xe_vm_set_validation_exec(vm, NULL);
144143
if (IS_ERR(fence)) {
145144
err = PTR_ERR(fence);
146-
if (xe_vm_validate_should_retry(&exec, err, &end))
147-
err = -EAGAIN;
145+
xe_validation_retry_on_oom(&ctx, &err);
148146
goto unlock_dma_resv;
149147
}
150148
}
@@ -153,7 +151,7 @@ static int handle_vma_pagefault(struct xe_gt *gt, struct xe_vma *vma,
153151
dma_fence_put(fence);
154152

155153
unlock_dma_resv:
156-
drm_exec_fini(&exec);
154+
xe_validation_ctx_fini(&ctx);
157155
if (err == -EAGAIN)
158156
goto retry_userptr;
159157

@@ -535,6 +533,7 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
535533
{
536534
struct xe_device *xe = gt_to_xe(gt);
537535
struct xe_tile *tile = gt_to_tile(gt);
536+
struct xe_validation_ctx ctx;
538537
struct drm_exec exec;
539538
struct xe_vm *vm;
540539
struct xe_vma *vma;
@@ -564,15 +563,14 @@ static int handle_acc(struct xe_gt *gt, struct acc *acc)
564563
goto unlock_vm;
565564

566565
/* Lock VM and BOs dma-resv */
567-
drm_exec_init(&exec, 0, 0);
566+
xe_validation_ctx_init(&ctx, &vm->xe->val, &exec, (struct xe_val_flags) {});
568567
drm_exec_until_all_locked(&exec) {
569568
ret = xe_pf_begin(&exec, vma, IS_DGFX(vm->xe), tile->mem.vram);
570569
drm_exec_retry_on_contention(&exec);
571-
if (ret)
572-
break;
570+
xe_validation_retry_on_oom(&ctx, &ret);
573571
}
574572

575-
drm_exec_fini(&exec);
573+
xe_validation_ctx_fini(&ctx);
576574
unlock_vm:
577575
up_read(&vm->lock);
578576
xe_vm_put(vm);

drivers/gpu/drm/xe/xe_vm.c

Lines changed: 58 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -210,14 +210,15 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
210210
.num_fences = 1,
211211
};
212212
struct drm_exec *exec = &vm_exec.exec;
213+
struct xe_validation_ctx ctx;
213214
struct dma_fence *pfence;
214215
int err;
215216
bool wait;
216217

217218
xe_assert(vm->xe, xe_vm_in_preempt_fence_mode(vm));
218219

219220
down_write(&vm->lock);
220-
err = drm_gpuvm_exec_lock(&vm_exec);
221+
err = xe_validation_exec_lock(&ctx, &vm_exec, &vm->xe->val);
221222
if (err)
222223
goto out_up_write;
223224

@@ -249,7 +250,7 @@ int xe_vm_add_compute_exec_queue(struct xe_vm *vm, struct xe_exec_queue *q)
249250
xe_svm_notifier_unlock(vm);
250251

251252
out_fini:
252-
drm_exec_fini(exec);
253+
xe_validation_ctx_fini(&ctx);
253254
out_up_write:
254255
up_write(&vm->lock);
255256

@@ -313,39 +314,6 @@ void xe_vm_kill(struct xe_vm *vm, bool unlocked)
313314
/* TODO: Inform user the VM is banned */
314315
}
315316

316-
/**
317-
* xe_vm_validate_should_retry() - Whether to retry after a validate error.
318-
* @exec: The drm_exec object used for locking before validation.
319-
* @err: The error returned from ttm_bo_validate().
320-
* @end: A ktime_t cookie that should be set to 0 before first use and
321-
* that should be reused on subsequent calls.
322-
*
323-
* With multiple active VMs, under memory pressure, it is possible that
324-
* ttm_bo_validate() run into -EDEADLK and in such case returns -ENOMEM.
325-
* Until ttm properly handles locking in such scenarios, best thing the
326-
* driver can do is retry with a timeout. Check if that is necessary, and
327-
* if so unlock the drm_exec's objects while keeping the ticket to prepare
328-
* for a rerun.
329-
*
330-
* Return: true if a retry after drm_exec_init() is recommended;
331-
* false otherwise.
332-
*/
333-
bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end)
334-
{
335-
ktime_t cur;
336-
337-
if (err != -ENOMEM)
338-
return false;
339-
340-
cur = ktime_get();
341-
*end = *end ? : ktime_add_ms(cur, XE_VM_REBIND_RETRY_TIMEOUT_MS);
342-
if (!ktime_before(cur, *end))
343-
return false;
344-
345-
msleep(20);
346-
return true;
347-
}
348-
349317
static int xe_gpuvm_validate(struct drm_gpuvm_bo *vm_bo, struct drm_exec *exec)
350318
{
351319
struct xe_vm *vm = gpuvm_to_vm(vm_bo->vm);
@@ -476,10 +444,10 @@ void xe_vm_resume_rebind_worker(struct xe_vm *vm)
476444
static void preempt_rebind_work_func(struct work_struct *w)
477445
{
478446
struct xe_vm *vm = container_of(w, struct xe_vm, preempt.rebind_work);
447+
struct xe_validation_ctx ctx;
479448
struct drm_exec exec;
480449
unsigned int fence_count = 0;
481450
LIST_HEAD(preempt_fences);
482-
ktime_t end = 0;
483451
int err = 0;
484452
long wait;
485453
int __maybe_unused tries = 0;
@@ -507,18 +475,19 @@ static void preempt_rebind_work_func(struct work_struct *w)
507475
goto out_unlock_outer;
508476
}
509477

510-
drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
478+
err = xe_validation_ctx_init(&ctx, &vm->xe->val, &exec,
479+
(struct xe_val_flags) {.interruptible = true});
480+
if (err)
481+
goto out_unlock_outer;
511482

512483
drm_exec_until_all_locked(&exec) {
513484
bool done = false;
514485

515486
err = xe_preempt_work_begin(&exec, vm, &done);
516487
drm_exec_retry_on_contention(&exec);
488+
xe_validation_retry_on_oom(&ctx, &err);
517489
if (err || done) {
518-
drm_exec_fini(&exec);
519-
if (err && xe_vm_validate_should_retry(&exec, err, &end))
520-
err = -EAGAIN;
521-
490+
xe_validation_ctx_fini(&ctx);
522491
goto out_unlock_outer;
523492
}
524493
}
@@ -566,7 +535,7 @@ static void preempt_rebind_work_func(struct work_struct *w)
566535
xe_svm_notifier_unlock(vm);
567536

568537
out_unlock:
569-
drm_exec_fini(&exec);
538+
xe_validation_ctx_fini(&ctx);
570539
out_unlock_outer:
571540
if (err == -EAGAIN) {
572541
trace_xe_vm_rebind_worker_retry(vm);
@@ -1164,20 +1133,19 @@ int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma)
11641133

11651134
static void xe_vma_destroy_unlocked(struct xe_vma *vma)
11661135
{
1136+
struct xe_device *xe = xe_vma_vm(vma)->xe;
1137+
struct xe_validation_ctx ctx;
11671138
struct drm_exec exec;
1168-
int err;
1139+
int err = 0;
11691140

1170-
drm_exec_init(&exec, 0, 0);
1171-
drm_exec_until_all_locked(&exec) {
1141+
xe_validation_guard(&ctx, &xe->val, &exec, (struct xe_val_flags) {}, err) {
11721142
err = xe_vm_lock_vma(&exec, vma);
11731143
drm_exec_retry_on_contention(&exec);
11741144
if (XE_WARN_ON(err))
11751145
break;
1146+
xe_vma_destroy(vma, NULL);
11761147
}
1177-
1178-
xe_vma_destroy(vma, NULL);
1179-
1180-
drm_exec_fini(&exec);
1148+
xe_assert(xe, !err);
11811149
}
11821150

11831151
struct xe_vma *
@@ -2383,16 +2351,17 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
23832351
struct xe_vma_mem_attr *attr, unsigned int flags)
23842352
{
23852353
struct xe_bo *bo = op->gem.obj ? gem_to_xe_bo(op->gem.obj) : NULL;
2354+
struct xe_validation_ctx ctx;
23862355
struct drm_exec exec;
23872356
struct xe_vma *vma;
23882357
int err = 0;
23892358

23902359
lockdep_assert_held_write(&vm->lock);
23912360

23922361
if (bo) {
2393-
drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
2394-
drm_exec_until_all_locked(&exec) {
2395-
err = 0;
2362+
err = 0;
2363+
xe_validation_guard(&ctx, &vm->xe->val, &exec,
2364+
(struct xe_val_flags) {.interruptible = true}, err) {
23962365
if (!bo->vm) {
23972366
err = drm_exec_lock_obj(&exec, xe_vm_obj(vm));
23982367
drm_exec_retry_on_contention(&exec);
@@ -2401,27 +2370,35 @@ static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
24012370
err = drm_exec_lock_obj(&exec, &bo->ttm.base);
24022371
drm_exec_retry_on_contention(&exec);
24032372
}
2404-
if (err) {
2405-
drm_exec_fini(&exec);
2373+
if (err)
24062374
return ERR_PTR(err);
2407-
}
2408-
}
2409-
}
2410-
vma = xe_vma_create(vm, bo, op->gem.offset,
2411-
op->va.addr, op->va.addr +
2412-
op->va.range - 1, attr, flags);
2413-
if (IS_ERR(vma))
2414-
goto err_unlock;
24152375

2416-
if (xe_vma_is_userptr(vma))
2417-
err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2418-
else if (!xe_vma_has_no_bo(vma) && !bo->vm)
2419-
err = add_preempt_fences(vm, bo);
2376+
vma = xe_vma_create(vm, bo, op->gem.offset,
2377+
op->va.addr, op->va.addr +
2378+
op->va.range - 1, attr, flags);
2379+
if (IS_ERR(vma))
2380+
return vma;
24202381

2421-
err_unlock:
2422-
if (bo)
2423-
drm_exec_fini(&exec);
2382+
if (!bo->vm) {
2383+
err = add_preempt_fences(vm, bo);
2384+
if (err) {
2385+
prep_vma_destroy(vm, vma, false);
2386+
xe_vma_destroy(vma, NULL);
2387+
}
2388+
}
2389+
}
2390+
if (err)
2391+
return ERR_PTR(err);
2392+
} else {
2393+
vma = xe_vma_create(vm, NULL, op->gem.offset,
2394+
op->va.addr, op->va.addr +
2395+
op->va.range - 1, attr, flags);
2396+
if (IS_ERR(vma))
2397+
return vma;
24242398

2399+
if (xe_vma_is_userptr(vma))
2400+
err = xe_vma_userptr_pin_pages(to_userptr_vma(vma));
2401+
}
24252402
if (err) {
24262403
prep_vma_destroy(vm, vma, false);
24272404
xe_vma_destroy_unlocked(vma);
@@ -3220,37 +3197,37 @@ static void vm_bind_ioctl_ops_fini(struct xe_vm *vm, struct xe_vma_ops *vops,
32203197
static struct dma_fence *vm_bind_ioctl_ops_execute(struct xe_vm *vm,
32213198
struct xe_vma_ops *vops)
32223199
{
3200+
struct xe_validation_ctx ctx;
32233201
struct drm_exec exec;
32243202
struct dma_fence *fence;
3225-
int err;
3203+
int err = 0;
32263204

32273205
lockdep_assert_held_write(&vm->lock);
32283206

3229-
drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
3230-
DRM_EXEC_IGNORE_DUPLICATES, 0);
3231-
drm_exec_until_all_locked(&exec) {
3207+
xe_validation_guard(&ctx, &vm->xe->val, &exec,
3208+
((struct xe_val_flags) {
3209+
.interruptible = true,
3210+
.exec_ignore_duplicates = true,
3211+
}), err) {
32323212
err = vm_bind_ioctl_ops_lock_and_prep(&exec, vm, vops);
32333213
drm_exec_retry_on_contention(&exec);
3234-
if (err) {
3235-
fence = ERR_PTR(err);
3236-
goto unlock;
3237-
}
3214+
xe_validation_retry_on_oom(&ctx, &err);
3215+
if (err)
3216+
return ERR_PTR(err);
32383217

32393218
xe_vm_set_validation_exec(vm, &exec);
32403219
fence = ops_execute(vm, vops);
32413220
xe_vm_set_validation_exec(vm, NULL);
32423221
if (IS_ERR(fence)) {
32433222
if (PTR_ERR(fence) == -ENODATA)
32443223
vm_bind_ioctl_ops_fini(vm, vops, NULL);
3245-
goto unlock;
3224+
return fence;
32463225
}
32473226

32483227
vm_bind_ioctl_ops_fini(vm, vops, fence);
32493228
}
32503229

3251-
unlock:
3252-
drm_exec_fini(&exec);
3253-
return fence;
3230+
return err ? ERR_PTR(err) : fence;
32543231
}
32553232
ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
32563233

drivers/gpu/drm/xe/xe_vm.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,6 @@ static inline void xe_vm_reactivate_rebind(struct xe_vm *vm)
260260
}
261261
}
262262

263-
bool xe_vm_validate_should_retry(struct drm_exec *exec, int err, ktime_t *end);
264-
265263
int xe_vm_lock_vma(struct drm_exec *exec, struct xe_vma *vma);
266264

267265
int xe_vm_validate_rebind(struct xe_vm *vm, struct drm_exec *exec,

0 commit comments

Comments
 (0)