Commit 760edec

harishchegondi authored and ashutoshx committed
drm/xe/eustall: Add support to read() and poll() EU stall data
Implement the EU stall sampling APIs to read() and poll() EU stall data.
A work function periodically polls the EU stall data buffer write pointer
registers to look for any new data and caches the write pointer. The read
function compares the cached read and write pointers and copies any new
data to userspace.

v11: Used gt->eu_stall->stream_lock instead of stream->buf_lock.
     Removed read and write offsets from trace and added read size.
     Moved workqueue from struct xe_eu_stall_data_stream to struct xe_eu_stall_gt.
v10: Used cancel_delayed_work_sync() instead of flush_delayed_work().
     Replaced per xecore lock with a lock for all the xecore buffers.
     Code movement and optimizations as per review feedback.
v9:  New patch split from the previous patch.
     Used *_delayed_work functions instead of hrtimer.
     Addressed the review feedback in read and poll functions.

Reviewed-by: Ashutosh Dixit <[email protected]>
Signed-off-by: Harish Chegondi <[email protected]>
Signed-off-by: Ashutosh Dixit <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/369dee85a3b6bd2c08aeae89ca55e66a9a0242d2.1740533885.git.harish.chegondi@intel.com
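For context, the userspace flow this patch enables is: open an EU stall stream through the xe observation interface, enable it, then poll() for data and read() it in record-sized multiples. The sketch below is a minimal illustration, not part of this patch; it assumes a stream fd already opened via DRM_IOCTL_XE_OBSERVATION, a hypothetical RECORD_SIZE standing in for the stream's data record size, and the DRM_XE_OBSERVATION_IOCTL_ENABLE/DISABLE ioctls from the xe uAPI (only ENABLE is referenced in the diff itself).

/*
 * Minimal userspace sketch for this interface; illustrative only.
 * Assumes stream_fd was already opened as an EU stall stream through
 * the xe observation interface (DRM_IOCTL_XE_OBSERVATION).
 * RECORD_SIZE is hypothetical: read() lengths are rounded down to a
 * whole number of records, and a length below one record is -EINVAL.
 */
#include <poll.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include <drm/xe_drm.h>		/* DRM_XE_OBSERVATION_IOCTL_* */

#define RECORD_SIZE 64		/* hypothetical; use the real record size */

static int consume_eu_stall_data(int stream_fd)
{
	char buf[RECORD_SIZE * 128];
	struct pollfd pfd = { .fd = stream_fd, .events = POLLIN };
	ssize_t n;

	/* Sampling must be enabled before read(), per the patch */
	if (ioctl(stream_fd, DRM_XE_OBSERVATION_IOCTL_ENABLE, 0) < 0)
		return -1;

	/* Blocks until the driver's poll worker sees enough reports */
	if (poll(&pfd, 1, -1) < 0)
		return -1;

	if (pfd.revents & POLLIN) {
		n = read(stream_fd, buf, sizeof(buf));
		if (n > 0)
			printf("read %zd bytes of EU stall data\n", n);
	}

	return ioctl(stream_fd, DRM_XE_OBSERVATION_IOCTL_DISABLE, 0);
}

Note that a blocking read() retries internally on -EAGAIN until at least wait_num_reports records are available, so a consumer only has to size its buffer in record multiples and loop.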
1 parent 9a0b11d · commit 760edec

2 files changed: +294 −3 lines changed

drivers/gpu/drm/xe/xe_eu_stall.c

Lines changed: 264 additions & 3 deletions
@@ -21,10 +21,13 @@
 #include "xe_macros.h"
 #include "xe_observation.h"
 #include "xe_pm.h"
+#include "xe_trace.h"
 
 #include "regs/xe_eu_stall_regs.h"
 #include "regs/xe_gt_regs.h"
 
+#define POLL_PERIOD_MS 5
+
 static size_t per_xecore_buf_size = SZ_512K;
 
 struct per_xecore_buf {
@@ -37,22 +40,27 @@ struct per_xecore_buf {
 };
 
 struct xe_eu_stall_data_stream {
+	bool pollin;
 	bool enabled;
 	int wait_num_reports;
 	int sampling_rate_mult;
+	wait_queue_head_t poll_wq;
 	size_t data_record_size;
 	size_t per_xecore_buf_size;
 
 	struct xe_gt *gt;
 	struct xe_bo *bo;
 	struct per_xecore_buf *xecore_buf;
+	struct delayed_work buf_poll_work;
 };
 
 struct xe_eu_stall_gt {
 	/* Lock to protect stream */
 	struct mutex stream_lock;
 	/* EU stall data stream */
 	struct xe_eu_stall_data_stream *stream;
+	/* Workqueue to schedule buffer pointers polling work */
+	struct workqueue_struct *buf_ptr_poll_wq;
 };
 
 /**
@@ -114,6 +122,7 @@ static void xe_eu_stall_fini(void *arg)
 {
 	struct xe_gt *gt = arg;
 
+	destroy_workqueue(gt->eu_stall->buf_ptr_poll_wq);
 	mutex_destroy(&gt->eu_stall->stream_lock);
 	kfree(gt->eu_stall);
 }
@@ -139,11 +148,19 @@ int xe_eu_stall_init(struct xe_gt *gt)
 
 	mutex_init(&gt->eu_stall->stream_lock);
 
+	gt->eu_stall->buf_ptr_poll_wq = alloc_ordered_workqueue("xe_eu_stall", 0);
+	if (!gt->eu_stall->buf_ptr_poll_wq) {
+		ret = -ENOMEM;
+		goto exit_free;
+	}
+
 	ret = devm_add_action_or_reset(xe->drm.dev, xe_eu_stall_fini, gt);
 	if (ret)
-		goto exit_free;
+		goto exit_destroy;
 
 	return 0;
+exit_destroy:
+	destroy_workqueue(gt->eu_stall->buf_ptr_poll_wq);
 exit_free:
 	mutex_destroy(&gt->eu_stall->stream_lock);
 	kfree(gt->eu_stall);
@@ -248,14 +265,214 @@ static int xe_eu_stall_user_extensions(struct xe_device *xe, u64 extension,
 	return 0;
 }
 
+/**
+ * buf_data_size - Calculate the number of bytes in a circular buffer
+ *		   given the read and write pointers and the size of
+ *		   the buffer.
+ *
+ * @buf_size: Size of the circular buffer
+ * @read_ptr: Read pointer with an additional overflow bit
+ * @write_ptr: Write pointer with an additional overflow bit
+ *
+ * Since the read and write pointers have an additional overflow bit,
+ * this function calculates the offsets from the pointers and uses the
+ * offsets to calculate the data size in the buffer.
+ *
+ * Returns: number of bytes of data in the buffer
+ */
+static u32 buf_data_size(size_t buf_size, u32 read_ptr, u32 write_ptr)
+{
+	u32 read_offset, write_offset, size = 0;
+
+	if (read_ptr == write_ptr)
+		goto exit;
+
+	read_offset = read_ptr & (buf_size - 1);
+	write_offset = write_ptr & (buf_size - 1);
+
+	if (write_offset > read_offset)
+		size = write_offset - read_offset;
+	else
+		size = buf_size - read_offset + write_offset;
+exit:
+	return size;
+}
+
+/**
+ * eu_stall_data_buf_poll - Poll for EU stall data in the buffer.
+ *
+ * @stream: xe EU stall data stream instance
+ *
+ * Returns: true if the EU stall buffer contains minimum stall data as
+ * specified by the event report count, else false.
+ */
+static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
+{
+	u32 read_ptr, write_ptr_reg, write_ptr, total_data = 0;
+	u32 buf_size = stream->per_xecore_buf_size;
+	struct per_xecore_buf *xecore_buf;
+	struct xe_gt *gt = stream->gt;
+	bool min_data_present = false;
+	u16 group, instance;
+	unsigned int xecore;
+
+	mutex_lock(&gt->eu_stall->stream_lock);
+	for_each_dss_steering(xecore, gt, group, instance) {
+		xecore_buf = &stream->xecore_buf[xecore];
+		read_ptr = xecore_buf->read;
+		write_ptr_reg = xe_gt_mcr_unicast_read(gt, XEHPC_EUSTALL_REPORT,
+						       group, instance);
+		write_ptr = REG_FIELD_GET(XEHPC_EUSTALL_REPORT_WRITE_PTR_MASK, write_ptr_reg);
+		write_ptr <<= 6;
+		write_ptr &= ((buf_size << 1) - 1);
+		if (!min_data_present) {
+			total_data += buf_data_size(buf_size, read_ptr, write_ptr);
+			if (num_data_rows(total_data) >= stream->wait_num_reports)
+				min_data_present = true;
+		}
+		xecore_buf->write = write_ptr;
+	}
+	mutex_unlock(&gt->eu_stall->stream_lock);
+
+	return min_data_present;
+}
+
+static int xe_eu_stall_data_buf_read(struct xe_eu_stall_data_stream *stream,
+				     char __user *buf, size_t count,
+				     size_t *total_data_size, struct xe_gt *gt,
+				     u16 group, u16 instance, unsigned int xecore)
+{
+	size_t read_data_size, copy_size, buf_size;
+	u32 read_ptr_reg, read_ptr, write_ptr;
+	u8 *xecore_start_vaddr, *read_vaddr;
+	struct per_xecore_buf *xecore_buf;
+	u32 read_offset, write_offset;
+
+	/* Hardware increments the read and write pointers such that they can
+	 * overflow into one additional bit. For example, a 256KB size buffer
+	 * offset pointer needs 18 bits. But HW uses 19 bits for the read and
+	 * write pointers. This technique avoids wasting a slot in the buffer.
+	 * Read and write offsets are calculated from the pointers in order to
+	 * check if the write pointer has wrapped around the array.
+	 */
+	xecore_buf = &stream->xecore_buf[xecore];
+	xecore_start_vaddr = xecore_buf->vaddr;
+	read_ptr = xecore_buf->read;
+	write_ptr = xecore_buf->write;
+	buf_size = stream->per_xecore_buf_size;
+
+	read_data_size = buf_data_size(buf_size, read_ptr, write_ptr);
+	/* Read only the data that the user space buffer can accommodate */
+	read_data_size = min_t(size_t, count - *total_data_size, read_data_size);
+	if (read_data_size == 0)
+		return 0;
+
+	read_offset = read_ptr & (buf_size - 1);
+	write_offset = write_ptr & (buf_size - 1);
+	read_vaddr = xecore_start_vaddr + read_offset;
+
+	if (write_offset > read_offset) {
+		if (copy_to_user(buf + *total_data_size, read_vaddr, read_data_size))
+			return -EFAULT;
+	} else {
+		if (read_data_size >= buf_size - read_offset)
+			copy_size = buf_size - read_offset;
+		else
+			copy_size = read_data_size;
+		if (copy_to_user(buf + *total_data_size, read_vaddr, copy_size))
+			return -EFAULT;
+		if (copy_to_user(buf + *total_data_size + copy_size,
+				 xecore_start_vaddr, read_data_size - copy_size))
+			return -EFAULT;
+	}
+
+	*total_data_size += read_data_size;
+	read_ptr += read_data_size;
+
+	/* Read pointer can overflow into one additional bit */
+	read_ptr &= (buf_size << 1) - 1;
+	read_ptr_reg = REG_FIELD_PREP(XEHPC_EUSTALL_REPORT1_READ_PTR_MASK, (read_ptr >> 6));
+	read_ptr_reg = _MASKED_FIELD(XEHPC_EUSTALL_REPORT1_READ_PTR_MASK, read_ptr_reg);
+	xe_gt_mcr_unicast_write(gt, XEHPC_EUSTALL_REPORT1, read_ptr_reg, group, instance);
+	xecore_buf->read = read_ptr;
+	trace_xe_eu_stall_data_read(group, instance, read_ptr, write_ptr,
+				    read_data_size, *total_data_size);
+	return 0;
+}
+
+/**
+ * xe_eu_stall_stream_read_locked - copy EU stall counters data from the
+ *				    per xecore buffers to the userspace buffer
+ * @stream: A stream opened for EU stall count metrics
+ * @file: An xe EU stall data stream file
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ *
+ * Returns: Number of bytes copied or a negative error code
+ * If we've successfully copied any data then reporting that takes
+ * precedence over any internal error status, so the data isn't lost.
+ */
+static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *stream,
+					      struct file *file, char __user *buf,
+					      size_t count)
+{
+	struct xe_gt *gt = stream->gt;
+	size_t total_size = 0;
+	u16 group, instance;
+	unsigned int xecore;
+	int ret = 0;
+
+	for_each_dss_steering(xecore, gt, group, instance) {
+		ret = xe_eu_stall_data_buf_read(stream, buf, count, &total_size,
+						gt, group, instance, xecore);
+		if (ret || count == total_size)
+			break;
+	}
+	return total_size ?: (ret ?: -EAGAIN);
+}
+
 /*
  * Userspace must enable the EU stall stream with DRM_XE_OBSERVATION_IOCTL_ENABLE
  * before calling read().
  */
 static ssize_t xe_eu_stall_stream_read(struct file *file, char __user *buf,
 				       size_t count, loff_t *ppos)
 {
-	ssize_t ret = 0;
+	struct xe_eu_stall_data_stream *stream = file->private_data;
+	struct xe_gt *gt = stream->gt;
+	ssize_t ret, aligned_count;
+
+	aligned_count = ALIGN_DOWN(count, stream->data_record_size);
+	if (aligned_count == 0)
+		return -EINVAL;
+
+	if (!stream->enabled) {
+		xe_gt_dbg(gt, "EU stall data stream not enabled to read\n");
+		return -EINVAL;
+	}
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		do {
+			ret = wait_event_interruptible(stream->poll_wq, stream->pollin);
+			if (ret)
+				return -EINTR;
+
+			mutex_lock(&gt->eu_stall->stream_lock);
+			ret = xe_eu_stall_stream_read_locked(stream, file, buf, aligned_count);
+			mutex_unlock(&gt->eu_stall->stream_lock);
+		} while (ret == -EAGAIN);
+	} else {
+		mutex_lock(&gt->eu_stall->stream_lock);
+		ret = xe_eu_stall_stream_read_locked(stream, file, buf, aligned_count);
+		mutex_unlock(&gt->eu_stall->stream_lock);
+	}
+
+	/*
+	 * This may not work correctly if the user buffer is very small.
+	 * We don't want to block the next read() when there is data in the buffer
+	 * now, but couldn't be accommodated in the small user buffer.
+	 */
+	stream->pollin = false;
 
 	return ret;
 }
@@ -348,6 +565,21 @@ static int xe_eu_stall_stream_enable(struct xe_eu_stall_data_stream *stream)
 	return 0;
 }
 
+static void eu_stall_data_buf_poll_work_fn(struct work_struct *work)
+{
+	struct xe_eu_stall_data_stream *stream =
+		container_of(work, typeof(*stream), buf_poll_work.work);
+	struct xe_gt *gt = stream->gt;
+
+	if (eu_stall_data_buf_poll(stream)) {
+		stream->pollin = true;
+		wake_up(&stream->poll_wq);
+	}
+	queue_delayed_work(gt->eu_stall->buf_ptr_poll_wq,
+			   &stream->buf_poll_work,
+			   msecs_to_jiffies(POLL_PERIOD_MS));
+}
+
 static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
 				   struct eu_stall_open_properties *props)
 {
@@ -372,6 +604,9 @@ static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
 			  max_wait_num_reports);
 		return -EINVAL;
 	}
+
+	init_waitqueue_head(&stream->poll_wq);
+	INIT_DELAYED_WORK(&stream->buf_poll_work, eu_stall_data_buf_poll_work_fn);
 	stream->per_xecore_buf_size = per_xecore_buf_size;
 	stream->sampling_rate_mult = props->sampling_rate_mult;
 	stream->wait_num_reports = props->wait_num_reports;
@@ -389,15 +624,35 @@ static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
 	return 0;
 }
 
+static __poll_t xe_eu_stall_stream_poll_locked(struct xe_eu_stall_data_stream *stream,
+					       struct file *file, poll_table *wait)
+{
+	__poll_t events = 0;
+
+	poll_wait(file, &stream->poll_wq, wait);
+
+	if (stream->pollin)
+		events |= EPOLLIN;
+
+	return events;
+}
+
 static __poll_t xe_eu_stall_stream_poll(struct file *file, poll_table *wait)
 {
-	__poll_t ret = 0;
+	struct xe_eu_stall_data_stream *stream = file->private_data;
+	struct xe_gt *gt = stream->gt;
+	__poll_t ret;
+
+	mutex_lock(&gt->eu_stall->stream_lock);
+	ret = xe_eu_stall_stream_poll_locked(stream, file, wait);
+	mutex_unlock(&gt->eu_stall->stream_lock);
 
 	return ret;
 }
 
 static int xe_eu_stall_enable_locked(struct xe_eu_stall_data_stream *stream)
 {
+	struct xe_gt *gt = stream->gt;
 	int ret = 0;
 
 	if (stream->enabled)
@@ -406,6 +661,10 @@ static int xe_eu_stall_enable_locked(struct xe_eu_stall_data_stream *stream)
 	stream->enabled = true;
 
 	ret = xe_eu_stall_stream_enable(stream);
+
+	queue_delayed_work(gt->eu_stall->buf_ptr_poll_wq,
+			   &stream->buf_poll_work,
+			   msecs_to_jiffies(POLL_PERIOD_MS));
 	return ret;
 }
 
@@ -420,6 +679,8 @@ static int xe_eu_stall_disable_locked(struct xe_eu_stall_data_stream *stream)
 
 	xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_BASE, 0);
 
+	cancel_delayed_work_sync(&stream->buf_poll_work);
+
 	xe_force_wake_put(gt_to_fw(gt), XE_FW_RENDER);
 	xe_pm_runtime_put(gt_to_xe(gt));

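The pointer scheme in buf_data_size() and xe_eu_stall_data_buf_read() is easier to see in isolation. Each read/write pointer carries one bit beyond what the buffer offset needs, so a completely full buffer (equal offsets, different overflow bits) is distinguishable from an empty one (identical pointers) without keeping a slot unused. The standalone sketch below, a model under the assumption of a small power-of-two buffer, reproduces that arithmetic; the names and values are illustrative, not driver code.

/*
 * Standalone model of the overflow-bit circular buffer used above.
 * Not driver code; buf_size must be a power of two, as the
 * per-xecore buffers are (SZ_512K in the patch).
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Same computation as buf_data_size() in the patch */
static uint32_t model_data_size(size_t buf_size, uint32_t read_ptr, uint32_t write_ptr)
{
	uint32_t read_offset, write_offset;

	if (read_ptr == write_ptr)	/* identical pointers: buffer empty */
		return 0;

	read_offset = read_ptr & (buf_size - 1);
	write_offset = write_ptr & (buf_size - 1);

	if (write_offset > read_offset)
		return write_offset - read_offset;
	/* write side wrapped past the end; count both segments */
	return buf_size - read_offset + write_offset;
}

int main(void)
{
	const size_t buf_size = 256;	/* tiny stand-in for SZ_512K */

	/* Empty: pointers fully equal, including the overflow bit */
	assert(model_data_size(buf_size, 0, 0) == 0);

	/* Partially full, no wrap: 0x40 bytes between the offsets */
	assert(model_data_size(buf_size, 0x10, 0x50) == 0x40);

	/*
	 * Completely full: equal offsets but different overflow bits.
	 * Without the extra bit this state would be indistinguishable
	 * from empty, forcing one slot to be kept unused.
	 */
	assert(model_data_size(buf_size, 0x10, buf_size + 0x10) == buf_size);

	/* Reader consumes 0x40 bytes, masking modulo twice the buffer
	 * size exactly as the driver does with (buf_size << 1) - 1 */
	uint32_t read_ptr = (0x10 + 0x40) & ((buf_size << 1) - 1);
	assert(model_data_size(buf_size, read_ptr, buf_size + 0x10) == buf_size - 0x40);

	printf("overflow-bit buffer model OK\n");
	return 0;
}

All four asserts pass, which is precisely what the extra pointer bit buys: full and empty become distinct states, and the same masks the driver applies keep the pointers consistent across wrap-around.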