Skip to content

Commit 194bb58

Browse files
committed
io_uring: add support for futex wake and wait
Add support for FUTEX_WAKE/WAIT primitives. IORING_OP_FUTEX_WAKE is mix of FUTEX_WAKE and FUTEX_WAKE_BITSET, as it does support passing in a bitset. Similary, IORING_OP_FUTEX_WAIT is a mix of FUTEX_WAIT and FUTEX_WAIT_BITSET. For both of them, they are using the futex2 interface. FUTEX_WAKE is straight forward, as those can always be done directly from the io_uring submission without needing async handling. For FUTEX_WAIT, things are a bit more complicated. If the futex isn't ready, then we rely on a callback via futex_queue->wake() when someone wakes up the futex. From that calback, we queue up task_work with the original task, which will post a CQE and wake it, if necessary. Cancelations are supported, both from the application point-of-view, but also to be able to cancel pending waits if the ring exits before all events have occurred. The return value of futex_unqueue() is used to gate who wins the potential race between cancelation and futex wakeups. Whomever gets a 'ret == 1' return from that claims ownership of the io_uring futex request. This is just the barebones wait/wake support. PI or REQUEUE support is not added at this point, unclear if we might look into that later. Likewise, explicit timeouts are not supported either. It is expected that users that need timeouts would do so via the usual io_uring mechanism to do that using linked timeouts. The SQE format is as follows: `addr` Address of futex `fd` futex2(2) FUTEX2_* flags `futex_flags` io_uring specific command flags. None valid now. `addr2` Value of futex `addr3` Mask to wake/wait Acked-by: Peter Zijlstra (Intel) <[email protected]> Signed-off-by: Jens Axboe <[email protected]>
1 parent e52c434 commit 194bb58

File tree

9 files changed

+317
-0
lines changed

9 files changed

+317
-0
lines changed

include/linux/io_uring_types.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,11 @@ struct io_ring_ctx {
321321

322322
struct hlist_head waitid_list;
323323

324+
#ifdef CONFIG_FUTEX
325+
struct hlist_head futex_list;
326+
struct io_alloc_cache futex_cache;
327+
#endif
328+
324329
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
325330
struct io_sq_data *sq_data; /* if using sq thread polling */
326331

include/uapi/linux/io_uring.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ struct io_uring_sqe {
6666
__u32 msg_ring_flags;
6767
__u32 uring_cmd_flags;
6868
__u32 waitid_flags;
69+
__u32 futex_flags;
6970
};
7071
__u64 user_data; /* data to be passed back at completion time */
7172
/* pack this to avoid bogus arm OABI complaints */
@@ -243,6 +244,8 @@ enum io_uring_op {
243244
IORING_OP_SENDMSG_ZC,
244245
IORING_OP_READ_MULTISHOT,
245246
IORING_OP_WAITID,
247+
IORING_OP_FUTEX_WAIT,
248+
IORING_OP_FUTEX_WAKE,
246249

247250
/* this goes last, obviously */
248251
IORING_OP_LAST,

io_uring/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
1010
cancel.o kbuf.o rsrc.o rw.o opdef.o \
1111
notif.o waitid.o
1212
obj-$(CONFIG_IO_WQ) += io-wq.o
13+
obj-$(CONFIG_FUTEX) += futex.o

io_uring/cancel.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "poll.h"
1717
#include "timeout.h"
1818
#include "waitid.h"
19+
#include "futex.h"
1920
#include "cancel.h"
2021

2122
struct io_cancel {
@@ -124,6 +125,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
124125
if (ret != -ENOENT)
125126
return ret;
126127

128+
ret = io_futex_cancel(ctx, cd, issue_flags);
129+
if (ret != -ENOENT)
130+
return ret;
131+
127132
spin_lock(&ctx->completion_lock);
128133
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
129134
ret = io_timeout_cancel(ctx, cd);

io_uring/cancel.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
// SPDX-License-Identifier: GPL-2.0
2+
#ifndef IORING_CANCEL_H
3+
#define IORING_CANCEL_H
24

35
#include <linux/io_uring_types.h>
46

@@ -22,3 +24,5 @@ void init_hash_table(struct io_hash_table *table, unsigned size);
2224

2325
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
2426
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
27+
28+
#endif

io_uring/futex.c

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
#include <linux/kernel.h>
3+
#include <linux/errno.h>
4+
#include <linux/fs.h>
5+
#include <linux/file.h>
6+
#include <linux/io_uring.h>
7+
8+
#include <uapi/linux/io_uring.h>
9+
10+
#include "../kernel/futex/futex.h"
11+
#include "io_uring.h"
12+
#include "rsrc.h"
13+
#include "futex.h"
14+
15+
struct io_futex {
16+
struct file *file;
17+
u32 __user *uaddr;
18+
unsigned long futex_val;
19+
unsigned long futex_mask;
20+
u32 futex_flags;
21+
};
22+
23+
struct io_futex_data {
24+
union {
25+
struct futex_q q;
26+
struct io_cache_entry cache;
27+
};
28+
struct io_kiocb *req;
29+
};
30+
31+
void io_futex_cache_init(struct io_ring_ctx *ctx)
32+
{
33+
io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX,
34+
sizeof(struct io_futex_data));
35+
}
36+
37+
static void io_futex_cache_entry_free(struct io_cache_entry *entry)
38+
{
39+
kfree(container_of(entry, struct io_futex_data, cache));
40+
}
41+
42+
void io_futex_cache_free(struct io_ring_ctx *ctx)
43+
{
44+
io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free);
45+
}
46+
47+
static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
48+
{
49+
struct io_futex_data *ifd = req->async_data;
50+
struct io_ring_ctx *ctx = req->ctx;
51+
52+
io_tw_lock(ctx, ts);
53+
if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache))
54+
kfree(ifd);
55+
req->async_data = NULL;
56+
hlist_del_init(&req->hash_node);
57+
io_req_task_complete(req, ts);
58+
}
59+
60+
static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
61+
{
62+
struct io_futex_data *ifd = req->async_data;
63+
64+
/* futex wake already done or in progress */
65+
if (!futex_unqueue(&ifd->q))
66+
return false;
67+
68+
hlist_del_init(&req->hash_node);
69+
io_req_set_res(req, -ECANCELED, 0);
70+
req->io_task_work.func = io_futex_complete;
71+
io_req_task_work_add(req);
72+
return true;
73+
}
74+
75+
int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
76+
unsigned int issue_flags)
77+
{
78+
struct hlist_node *tmp;
79+
struct io_kiocb *req;
80+
int nr = 0;
81+
82+
if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
83+
return -ENOENT;
84+
85+
io_ring_submit_lock(ctx, issue_flags);
86+
hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
87+
if (req->cqe.user_data != cd->data &&
88+
!(cd->flags & IORING_ASYNC_CANCEL_ANY))
89+
continue;
90+
if (__io_futex_cancel(ctx, req))
91+
nr++;
92+
if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
93+
break;
94+
}
95+
io_ring_submit_unlock(ctx, issue_flags);
96+
97+
if (nr)
98+
return nr;
99+
100+
return -ENOENT;
101+
}
102+
103+
bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
104+
bool cancel_all)
105+
{
106+
struct hlist_node *tmp;
107+
struct io_kiocb *req;
108+
bool found = false;
109+
110+
lockdep_assert_held(&ctx->uring_lock);
111+
112+
hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) {
113+
if (!io_match_task_safe(req, task, cancel_all))
114+
continue;
115+
__io_futex_cancel(ctx, req);
116+
found = true;
117+
}
118+
119+
return found;
120+
}
121+
122+
int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
123+
{
124+
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
125+
u32 flags;
126+
127+
if (unlikely(sqe->len || sqe->futex_flags || sqe->buf_index ||
128+
sqe->file_index))
129+
return -EINVAL;
130+
131+
iof->uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
132+
iof->futex_val = READ_ONCE(sqe->addr2);
133+
iof->futex_mask = READ_ONCE(sqe->addr3);
134+
flags = READ_ONCE(sqe->fd);
135+
136+
if (flags & ~FUTEX2_VALID_MASK)
137+
return -EINVAL;
138+
139+
iof->futex_flags = futex2_to_flags(flags);
140+
if (!futex_flags_valid(iof->futex_flags))
141+
return -EINVAL;
142+
143+
if (!futex_validate_input(iof->futex_flags, iof->futex_val) ||
144+
!futex_validate_input(iof->futex_flags, iof->futex_mask))
145+
return -EINVAL;
146+
147+
return 0;
148+
}
149+
150+
static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
151+
{
152+
struct io_futex_data *ifd = container_of(q, struct io_futex_data, q);
153+
struct io_kiocb *req = ifd->req;
154+
155+
if (unlikely(!__futex_wake_mark(q)))
156+
return;
157+
158+
io_req_set_res(req, 0, 0);
159+
req->io_task_work.func = io_futex_complete;
160+
io_req_task_work_add(req);
161+
}
162+
163+
static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
164+
{
165+
struct io_cache_entry *entry;
166+
167+
entry = io_alloc_cache_get(&ctx->futex_cache);
168+
if (entry)
169+
return container_of(entry, struct io_futex_data, cache);
170+
171+
return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
172+
}
173+
174+
int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
175+
{
176+
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
177+
struct io_ring_ctx *ctx = req->ctx;
178+
struct io_futex_data *ifd = NULL;
179+
struct futex_hash_bucket *hb;
180+
int ret;
181+
182+
if (!iof->futex_mask) {
183+
ret = -EINVAL;
184+
goto done;
185+
}
186+
187+
io_ring_submit_lock(ctx, issue_flags);
188+
ifd = io_alloc_ifd(ctx);
189+
if (!ifd) {
190+
ret = -ENOMEM;
191+
goto done_unlock;
192+
}
193+
194+
req->async_data = ifd;
195+
ifd->q = futex_q_init;
196+
ifd->q.bitset = iof->futex_mask;
197+
ifd->q.wake = io_futex_wake_fn;
198+
ifd->req = req;
199+
200+
ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags,
201+
&ifd->q, &hb);
202+
if (!ret) {
203+
hlist_add_head(&req->hash_node, &ctx->futex_list);
204+
io_ring_submit_unlock(ctx, issue_flags);
205+
206+
futex_queue(&ifd->q, hb);
207+
return IOU_ISSUE_SKIP_COMPLETE;
208+
}
209+
210+
done_unlock:
211+
io_ring_submit_unlock(ctx, issue_flags);
212+
done:
213+
if (ret < 0)
214+
req_set_fail(req);
215+
io_req_set_res(req, ret, 0);
216+
kfree(ifd);
217+
return IOU_OK;
218+
}
219+
220+
int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
221+
{
222+
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
223+
int ret;
224+
225+
/*
226+
* Strict flags - ensure that waking 0 futexes yields a 0 result.
227+
* See commit 43adf8449510 ("futex: FLAGS_STRICT") for details.
228+
*/
229+
ret = futex_wake(iof->uaddr, FLAGS_STRICT | iof->futex_flags,
230+
iof->futex_val, iof->futex_mask);
231+
if (ret < 0)
232+
req_set_fail(req);
233+
io_req_set_res(req, ret, 0);
234+
return IOU_OK;
235+
}

io_uring/futex.h

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
3+
#include "cancel.h"
4+
5+
int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
6+
int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags);
7+
int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags);
8+
9+
#if defined(CONFIG_FUTEX)
10+
int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
11+
unsigned int issue_flags);
12+
bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
13+
bool cancel_all);
14+
void io_futex_cache_init(struct io_ring_ctx *ctx);
15+
void io_futex_cache_free(struct io_ring_ctx *ctx);
16+
#else
17+
static inline int io_futex_cancel(struct io_ring_ctx *ctx,
18+
struct io_cancel_data *cd,
19+
unsigned int issue_flags)
20+
{
21+
return 0;
22+
}
23+
static inline bool io_futex_remove_all(struct io_ring_ctx *ctx,
24+
struct task_struct *task, bool cancel_all)
25+
{
26+
return false;
27+
}
28+
static inline void io_futex_cache_init(struct io_ring_ctx *ctx)
29+
{
30+
}
31+
static inline void io_futex_cache_free(struct io_ring_ctx *ctx)
32+
{
33+
}
34+
#endif

io_uring/io_uring.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@
9393
#include "net.h"
9494
#include "notif.h"
9595
#include "waitid.h"
96+
#include "futex.h"
9697

9798
#include "timeout.h"
9899
#include "poll.h"
@@ -330,6 +331,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
330331
sizeof(struct async_poll));
331332
io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
332333
sizeof(struct io_async_msghdr));
334+
io_futex_cache_init(ctx);
333335
init_completion(&ctx->ref_comp);
334336
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
335337
mutex_init(&ctx->uring_lock);
@@ -350,6 +352,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
350352
ctx->submit_state.free_list.next = NULL;
351353
INIT_WQ_LIST(&ctx->locked_free_list);
352354
INIT_HLIST_HEAD(&ctx->waitid_list);
355+
#ifdef CONFIG_FUTEX
356+
INIT_HLIST_HEAD(&ctx->futex_list);
357+
#endif
353358
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
354359
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
355360
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
@@ -2895,6 +2900,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
28952900
io_eventfd_unregister(ctx);
28962901
io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
28972902
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
2903+
io_futex_cache_free(ctx);
28982904
io_destroy_buffers(ctx);
28992905
mutex_unlock(&ctx->uring_lock);
29002906
if (ctx->sq_creds)
@@ -3338,6 +3344,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
33383344
mutex_lock(&ctx->uring_lock);
33393345
ret |= io_poll_remove_all(ctx, task, cancel_all);
33403346
ret |= io_waitid_remove_all(ctx, task, cancel_all);
3347+
ret |= io_futex_remove_all(ctx, task, cancel_all);
33413348
ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
33423349
mutex_unlock(&ctx->uring_lock);
33433350
ret |= io_kill_timeouts(ctx, task, cancel_all);

0 commit comments

Comments
 (0)