-
Notifications
You must be signed in to change notification settings - Fork 19
Description
Here is a minimized testcase that causes uring
to segfault:
(* [consume uring] keeps calling [Uring.wait] with a one-second timeout
   until a call returns [None]; every [Some _] completion is discarded. *)
let consume uring =
  let rec drain () =
    match Uring.wait ~timeout:1.0 uring with
    | Some _ -> drain ()
    | None -> ()
  in
  drain ()
(* 500 fresh TCP/IPv4 sockets, each paired with its own destination address
   value (127.0.0.1, port 631). The address is built per element rather than
   shared, exactly as in the original reproducer. *)
let sockets =
  let make_endpoint _index =
    let sock = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 0 in
    let loopback = Unix.inet_addr_of_string "127.0.0.1" in
    (sock, Unix.ADDR_INET (loopback, 631))
  in
  Array.init 500 make_endpoint
(* The ring shared by the whole reproducer.
   NOTE(review): [queue_depth] (128) is smaller than the number of sockets
   (500); the report says that with 50 sockets the crash does not occur. *)
let uring = Uring.create ~queue_depth:128 ~polling_timeout:1000 ()
(* Entry point of the reproducer: queue a connect for every socket, then
   keep draining completions for as long as the ring reports active
   operations. *)
let () =
  (* Ask the ring to connect [sock] to [dest], tagging the operation with
     the user data [5]. When [Uring.connect] returns [None] (presumably no
     free submission slot - TODO confirm), wait once for a completion with
     a one-second timeout, discard it, and try again. *)
  let rec submit_connect sock dest =
    match Uring.connect uring sock dest 5 with
    | Some _ -> ()
    | None ->
      ignore (Uring.wait ~timeout:1.0 uring : _ Uring.completion_option);
      submit_connect sock dest
  in
  Array.iter (fun (sock, dest) -> submit_connect sock dest) sockets;
  while Uring.active_ops uring > 0 do
    consume uring
  done
Compile and run like this (assuming you have something listening on port 631 localhost, if not pick another port...):
$ ocamlfind ocamlopt -package unix,uring conn.ml -linkpkg -o conn
$ ./conn
[1] 112639 segmentation fault (core dumped) ./conn
Software and hardware environment:
$ ocamlopt --version
5.0.0
$ uname -a
Linux fedora38.fedora 6.2.14-300.fc38.x86_64 #1 SMP PREEMPT_DYNAMIC Mon May 1 00:55:28 UTC 2023 x86_64 GNU/Linux
$ inxi
CPU: 16-core AMD Ryzen 9 7950X (-MT MCP-) speed/min/max: 3131/3000/5880 MHz
Kernel: 6.2.14-300.fc38.x86_64 x86_64 Up: 10h 4m Mem: 5939.9/63423.8 MiB
(9.4%) Storage: 24.99 TiB (4.0% used) Procs: 536 Shell: Zsh inxi: 3.3.26
What I tried:
- changing the number of sockets from 500 to 50: no crash
- removing the `~timeout` flag from `Uring.wait`: no crash, but stuck
- removing the `~polling_timeout` flag: no crash, but stuck
- using the debug runtime: didn't provide more info about the crash
Output of `coredumpctl gdb`:
Core was generated by `_build/default/loadtest_uring.exe http://localhost:631'.
Program terminated with signal SIGSEGV, Segmentation fault.
warning: Section `.reg-xstate/113639' in core file too small.
#0 0x00000000004c7872 in ocaml_uring_wait_cqe_timeout (v_timeout=<optimized out>, v_uring=<optimized out>) at uring_stubs.c:688
688 id = (long)io_uring_cqe_get_data(cqe);
[Current thread is 1 (Thread 0x7f17e38da780 (LWP 113639))]
(gdb) bt
#0 0x00000000004c7872 in ocaml_uring_wait_cqe_timeout (v_timeout=<optimized out>, v_uring=<optimized out>) at uring_stubs.c:688
#1 <signal handler called>
#2 0x0000000000453007 in camlUring__fun_3000 () at uring.0.5/lib/uring/uring.ml:502
#3 0x0000000000452e15 in camlUring__fn_on_ring_2135 () at uring.0.5/lib/uring/uring.ml:484
#4 0x0000000000452f81 in camlUring__wait_2146 () at uring.0.5/lib/uring/uring.ml:502
#5 0x000000000044fc38 in camlDune__exe__Loadtest_uring__loop_589 ()
#6 0x0000000000487865 in camlStdlib__Array__iter_329 () at array.ml:92
#7 0x000000000044fd1a in camlDune__exe__Loadtest_uring__entry ()
#8 0x000000000044c940 in caml_program ()
#9 <signal handler called>
#10 0x00000000004f86cb in caml_startup_common (pooling=<optimized out>, argv=0x7ffe5f990e88) at runtime/startup_nat.c:129
#11 caml_startup_common (argv=0x7ffe5f990e88, pooling=<optimized out>) at runtime/startup_nat.c:85
#12 0x00000000004f875b in caml_startup_exn (argv=<optimized out>) at runtime/startup_nat.c:136
#13 caml_startup (argv=<optimized out>) at runtime/startup_nat.c:141
#14 caml_main (argv=<optimized out>) at runtime/startup_nat.c:148
#15 0x000000000044c52c in main (argc=<optimized out>, argv=<optimized out>) at runtime/main.c:37
(gdb) print cqe
$1 = (struct io_uring_cqe *) 0x0
The following change avoids the crash, but I don't know if it is correct (the program is still stuck, so there are likely still bugs in the way I've written it):
git diff
diff --git a/lib/uring/uring_stubs.c b/lib/uring/uring_stubs.c
index 8e050dc07e..ef02b49008 100644
--- a/lib/uring/uring_stubs.c
+++ b/lib/uring/uring_stubs.c
@@ -687,6 +687,8 @@ value ocaml_uring_wait_cqe_timeout(value v_timeout, value v_uring)
unix_error(-res, "io_uring_wait_cqe_timeout", Nothing);
}
} else {
+ if (!cqe)
+ CAMLreturn(Val_cqe_none);
id = (long)io_uring_cqe_get_data(cqe);
io_uring_cqe_seen(ring, cqe);
CAMLreturn(Val_cqe_some(Val_int(id), Val_int(cqe->res)));
(If I change the number of sockets from 500 to 50, it doesn't get stuck, doesn't crash, and finishes.)
In general the C stub appears to be very unsafe, and I think it'll take a lot more than the above change to make it safe, even in a single-threaded, single-domain application. For example, it passes a `Sockaddr_val` pointer to `io_uring_prep_connect`, which just puts the pointer into the ring for the kernel to read later. But the OCaml GC may move that value from the minor heap to the major heap in the meantime, leaving the pointer pointing at stale data, etc.