Skip to content

Commit b1b968e

Browse files
authored
better signal handling (#55623)
Instead of relying on creating a fake stack frame, and hoping that no signal deliveries, kernel bugs, accidental gc_collect calls, or other issues occur during the delivery and execution of these calls, use the ability we added recently to emulate a longjmp into a unw_context, which eliminates any window of time in which an invalid state could exist. Secondly, when calling jl_exit_thread0_cb, we used to end up completely smashing the unwind info (with CFI_NORETURN), but this makes core files from SIGQUIT much less helpful, so we now have a `fake_stack_pop` function which contains the necessary CFI directives such that a minimal unwind from the debugger will likely still succeed up into the frames that were removed. We cannot do this perfectly on AArch64 since that platform's DWARF spec lacks the ability to do so. On other platforms, this should be possible to implement exactly (subject to libunwind implementation quality). This is thus currently only fully implemented for x86_64 on Apple Darwin.
1 parent 48b40ac commit b1b968e

File tree

10 files changed

+519
-412
lines changed

10 files changed

+519
-412
lines changed

src/jl_exported_funcs.inc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,6 @@
420420
XX(jl_set_zero_subnormals) \
421421
XX(jl_sigatomic_begin) \
422422
XX(jl_sigatomic_end) \
423-
XX(jl_sig_throw) \
424423
XX(jl_spawn) \
425424
XX(jl_specializations_get_linfo) \
426425
XX(jl_specializations_lookup) \

src/julia.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2310,7 +2310,6 @@ JL_DLLEXPORT int jl_set_task_tid(jl_task_t *task, int16_t tid) JL_NOTSAFEPOINT;
23102310
JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSAFEPOINT;
23112311
JL_DLLEXPORT void JL_NORETURN jl_throw(jl_value_t *e JL_MAYBE_UNROOTED);
23122312
JL_DLLEXPORT void JL_NORETURN jl_rethrow(void);
2313-
JL_DLLEXPORT void JL_NORETURN jl_sig_throw(void);
23142313
JL_DLLEXPORT void JL_NORETURN jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED);
23152314
JL_DLLEXPORT void JL_NORETURN jl_no_exc_handler(jl_value_t *e, jl_task_t *ct);
23162315
JL_DLLEXPORT JL_CONST_FUNC jl_gcframe_t **(jl_get_pgcstack)(void) JL_GLOBALLY_ROOTED JL_NOTSAFEPOINT;

src/julia_threads.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,9 @@ typedef struct _jl_tls_states_t {
187187
// Saved exception for previous *external* API call or NULL if cleared.
188188
// Access via jl_exception_occurred().
189189
struct _jl_value_t *previous_exception;
190+
#ifdef _OS_DARWIN_
191+
jl_jmp_buf *volatile safe_restore;
192+
#endif
190193

191194
// currently-held locks, to be released when an exception is thrown
192195
small_arraylist_t locks;

src/rtutils.c

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -269,25 +269,38 @@ JL_DLLEXPORT void jl_eh_restore_state(jl_task_t *ct, jl_handler_t *eh)
269269
// `eh` may be not equal to `ct->eh`. See `jl_pop_handler`
270270
// This function should **NOT** have any safepoint before the ones at the
271271
// end.
272-
sig_atomic_t old_defer_signal = ct->ptls->defer_signal;
272+
jl_ptls_t ptls = ct->ptls;
273+
sig_atomic_t old_defer_signal = ptls->defer_signal;
273274
ct->eh = eh->prev;
274275
ct->gcstack = eh->gcstack;
275-
small_arraylist_t *locks = &ct->ptls->locks;
276+
small_arraylist_t *locks = &ptls->locks;
276277
int unlocks = locks->len > eh->locks_len;
277278
if (unlocks) {
278279
for (size_t i = locks->len; i > eh->locks_len; i--)
279280
jl_mutex_unlock_nogc((jl_mutex_t*)locks->items[i - 1]);
280281
locks->len = eh->locks_len;
281282
}
282283
ct->world_age = eh->world_age;
283-
ct->ptls->defer_signal = eh->defer_signal;
284-
int8_t old_gc_state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
284+
ptls->defer_signal = eh->defer_signal;
285+
int8_t old_gc_state = jl_atomic_load_relaxed(&ptls->gc_state);
285286
if (old_gc_state != eh->gc_state)
286-
jl_atomic_store_release(&ct->ptls->gc_state, eh->gc_state);
287+
jl_atomic_store_release(&ptls->gc_state, eh->gc_state);
287288
if (!old_gc_state || !eh->gc_state) // it was or is unsafe now
288-
jl_gc_safepoint_(ct->ptls);
289+
jl_gc_safepoint_(ptls);
290+
jl_value_t *exception = ptls->sig_exception;
291+
if (exception) {
292+
int8_t oldstate = jl_gc_unsafe_enter(ptls);
293+
/* The temporary ptls->bt_data is rooted by special purpose code in the
294+
GC. This exists only for the purpose of preserving bt_data until we
295+
set ptls->bt_size=0 below. */
296+
jl_push_excstack(ct, &ct->excstack, exception,
297+
ptls->bt_data, ptls->bt_size);
298+
ptls->bt_size = 0;
299+
ptls->sig_exception = NULL;
300+
jl_gc_unsafe_leave(ptls, oldstate);
301+
}
289302
if (old_defer_signal && !eh->defer_signal)
290-
jl_sigint_safepoint(ct->ptls);
303+
jl_sigint_safepoint(ptls);
291304
if (jl_atomic_load_relaxed(&jl_gc_have_pending_finalizers) &&
292305
unlocks && eh->locks_len == 0) {
293306
jl_gc_run_pending_finalizers(ct);

src/signals-mach.c

Lines changed: 95 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -222,38 +222,92 @@ typedef arm_exception_state64_t host_exception_state_t;
222222
#define HOST_EXCEPTION_STATE_COUNT ARM_EXCEPTION_STATE64_COUNT
223223
#endif
224224

225-
static void jl_call_in_state(jl_ptls_t ptls2, host_thread_state_t *state,
226-
void (*fptr)(void))
225+
// create a fake function that describes the variable manipulations in jl_call_in_state
226+
__attribute__((naked)) static void fake_stack_pop(void)
227227
{
228228
#ifdef _CPU_X86_64_
229-
uintptr_t rsp = state->__rsp;
229+
__asm__ volatile (
230+
" .cfi_signal_frame\n"
231+
" .cfi_def_cfa %rsp, 0\n" // CFA here uses %rsp directly
232+
" .cfi_offset %rip, 0\n" // previous value of %rip at CFA
233+
" .cfi_offset %rsp, 8\n" // previous value of %rsp at CFA
234+
" nop\n"
235+
);
230236
#elif defined(_CPU_AARCH64_)
231-
uintptr_t rsp = state->__sp;
237+
__asm__ volatile (
238+
" .cfi_signal_frame\n"
239+
" .cfi_def_cfa sp, 0\n" // use sp as fp here
240+
" .cfi_offset lr, 0\n"
241+
" .cfi_offset sp, 8\n"
242+
// Anything else got smashed, since we didn't explicitly copy all of the
243+
// state object to the stack (to build a real sigreturn frame).
244+
// This is also not quite valid, since the AArch64 DWARF spec lacks the ability to define how to restore the LR register correctly,
245+
// so normally libunwind implementations on linux detect this function specially and hack around the invalid info:
246+
// https://github.com/llvm/llvm-project/commit/c82deed6764cbc63966374baf9721331901ca958
247+
" nop\n"
248+
);
232249
#else
233-
#error "julia: throw-in-context not supported on this platform"
250+
CFI_NORETURN
234251
#endif
235-
if (ptls2 == NULL || is_addr_on_sigstack(ptls2, (void*)rsp)) {
236-
rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment
237-
}
238-
else {
239-
rsp = (uintptr_t)ptls2->signal_stack + (ptls2->signal_stack_size ? ptls2->signal_stack_size : sig_stack_size);
240-
}
241-
assert(rsp % 16 == 0);
242-
rsp -= 16;
252+
}
243253

254+
static void jl_call_in_state(host_thread_state_t *state, void (*fptr)(void))
255+
{
244256
#ifdef _CPU_X86_64_
245-
rsp -= sizeof(void*);
246-
state->__rsp = rsp; // set stack pointer
257+
uintptr_t sp = state->__rsp;
258+
#elif defined(_CPU_AARCH64_)
259+
uintptr_t sp = state->__sp;
260+
#endif
261+
sp = (sp - 256) & ~(uintptr_t)15; // redzone and re-alignment
262+
assert(sp % 16 == 0);
263+
sp -= 16;
264+
#ifdef _CPU_X86_64_
265+
// set return address to NULL
266+
*(uintptr_t*)sp = 0;
267+
// pushq %sp
268+
sp -= sizeof(void*);
269+
*(uintptr_t*)sp = state->__rsp;
270+
// pushq %rip
271+
sp -= sizeof(void*);
272+
*(uintptr_t*)sp = state->__rip;
273+
// pushq .fake_stack_pop + 1; aka call from fake_stack_pop
274+
sp -= sizeof(void*);
275+
*(uintptr_t*)sp = (uintptr_t)&fake_stack_pop + 1;
276+
state->__rsp = sp; // set stack pointer
247277
state->__rip = (uint64_t)fptr; // "call" the function
248278
#elif defined(_CPU_AARCH64_)
249-
state->__sp = rsp;
250-
state->__pc = (uint64_t)fptr;
251-
state->__lr = 0;
279+
// push {%sp, %pc + 4}
280+
sp -= sizeof(void*);
281+
*(uintptr_t*)sp = state->__sp;
282+
sp -= sizeof(void*);
283+
*(uintptr_t*)sp = (uintptr_t)state->__pc;
284+
state->__sp = sp; // x31
285+
state->__pc = (uint64_t)fptr; // pc
286+
state->__lr = (uintptr_t)&fake_stack_pop + 4; // x30
252287
#else
253288
#error "julia: throw-in-context not supported on this platform"
254289
#endif
255290
}
256291

292+
static void jl_longjmp_in_state(host_thread_state_t *state, jl_jmp_buf jmpbuf)
293+
{
294+
295+
if (!jl_simulate_longjmp(jmpbuf, (bt_context_t*)state)) {
296+
// for sanitizer builds, fallback to calling longjmp on the original stack
297+
// (this will fail for stack overflow, but that is hardly sanitizer-legal anyways)
298+
#ifdef _CPU_X86_64_
299+
state->__rdi = (uintptr_t)jmpbuf;
300+
state->__rsi = 1;
301+
#elif defined(_CPU_AARCH64_)
302+
state->__x[0] = (uintptr_t)jmpbuf;
303+
state->__x[1] = 1;
304+
#else
305+
#error "julia: jl_longjmp_in_state not supported on this platform"
306+
#endif
307+
jl_call_in_state(state, (void (*)(void))longjmp);
308+
}
309+
}
310+
257311
#ifdef _CPU_X86_64_
258312
int is_write_fault(host_exception_state_t exc_state) {
259313
return exc_reg_is_write_fault(exc_state.__err);
@@ -275,25 +329,36 @@ static void jl_throw_in_thread(jl_ptls_t ptls2, mach_port_t thread, jl_value_t *
275329
host_thread_state_t state;
276330
kern_return_t ret = thread_get_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, &count);
277331
HANDLE_MACH_ERROR("thread_get_state", ret);
278-
if (1) { // XXX: !jl_has_safe_restore(ptls2)
332+
if (ptls2->safe_restore) {
333+
jl_longjmp_in_state(&state, *ptls2->safe_restore);
334+
}
335+
else {
279336
assert(exception);
280337
ptls2->bt_size =
281338
rec_backtrace_ctx(ptls2->bt_data, JL_MAX_BT_SIZE, (bt_context_t *)&state,
282-
NULL /*current_task?*/);
339+
NULL /*current_task?*/);
283340
ptls2->sig_exception = exception;
341+
ptls2->io_wait = 0;
342+
jl_task_t *ct = ptls2->current_task;
343+
jl_handler_t *eh = ct->eh;
344+
if (eh != NULL) {
345+
asan_unpoison_task_stack(ct, &eh->eh_ctx);
346+
jl_longjmp_in_state(&state, eh->eh_ctx);
347+
}
348+
else {
349+
jl_no_exc_handler(exception, ct);
350+
}
284351
}
285-
jl_call_in_state(ptls2, &state, &jl_sig_throw);
286352
ret = thread_set_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, count);
287353
HANDLE_MACH_ERROR("thread_set_state", ret);
288354
}
289355

290356
static void segv_handler(int sig, siginfo_t *info, void *context)
291357
{
292358
assert(sig == SIGSEGV || sig == SIGBUS);
293-
if (jl_get_safe_restore()) { // restarting jl_ or jl_unwind_stepn
294-
jl_task_t *ct = jl_get_current_task();
295-
jl_ptls_t ptls = ct == NULL ? NULL : ct->ptls;
296-
jl_call_in_state(ptls, (host_thread_state_t*)jl_to_bt_context(context), &jl_sig_throw);
359+
jl_jmp_buf *saferestore = jl_get_safe_restore();
360+
if (saferestore) { // restarting jl_ or jl_unwind_stepn
361+
jl_longjmp_in_state((host_thread_state_t*)jl_to_bt_context(context), *saferestore);
297362
return;
298363
}
299364
jl_task_t *ct = jl_get_current_task();
@@ -354,12 +419,10 @@ kern_return_t catch_mach_exception_raise(
354419
jl_safe_printf("ERROR: Exception handler triggered on unmanaged thread.\n");
355420
return KERN_INVALID_ARGUMENT;
356421
}
357-
// XXX: jl_throw_in_thread or segv_handler will eventually check this, but
358-
// we would like to avoid some of this work if we could detect this earlier
359-
// if (jl_has_safe_restore(ptls2)) {
360-
// jl_throw_in_thread(ptls2, thread, NULL);
361-
// return KERN_SUCCESS;
362-
// }
422+
if (ptls2->safe_restore) {
423+
jl_throw_in_thread(ptls2, thread, NULL);
424+
return KERN_SUCCESS;
425+
}
363426
if (jl_atomic_load_acquire(&ptls2->gc_state) == JL_GC_STATE_WAITING)
364427
return KERN_FAILURE;
365428
if (exception == EXC_ARITHMETIC) {
@@ -518,7 +581,6 @@ static void jl_try_deliver_sigint(void)
518581

519582
static void JL_NORETURN jl_exit_thread0_cb(int signo)
520583
{
521-
CFI_NORETURN
522584
jl_critical_error(signo, 0, NULL, jl_current_task);
523585
jl_atexit_hook(128);
524586
jl_raise(signo);
@@ -550,7 +612,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
550612
#else
551613
#error Fill in first integer argument here
552614
#endif
553-
jl_call_in_state(ptls2, &state, (void (*)(void))&jl_exit_thread0_cb);
615+
jl_call_in_state(&state, (void (*)(void))&jl_exit_thread0_cb);
554616
unsigned int count = MACH_THREAD_STATE_COUNT;
555617
ret = thread_set_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, count);
556618
HANDLE_MACH_ERROR("thread_set_state", ret);

0 commit comments

Comments
 (0)