Skip to content

Commit a2f8e47

Browse files
committed
better signal handling
Instead of relying on creating a fake stack frame, and on no signals being delivered, no kernel bugs being hit, no accidental gc_collect happening, and no other issues occurring during the delivery and execution of these calls, use the ability we added recently to emulate a longjmp into a unw_context, which eliminates any window of time in which an invalid state could exist. Secondly, when calling jl_exit_thread0_cb, we used to end up completely smashing the unwind info (with CFI_NOUNWIND), but this makes core files from SIGQUIT much less helpful, so we now have a `fake_stack_pop` function which contains the necessary CFI directives such that a minimal unwind from the debugger will likely still succeed up into the frames that were removed. We cannot do this perfectly on AArch64, since that platform's DWARF spec lacks the ability to do so. On other platforms, this should be possible to implement exactly (subject to libunwind implementation quality).
1 parent da3468c commit a2f8e47

File tree

10 files changed

+505
-411
lines changed

10 files changed

+505
-411
lines changed

src/jl_exported_funcs.inc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,6 @@
420420
XX(jl_set_zero_subnormals) \
421421
XX(jl_sigatomic_begin) \
422422
XX(jl_sigatomic_end) \
423-
XX(jl_sig_throw) \
424423
XX(jl_spawn) \
425424
XX(jl_specializations_get_linfo) \
426425
XX(jl_specializations_lookup) \

src/julia.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2310,7 +2310,6 @@ JL_DLLEXPORT int jl_set_task_tid(jl_task_t *task, int16_t tid) JL_NOTSAFEPOINT;
23102310
JL_DLLEXPORT int jl_set_task_threadpoolid(jl_task_t *task, int8_t tpid) JL_NOTSAFEPOINT;
23112311
JL_DLLEXPORT void JL_NORETURN jl_throw(jl_value_t *e JL_MAYBE_UNROOTED);
23122312
JL_DLLEXPORT void JL_NORETURN jl_rethrow(void);
2313-
JL_DLLEXPORT void JL_NORETURN jl_sig_throw(void);
23142313
JL_DLLEXPORT void JL_NORETURN jl_rethrow_other(jl_value_t *e JL_MAYBE_UNROOTED);
23152314
JL_DLLEXPORT void JL_NORETURN jl_no_exc_handler(jl_value_t *e, jl_task_t *ct);
23162315
JL_DLLEXPORT JL_CONST_FUNC jl_gcframe_t **(jl_get_pgcstack)(void) JL_GLOBALLY_ROOTED JL_NOTSAFEPOINT;

src/julia_threads.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,9 @@ typedef struct _jl_tls_states_t {
187187
// Saved exception for previous *external* API call or NULL if cleared.
188188
// Access via jl_exception_occurred().
189189
struct _jl_value_t *previous_exception;
190+
#ifdef _OS_DARWIN_
191+
jl_jmp_buf *volatile safe_restore;
192+
#endif
190193

191194
// currently-held locks, to be released when an exception is thrown
192195
small_arraylist_t locks;

src/rtutils.c

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -269,25 +269,38 @@ JL_DLLEXPORT void jl_eh_restore_state(jl_task_t *ct, jl_handler_t *eh)
269269
// `eh` may be not equal to `ct->eh`. See `jl_pop_handler`
270270
// This function should **NOT** have any safepoint before the ones at the
271271
// end.
272-
sig_atomic_t old_defer_signal = ct->ptls->defer_signal;
272+
jl_ptls_t ptls = ct->ptls;
273+
sig_atomic_t old_defer_signal = ptls->defer_signal;
273274
ct->eh = eh->prev;
274275
ct->gcstack = eh->gcstack;
275-
small_arraylist_t *locks = &ct->ptls->locks;
276+
small_arraylist_t *locks = &ptls->locks;
276277
int unlocks = locks->len > eh->locks_len;
277278
if (unlocks) {
278279
for (size_t i = locks->len; i > eh->locks_len; i--)
279280
jl_mutex_unlock_nogc((jl_mutex_t*)locks->items[i - 1]);
280281
locks->len = eh->locks_len;
281282
}
282283
ct->world_age = eh->world_age;
283-
ct->ptls->defer_signal = eh->defer_signal;
284-
int8_t old_gc_state = jl_atomic_load_relaxed(&ct->ptls->gc_state);
284+
ptls->defer_signal = eh->defer_signal;
285+
int8_t old_gc_state = jl_atomic_load_relaxed(&ptls->gc_state);
285286
if (old_gc_state != eh->gc_state)
286-
jl_atomic_store_release(&ct->ptls->gc_state, eh->gc_state);
287+
jl_atomic_store_release(&ptls->gc_state, eh->gc_state);
287288
if (!old_gc_state || !eh->gc_state) // it was or is unsafe now
288-
jl_gc_safepoint_(ct->ptls);
289+
jl_gc_safepoint_(ptls);
290+
jl_value_t *exception = ptls->sig_exception;
291+
if (exception) {
292+
int8_t oldstate = jl_gc_unsafe_enter(ptls);
293+
/* The temporary ptls->bt_data is rooted by special purpose code in the
294+
GC. This exists only for the purpose of preserving bt_data until we
295+
set ptls->bt_size=0 below. */
296+
jl_push_excstack(ct, &ct->excstack, exception,
297+
ptls->bt_data, ptls->bt_size);
298+
ptls->bt_size = 0;
299+
ptls->sig_exception = NULL;
300+
jl_gc_unsafe_leave(ptls, oldstate);
301+
}
289302
if (old_defer_signal && !eh->defer_signal)
290-
jl_sigint_safepoint(ct->ptls);
303+
jl_sigint_safepoint(ptls);
291304
if (jl_atomic_load_relaxed(&jl_gc_have_pending_finalizers) &&
292305
unlocks && eh->locks_len == 0) {
293306
jl_gc_run_pending_finalizers(ct);

src/signals-mach.c

Lines changed: 83 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -222,38 +222,81 @@ typedef arm_exception_state64_t host_exception_state_t;
222222
#define HOST_EXCEPTION_STATE_COUNT ARM_EXCEPTION_STATE64_COUNT
223223
#endif
224224

225-
static void jl_call_in_state(jl_ptls_t ptls2, host_thread_state_t *state,
226-
void (*fptr)(void))
225+
// create a fake function that describes the variable manipulations in jl_call_in_state
226+
__attribute__((naked)) static void fake_stack_pop(void)
227227
{
228228
#ifdef _CPU_X86_64_
229-
uintptr_t rsp = state->__rsp;
229+
__asm__ volatile (
230+
" .cfi_signal_frame\n"
231+
" .cfi_def_cfa %rsp, 0\n" // CFA here uses %rsp directly
232+
" .cfi_offset %rip, 0\n" // previous value of %rip at CFA
233+
" .cfi_offset %rsp, 8\n" // previous value of %rsp at CFA
234+
" nop\n"
235+
);
230236
#elif defined(_CPU_AARCH64_)
231-
uintptr_t rsp = state->__sp;
237+
__asm__ volatile (
238+
" .cfi_signal_frame\n"
239+
" .cfi_def_cfa sp, 0\n" // use sp as fp here
240+
" .cfi_offset lr, 0\n"
241+
" .cfi_offset sp, 8\n"
242+
// Anything else got smashed, since we didn't explicitly copy all of the
243+
// state object to the stack (to build a real sigreturn frame).
244+
// This is also not quite valid, since the AArch64 DWARF spec lacks the ability to define how to restore the LR register correctly,
245+
// so normally libunwind implementations on linux detect this function specially and hack around the invalid info:
246+
// https://github.com/llvm/llvm-project/commit/c82deed6764cbc63966374baf9721331901ca958
247+
" nop\n"
248+
);
232249
#else
233-
#error "julia: throw-in-context not supported on this platform"
250+
CFI_NORETURN
234251
#endif
235-
if (ptls2 == NULL || is_addr_on_sigstack(ptls2, (void*)rsp)) {
236-
rsp = (rsp - 256) & ~(uintptr_t)15; // redzone and re-alignment
237-
}
238-
else {
239-
rsp = (uintptr_t)ptls2->signal_stack + (ptls2->signal_stack_size ? ptls2->signal_stack_size : sig_stack_size);
240-
}
241-
assert(rsp % 16 == 0);
242-
rsp -= 16;
252+
}
243253

254+
static void jl_call_in_state(jl_ptls_t ptls2, host_thread_state_t *state,
255+
void (*fptr)(void))
256+
{
257+
#ifdef _CPU_X86_64_
258+
uintptr_t sp = state->__rsp;
259+
#elif defined(_CPU_AARCH64_)
260+
uintptr_t sp = state->__sp;
261+
#endif
262+
sp = (sp - 256) & ~(uintptr_t)15; // redzone and re-alignment
263+
assert(sp % 16 == 0);
264+
sp -= 16;
244265
#ifdef _CPU_X86_64_
245-
rsp -= sizeof(void*);
246-
state->__rsp = rsp; // set stack pointer
266+
// set return address to NULL
267+
*(uintptr_t*)sp = 0;
268+
// pushq %rsp
269+
sp -= sizeof(void*);
270+
*(uintptr_t*)sp = state->__rsp;
271+
// pushq %rip
272+
sp -= sizeof(void*);
273+
*(uintptr_t*)sp = state->__rip;
274+
// pushq .fake_stack_pop + 1; aka call from fake_stack_pop
275+
sp -= sizeof(void*);
276+
*(uintptr_t*)sp = (uintptr_t)&fake_stack_pop + 1;
277+
state->__rsp = sp; // set stack pointer
247278
state->__rip = (uint64_t)fptr; // "call" the function
248279
#elif defined(_CPU_AARCH64_)
249-
state->__sp = rsp;
250-
state->__pc = (uint64_t)fptr;
251-
state->__lr = 0;
280+
// push {%sp, %pc + 4}
281+
sp -= sizeof(void*);
282+
*(uintptr_t*)sp = state->__sp;
283+
sp -= sizeof(void*);
284+
*(uintptr_t*)sp = (uintptr_t)state->__pc;
285+
state->__sp = sp; // x31
286+
state->__pc = (uint64_t)fptr; // pc
287+
state->__lr = (uintptr_t)&fake_stack_pop + 4; // x30
252288
#else
253289
#error "julia: throw-in-context not supported on this platform"
254290
#endif
255291
}
256292

293+
static void jl_longjmp_in_state(host_thread_state_t *state, jl_jmp_buf jmpbuf)
294+
{
295+
296+
if (!jl_simulate_longjmp(jmpbuf, (bt_context_t*)state))
297+
abort(); // unreachable
298+
}
299+
257300
#ifdef _CPU_X86_64_
258301
int is_write_fault(host_exception_state_t exc_state) {
259302
return exc_reg_is_write_fault(exc_state.__err);
@@ -275,25 +318,36 @@ static void jl_throw_in_thread(jl_ptls_t ptls2, mach_port_t thread, jl_value_t *
275318
host_thread_state_t state;
276319
kern_return_t ret = thread_get_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, &count);
277320
HANDLE_MACH_ERROR("thread_get_state", ret);
278-
if (1) { // XXX: !jl_has_safe_restore(ptls2)
321+
if (ptls2->safe_restore) {
322+
jl_longjmp_in_state(&state, *ptls2->safe_restore);
323+
}
324+
else {
279325
assert(exception);
280326
ptls2->bt_size =
281327
rec_backtrace_ctx(ptls2->bt_data, JL_MAX_BT_SIZE, (bt_context_t *)&state,
282-
NULL /*current_task?*/);
328+
NULL /*current_task?*/);
283329
ptls2->sig_exception = exception;
330+
ptls2->io_wait = 0;
331+
jl_task_t *ct = ptls2->current_task;
332+
jl_handler_t *eh = ct->eh;
333+
if (eh != NULL) {
334+
asan_unpoison_task_stack(ct, &eh->eh_ctx);
335+
jl_longjmp_in_state(&state, eh->eh_ctx);
336+
}
337+
else {
338+
jl_no_exc_handler(exception, ct);
339+
}
284340
}
285-
jl_call_in_state(ptls2, &state, &jl_sig_throw);
286341
ret = thread_set_state(thread, MACH_THREAD_STATE, (thread_state_t)&state, count);
287342
HANDLE_MACH_ERROR("thread_set_state", ret);
288343
}
289344

290345
static void segv_handler(int sig, siginfo_t *info, void *context)
291346
{
292347
assert(sig == SIGSEGV || sig == SIGBUS);
293-
if (jl_get_safe_restore()) { // restarting jl_ or jl_unwind_stepn
294-
jl_task_t *ct = jl_get_current_task();
295-
jl_ptls_t ptls = ct == NULL ? NULL : ct->ptls;
296-
jl_call_in_state(ptls, (host_thread_state_t*)jl_to_bt_context(context), &jl_sig_throw);
348+
jl_jmp_buf *saferestore = jl_get_safe_restore();
349+
if (saferestore) { // restarting jl_ or jl_unwind_stepn
350+
jl_longjmp_in_state((host_thread_state_t*)jl_to_bt_context(context), *saferestore);
297351
return;
298352
}
299353
jl_task_t *ct = jl_get_current_task();
@@ -352,12 +406,10 @@ kern_return_t catch_mach_exception_raise(
352406
jl_safe_printf("ERROR: Exception handler triggered on unmanaged thread.\n");
353407
return KERN_INVALID_ARGUMENT;
354408
}
355-
// XXX: jl_throw_in_thread or segv_handler will eventually check this, but
356-
// we would like to avoid some of this work if we could detect this earlier
357-
// if (jl_has_safe_restore(ptls2)) {
358-
// jl_throw_in_thread(ptls2, thread, NULL);
359-
// return KERN_SUCCESS;
360-
// }
409+
if (ptls2->safe_restore) {
410+
jl_throw_in_thread(ptls2, thread, NULL);
411+
return KERN_SUCCESS;
412+
}
361413
if (jl_atomic_load_acquire(&ptls2->gc_state) == JL_GC_STATE_WAITING)
362414
return KERN_FAILURE;
363415
if (exception == EXC_ARITHMETIC) {
@@ -516,7 +568,6 @@ static void jl_try_deliver_sigint(void)
516568

517569
static void JL_NORETURN jl_exit_thread0_cb(int signo)
518570
{
519-
CFI_NORETURN
520571
jl_critical_error(signo, 0, NULL, jl_current_task);
521572
jl_atexit_hook(128);
522573
jl_raise(signo);

0 commit comments

Comments
 (0)