Skip to content

Commit 0f7f762

Browse files
[backports-release-1.11] 1.11 Profiling threading fix backports (#56358)
Fixes #56327
2 parents 6be0afc + 0495045 commit 0f7f762

File tree

2 files changed

+56
-14
lines changed

2 files changed

+56
-14
lines changed

src/signals-unix.c

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -410,9 +410,11 @@ pthread_mutex_t in_signal_lock; // shared with jl_delete_thread
410410
static bt_context_t *signal_context; // protected by in_signal_lock
411411
static int exit_signal_cond = -1;
412412
static int signal_caught_cond = -1;
413+
static int signals_inflight = 0;
413414

414415
int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
415416
{
417+
int err;
416418
pthread_mutex_lock(&in_signal_lock);
417419
jl_ptls_t ptls2 = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
418420
jl_task_t *ct2 = ptls2 ? jl_atomic_load_relaxed(&ptls2->current_task) : NULL;
@@ -421,24 +423,45 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
421423
pthread_mutex_unlock(&in_signal_lock);
422424
return 0;
423425
}
424-
sig_atomic_t request = 0;
425-
if (!jl_atomic_cmpswap(&ptls2->signal_request, &request, 1)) {
426+
while (signals_inflight) {
426427
// something is wrong, or there is already a usr2 in flight elsewhere
427-
pthread_mutex_unlock(&in_signal_lock);
428-
return 0;
428+
// try to wait for it to finish or wait for timeout
429+
struct pollfd event = {signal_caught_cond, POLLIN, 0};
430+
do {
431+
err = poll(&event, 1, timeout * 1000);
432+
} while (err == -1 && errno == EINTR);
433+
if (err == -1 || (event.revents & POLLIN) == 0) {
434+
// not ready after timeout: cancel this request
435+
pthread_mutex_unlock(&in_signal_lock);
436+
return 0;
437+
}
438+
// consume it before continuing
439+
eventfd_t got;
440+
do {
441+
err = read(signal_caught_cond, &got, sizeof(eventfd_t));
442+
} while (err == -1 && errno == EINTR);
443+
if (err != sizeof(eventfd_t)) abort();
444+
assert(signals_inflight >= got);
445+
signals_inflight -= got;
429446
}
447+
signals_inflight++;
448+
sig_atomic_t request = jl_atomic_exchange(&ptls2->signal_request, 1);
449+
assert(request == 0 || request == -1);
430450
request = 1;
431-
int err = pthread_kill(ptls2->system_id, SIGUSR2);
432-
// wait for thread to acknowledge or timeout
433-
struct pollfd event = {signal_caught_cond, POLLIN, 0};
451+
err = pthread_kill(ptls2->system_id, SIGUSR2);
434452
if (err == 0) {
453+
// wait for thread to acknowledge or timeout
454+
struct pollfd event = {signal_caught_cond, POLLIN, 0};
435455
do {
436456
err = poll(&event, 1, timeout * 1000);
437457
} while (err == -1 && errno == EINTR);
458+
if (err != 1 || (event.revents & POLLIN) == 0)
459+
err = -1;
438460
}
439-
if ((event.revents & POLLIN) == 0) {
461+
if (err == -1) {
440462
// not ready after timeout: try to cancel this request
441463
if (jl_atomic_cmpswap(&ptls2->signal_request, &request, 0)) {
464+
signals_inflight--;
442465
pthread_mutex_unlock(&in_signal_lock);
443466
return 0;
444467
}
@@ -448,11 +471,13 @@ int jl_thread_suspend_and_get_state(int tid, int timeout, bt_context_t *ctx)
448471
err = read(signal_caught_cond, &got, sizeof(eventfd_t));
449472
} while (err == -1 && errno == EINTR);
450473
if (err != sizeof(eventfd_t)) abort();
451-
assert(got == 1); (void) got;
474+
assert(signals_inflight >= got);
475+
signals_inflight -= got;
476+
signals_inflight++;
452477
// Now the other thread is waiting on exit_signal_cond (verify that here by
453478
// checking it is 0, and add an acquire barrier for good measure)
454479
request = jl_atomic_load_acquire(&ptls2->signal_request);
455-
assert(request == 0); (void) request;
480+
assert(request == 0 || request == -1); (void) request;
456481
jl_atomic_store_release(&ptls2->signal_request, 4); // prepare to resume normally, but later code may change this
457482
*ctx = *signal_context;
458483
return 1;
@@ -475,6 +500,7 @@ static void jl_try_deliver_sigint(void)
475500
jl_safepoint_enable_sigint();
476501
jl_wake_libuv();
477502
pthread_mutex_lock(&in_signal_lock);
503+
signals_inflight++;
478504
jl_atomic_store_release(&ptls2->signal_request, 2);
479505
// This also makes sure `sleep` is aborted.
480506
pthread_kill(ptls2->system_id, SIGUSR2);
@@ -511,6 +537,7 @@ static void jl_exit_thread0(int signo, jl_bt_element_t *bt_data, size_t bt_size)
511537
}
512538

513539
// request:
540+
// -1: processing
514541
// 0: nothing [not from here]
515542
// 1: get state & wait for request
516543
// 2: throw sigint if `!defer_signal && io_wait` or if force throw threshold
@@ -526,22 +553,36 @@ void usr2_handler(int sig, siginfo_t *info, void *ctx)
526553
if (ptls == NULL)
527554
return;
528555
int errno_save = errno;
529-
// acknowledge that we saw the signal_request
530-
sig_atomic_t request = jl_atomic_exchange(&ptls->signal_request, 0);
556+
sig_atomic_t request = jl_atomic_load(&ptls->signal_request);
557+
if (request == 0)
558+
return;
559+
if (!jl_atomic_cmpswap(&ptls->signal_request, &request, -1))
560+
return;
531561
if (request == 1) {
532562
signal_context = jl_to_bt_context(ctx);
563+
// acknowledge that we saw the signal_request and set signal_context
533564
int err;
534565
eventfd_t got = 1;
535566
err = write(signal_caught_cond, &got, sizeof(eventfd_t));
536567
if (err != sizeof(eventfd_t)) abort();
568+
sig_atomic_t processing = -1;
569+
jl_atomic_cmpswap(&ptls->signal_request, &processing, 0);
570+
// wait for exit signal
537571
do {
538572
err = read(exit_signal_cond, &got, sizeof(eventfd_t));
539573
} while (err == -1 && errno == EINTR);
540574
if (err != sizeof(eventfd_t)) abort();
541575
assert(got == 1);
542-
request = jl_atomic_exchange(&ptls->signal_request, 0);
576+
request = jl_atomic_exchange(&ptls->signal_request, -1);
577+
signal_context = NULL;
543578
assert(request == 2 || request == 3 || request == 4);
544579
}
580+
int err;
581+
eventfd_t got = 1;
582+
err = write(signal_caught_cond, &got, sizeof(eventfd_t));
583+
if (err != sizeof(eventfd_t)) abort();
584+
sig_atomic_t processing = -1;
585+
jl_atomic_cmpswap(&ptls->signal_request, &processing, 0);
545586
if (request == 2) {
546587
int force = jl_check_force_sigint();
547588
if (force || (!ptls->defer_signal && ptls->io_wait)) {

stdlib/Profile/test/runtests.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ let cmd = Base.julia_cmd()
168168
println("done")
169169
print(Profile.len_data())
170170
"""
171-
p = open(`$cmd -e $script`)
171+
# use multiple threads here to ensure that profiling works with threading
172+
p = open(`$cmd -t2 -e $script`)
172173
t = Timer(120) do t
173174
# should be under 10 seconds, so give it 2 minutes then report failure
174175
println("KILLING debuginfo registration test BY PROFILE TEST WATCHDOG\n")

0 commit comments

Comments
 (0)