Skip to content

Commit b9763b3

Browse files
threadpool: improve thread sync for new-graphs
Using the same tricks as ggml_barrier. All the polling is done with relaxed memory order to keep it efficient; once the new graph is detected, we do a full fence using a read-modify-write with strict memory order.
1 parent c4411d5 commit b9763b3

File tree

1 file changed

+15
-2
lines changed

1 file changed

+15
-2
lines changed

ggml/src/ggml.c

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19963,12 +19963,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1996319963

1996419964
#ifndef GGML_USE_OPENMP
1996519965

19966+
// check if thread is active
1996619967
static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
1996719968
struct ggml_threadpool * threadpool = state->threadpool;
1996819969
int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
1996919970
return (state->ith < n_threads);
1997019971
}
1997119972

19973+
// check if thread is ready to proceed (exit from polling or sleeping)
1997219974
static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
1997319975
struct ggml_threadpool * threadpool = state->threadpool;
1997419976

@@ -19984,6 +19986,14 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
1998419986
return state->pending;
1998519987
}
1998619988

19989+
// sync thread state after polling
19990+
static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
19991+
struct ggml_threadpool * threadpool = state->threadpool;
19992+
// this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
19993+
// so instead we just use a dummy read-modify-write
19994+
atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
19995+
}
19996+
1998719997
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
1998819998
struct ggml_threadpool * threadpool = state->threadpool;
1998919999

@@ -20008,6 +20018,7 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
2000820018
struct ggml_threadpool * threadpool = state->threadpool;
2000920019

2001020020
if (ggml_graph_compute_poll_for_work(state)) {
20021+
ggml_graph_compute_thread_sync(state);
2001120022
return state->pending;
2001220023
}
2001320024

@@ -20063,7 +20074,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
2006320074
// Start processing new graph
2006420075
static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
2006520076
{
20066-
// always take the mutex here because the worker threads are doing hybrid poll/wait
20077+
// Always take the mutex here because the worker threads are doing hybrid poll/wait
2006720078

2006820079
ggml_mutex_lock(&threadpool->mutex);
2006920080

@@ -20072,7 +20083,9 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
2007220083
// Update the number of active threads
2007320084
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
2007420085

20075-
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
20086+
// Indicate the graph is ready to be processed
20087+
// We need the full seq-cst fence here because of the polling threads (used in thread_sync)
20088+
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
2007620089

2007720090
if (threadpool->pause) {
2007820091
// Update main thread prio and affinity to match the threadpool settings

0 commit comments

Comments
 (0)