@@ -19963,12 +19963,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
 #ifndef GGML_USE_OPENMP
 
+// check if thread is active
 static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
     int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
     return (state->ith < n_threads);
 }
 
+// check if thread is ready to proceed (exit from polling or sleeping)
 static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
@@ -19984,6 +19986,14 @@ static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * s
     return state->pending;
 }
 
+// sync thread state after polling
+static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+    // this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
+    // so instead we just use a dummy read-modify-write
+    atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
+}
+
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
@@ -20008,6 +20018,7 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
     struct ggml_threadpool * threadpool = state->threadpool;
 
     if (ggml_graph_compute_poll_for_work(state)) {
+        ggml_graph_compute_thread_sync(state);
         return state->pending;
     }
 
@@ -20063,7 +20074,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
 // Start processing new graph
 static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
 {
-    // always take the mutex here because the worker threads are doing hybrid poll/wait
+    // Always take the mutex here because the worker threads are doing hybrid poll/wait
 
     ggml_mutex_lock(&threadpool->mutex);
 
@@ -20072,7 +20083,9 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
     // Update the number of active threads
     atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
 
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
+    // Indicate the graph is ready to be processed
+    // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
+    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
 
     if (threadpool->pause) {
         // Update main thread prio and affinity to match the threadpool settings
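
For context, the reasoning behind the dummy read-modify-write in ggml_graph_compute_thread_sync can be shown with a minimal standalone sketch. This is not part of the patch: the names payload, worker, and n_graph below are illustrative, and the sketch only mirrors the pattern (relaxed polling, then a seq_cst fetch_add of 0 standing in for atomic_thread_fence, paired with the kickoff's seq_cst increment).

// Minimal sketch (assumed names, not ggml code): a publisher bumps a seq_cst
// counter, and a polling consumer issues a dummy seq_cst read-modify-write
// instead of atomic_thread_fence(memory_order_seq_cst) before reading data.
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int n_graph;   // generation counter, bumped with seq_cst
static int        payload;   // plain data "published" before the bump

static void * worker(void * arg) {
    (void) arg;
    // poll with a relaxed load, like ggml_graph_compute_poll_for_work
    while (atomic_load_explicit(&n_graph, memory_order_relaxed) == 0) { /* spin */ }

    // sync point: the dummy RMW acquires the publisher's seq_cst increment,
    // so the write to payload happens-before the read below
    atomic_fetch_add_explicit(&n_graph, 0, memory_order_seq_cst);

    printf("payload = %d\n", payload); // prints 42
    return NULL;
}

int main(void) {
    pthread_t t;
    pthread_create(&t, NULL, worker, NULL);

    payload = 42;                                                  // publish data
    atomic_fetch_add_explicit(&n_graph, 1, memory_order_seq_cst);  // signal readiness

    pthread_join(t, NULL);
    return 0;
}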