
Commit 056c47d

Reapply "threadpool : skip polling for unused threads (ggml-org#9461)"
This reverts commit 2a8dbf8.
1 parent 8d789ac commit 056c47d

3 files changed: 168 additions, 48 deletions

ggml/src/ggml.c

Lines changed: 74 additions & 48 deletions
@@ -2014,10 +2014,11 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;         // Used for stopping the threadpool altogether
     atomic_bool pause;        // Used for pausing the threadpool or individual threads
+    atomic_bool abort;        // Used for aborting processing of a graph
 
     struct ggml_compute_state * workers;   // per thread state
     int          n_threads_max; // number of threads in the pool
-    int          n_threads_cur; // number of threads used in the current graph
+    atomic_int   n_threads_cur; // number of threads used in the current graph
 
     int32_t      prio;        // Scheduling priority
     uint32_t     poll;        // Polling level (0 - no polling)
@@ -3181,41 +3182,36 @@ inline static void ggml_critical_section_start(void) {
     }
 }
 
-#ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_threadpool * threadpool) {
-    if (threadpool->n_threads_cur == 1) {
+static void ggml_barrier(struct ggml_threadpool * tp) {
+    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
+    if (n_threads == 1) {
         return;
     }
 
+#ifdef GGML_USE_OPENMP
     #pragma omp barrier
-}
 #else
-static void ggml_barrier(struct ggml_threadpool * threadpool) {
-    if (threadpool->n_threads_cur == 1) {
-        return;
-    }
-
-    atomic_int * n_barrier = &threadpool->n_barrier;
-    atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
+    int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
 
-    int n_threads = threadpool->n_threads_cur;
-    int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
+    // enter barrier (full seq-cst fence)
+    int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
 
-    if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
+    int last = 0;
+    if (n_barrier == (n_threads - 1)) {
         // last thread
-        atomic_store(n_barrier, 0);
-        atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed);
+        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
+        last = 1;
     } else {
         // wait for other threads
-        while (true) {
-            if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
-                return;
-            }
+        while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
             ggml_thread_cpu_relax();
         }
     }
-}
+
+    // exit barrier (full seq-cst fence)
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
 #endif
+}
 
 // TODO: make this somehow automatically executed
 // some sort of "sentry" mechanism
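
The reworked ggml_barrier is a classic two-counter spin barrier: threads increment an arrival counter, the last arrival resets it and advances a generation counter, and every thread leaves through a seq-cst read-modify-write. Below is a minimal, self-contained C++ sketch of the same pattern; the spin_barrier type and the demo main() are illustrative stand-ins, not ggml's API.

// A self-contained sketch of the two-counter barrier: an arrival counter
// plus a "generation" counter (n_barrier_passed). All names here are
// illustrative, not part of ggml.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct spin_barrier {
    std::atomic<int> n_barrier{0};        // threads arrived in the current round
    std::atomic<int> n_barrier_passed{0}; // completed rounds (generation)

    void wait(int n_threads) {
        int n_passed = n_barrier_passed.load(std::memory_order_relaxed);

        // enter barrier (full seq-cst fence)
        int last = 0;
        if (n_barrier.fetch_add(1, std::memory_order_seq_cst) == n_threads - 1) {
            // last thread: reset the arrival counter for the next round
            n_barrier.store(0, std::memory_order_relaxed);
            last = 1;
        } else {
            // spin until the generation counter advances
            while (n_barrier_passed.load(std::memory_order_relaxed) == n_passed) {
                // ggml inserts a cpu-relax hint here
            }
        }

        // exit barrier (full seq-cst fence): the last thread adds 1, releasing
        // the spinners; everyone else adds 0 -- a dummy RMW that orders memory
        // like a fence while staying visible to thread-sanitizer
        n_barrier_passed.fetch_add(last, std::memory_order_seq_cst);
    }
};

int main() {
    const int n_threads = 4, n_rounds = 1000;
    spin_barrier b;
    std::vector<std::thread> workers;
    for (int t = 0; t < n_threads; t++) {
        workers.emplace_back([&] {
            for (int i = 0; i < n_rounds; i++) {
                b.wait(n_threads); // no thread starts round i+1 before all finish round i
            }
        });
    }
    for (auto & w : workers) { w.join(); }
    std::printf("completed %d rounds with %d threads\n", n_rounds, n_threads);
    return 0;
}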
@@ -20185,64 +20181,84 @@ struct ggml_cplan ggml_graph_plan(
 
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    struct ggml_threadpool    * tp    = state->threadpool;
 
-    const struct ggml_cgraph * cgraph = state->threadpool->cgraph;
-    const struct ggml_cplan  * cplan  = state->threadpool->cplan;
+    const struct ggml_cgraph * cgraph = tp->cgraph;
+    const struct ggml_cplan  * cplan  = tp->cplan;
 
     set_numa_thread_affinity(state->ith);
 
     struct ggml_compute_params params = {
         /*.ith       =*/ state->ith,
-        /*.nth       =*/ state->threadpool->n_threads_cur,
+        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
         /*.wsize     =*/ cplan->work_size,
         /*.wdata     =*/ cplan->work_data,
-        /*.threadpool=*/ state->threadpool,
+        /*.threadpool=*/ tp,
     };
 
-    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
+    for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];
 
         ggml_compute_forward(&params, node);
 
-        if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
-            state->threadpool->ec = GGML_STATUS_ABORTED;
+        if (state->ith == 0 && cplan->abort_callback &&
+            cplan->abort_callback(cplan->abort_callback_data)) {
+            tp->abort = true;
+            tp->ec    = GGML_STATUS_ABORTED;
         }
 
         ggml_barrier(state->threadpool);
-
-        if (state->threadpool->ec != GGML_STATUS_SUCCESS) {
-            break;
-        }
     }
 
     return 0;
 }
 
 #ifndef GGML_USE_OPENMP
 
-static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
+// check if thread is active
+static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
+    return (state->ith < n_threads);
+}
+
+// check if thread is ready to proceed (exit from polling or sleeping)
+static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
     if (state->pending || threadpool->stop || threadpool->pause) { return true; }
 
     // check for new graph/work
     int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
     if (new_graph != state->last_graph) {
-        state->pending    = (state->ith < threadpool->n_threads_cur);
+        state->pending    = ggml_graph_compute_thread_active(state);
         state->last_graph = new_graph;
     }
 
     return state->pending;
 }
 
+// sync thread state after polling
+static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * state) {
+    struct ggml_threadpool * threadpool = state->threadpool;
+    // this should just be atomic_thread_fence(seq_cst) but it confuses thread-sanitizer
+    // so instead we just use a dummy read-modify-write
+    atomic_fetch_add_explicit(&threadpool->n_graph, 0, memory_order_seq_cst);
+}
+
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
     struct ggml_threadpool * threadpool = state->threadpool;
 
+    // Skip polling for unused threads
+    if (!ggml_graph_compute_thread_active(state)) {
+        return state->pending;
+    }
+
     // This seems to make 0 ... 100 a decent range for polling level across modern processors.
     // Perhaps, we can adjust it dynamically based on load and things.
     const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
 
-    for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
+    for (uint64_t i=0; !ggml_graph_compute_thread_ready(state) && i < n_rounds; i++) {
         // No new work. Keep polling.
         ggml_thread_cpu_relax();
     }
@@ -20254,13 +20270,14 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
     struct ggml_threadpool * threadpool = state->threadpool;
 
     if (ggml_graph_compute_poll_for_work(state)) {
+        ggml_graph_compute_thread_sync(state);
         return state->pending;
     }
 
     ggml_mutex_lock_shared(&threadpool->mutex);
-    while (!ggml_graph_compute_ready(state)) {
+    while (!ggml_graph_compute_thread_ready(state)) {
         // No new work. Wait for the signal.
-        GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
+        GGML_PRINT_DEBUG("thread #%d waiting for work (sleeping)\n", state->ith);
         ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
     }
     ggml_mutex_unlock_shared(&threadpool->mutex);
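
Taken together, the thread_active/thread_ready/thread_sync helpers and ggml_graph_compute_check_for_work implement a hybrid wait: threads used by the current graph poll for a bounded number of rounds and then fall back to sleeping on the condition variable, while threads with ith >= n_threads_cur skip the polling loop entirely. Below is a condensed worker-side sketch of that logic; the pool and worker types are simplified stand-ins for ggml_threadpool and ggml_compute_state, not ggml's actual structs.

// Worker-side sketch of the hybrid poll/sleep wait (illustrative types).
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <mutex>

struct pool {
    std::atomic<int>  n_graph{0};       // generation counter, bumped per kickoff
    std::atomic<int>  n_threads_cur{1}; // threads used by the current graph
    std::atomic<bool> stop{false};
    std::mutex              mutex;
    std::condition_variable cond;
};

struct worker {
    pool * tp;
    int    ith;            // this thread's index in the pool
    int    last_graph = 0; // last generation this thread has seen
    bool   pending    = false;

    // threads with ith >= n_threads_cur are unused for the current graph
    bool active() const {
        return ith < tp->n_threads_cur.load(std::memory_order_relaxed);
    }

    bool ready() {
        if (pending || tp->stop.load(std::memory_order_relaxed)) { return true; }
        int g = tp->n_graph.load(std::memory_order_relaxed);
        if (g != last_graph) {
            pending    = active(); // unused threads do not pick up the new graph
            last_graph = g;
        }
        return pending;
    }

    bool check_for_work(uint64_t poll_rounds) {
        // 1) bounded polling -- skipped entirely for unused threads
        if (active()) {
            for (uint64_t i = 0; !ready() && i < poll_rounds; i++) { /* spin */ }
        }
        if (ready()) {
            // dummy seq-cst RMW in place of atomic_thread_fence: pairs with
            // the seq-cst bump of n_graph in kickoff (see the next sketch)
            tp->n_graph.fetch_add(0, std::memory_order_seq_cst);
            return pending;
        }
        // 2) sleep until kickoff signals the condition variable
        std::unique_lock<std::mutex> lock(tp->mutex);
        tp->cond.wait(lock, [&] { return ready(); });
        return pending;
    }
};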
@@ -20307,13 +20324,20 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
 }
 
 // Start processing new graph
-static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool)
+static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int n_threads)
 {
-    // always take the mutex here because the worker threads are doing hybrid poll/wait
+    // Always take the mutex here because the worker threads are doing hybrid poll/wait
 
     ggml_mutex_lock(&threadpool->mutex);
 
-    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
+    GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
+
+    // Update the number of active threads
+    atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
+
+    // Indicate the graph is ready to be processed
+    // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
+    atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
 
     if (threadpool->pause) {
         // Update main thread prio and affinity to match the threadpool settings
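
The kickoff path is the publisher half of the handshake sketched above: it stores the new active-thread count with a relaxed store, then bumps the n_graph generation with a seq-cst read-modify-write, so a polling worker whose dummy fetch_add(0) on n_graph observes the bump also observes the new n_threads_cur. The sketch below continues the previous block (same illustrative pool/worker types, same translation unit) and adds a tiny demo main().

// Publisher-side sketch, pairing with worker::check_for_work above.
#include <thread>

void kickoff(pool & tp, int n_threads) {
    // take the mutex because workers may be asleep in cond.wait()
    std::lock_guard<std::mutex> lock(tp.mutex);

    // publish the active-thread count first; relaxed is enough because it is
    // sequenced before the seq-cst RMW below
    tp.n_threads_cur.store(n_threads, std::memory_order_relaxed);

    // bump the generation with a full seq-cst RMW; a worker whose dummy
    // fetch_add(0) on n_graph reads this (or a later) value synchronizes with
    // it and therefore also sees the n_threads_cur store above
    tp.n_graph.fetch_add(1, std::memory_order_seq_cst);

    tp.cond.notify_all(); // wake sleeping workers
}

// Tiny demo: one worker waits for work, kickoff wakes it, stop shuts it down.
int main() {
    pool tp;
    worker w{&tp, /*ith=*/0};

    std::thread t([&] {
        while (true) {
            if (!w.check_for_work(/*poll_rounds=*/1024)) {
                if (tp.stop.load(std::memory_order_relaxed)) { break; }
                continue;
            }
            w.pending = false; // "run" the new graph here
        }
    });

    kickoff(tp, /*n_threads=*/1);

    {
        std::lock_guard<std::mutex> lock(tp.mutex);
        tp.stop.store(true, std::memory_order_relaxed);
    }
    tp.cond.notify_all();
    t.join();
    return 0;
}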
@@ -20372,6 +20396,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->current_chunk    = 0;
     threadpool->stop             = false;
     threadpool->pause            = tpp->paused;
+    threadpool->abort            = false;
     threadpool->workers          = NULL;
     threadpool->n_threads_max    = tpp->n_threads;
     threadpool->n_threads_cur    = tpp->n_threads;
@@ -20447,15 +20472,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         // No worker threads should be accessing the parameters below at this stage
         threadpool->cgraph           = cgraph;
         threadpool->cplan            = cplan;
-        threadpool->n_threads_cur    = n_threads;
         threadpool->current_chunk    = 0;
+        threadpool->abort            = false;
         threadpool->ec               = GGML_STATUS_SUCCESS;
     }
 
-    if (n_threads > threadpool->n_threads_max) {
-        GGML_PRINT("WARNING: cplan is requesting more threads than the threadpool contains. Expect a bad time!\n");
-    }
-
 #ifdef GGML_USE_OPENMP
     if (n_threads > 1) {
         #pragma omp parallel num_threads(n_threads)
@@ -20464,7 +20485,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
             {
                 // update the number of threads from the actual number of threads that we got from OpenMP
                 n_threads = omp_get_num_threads();
-                threadpool->n_threads_cur = n_threads;
+                atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
             }
 
             ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
@@ -20474,8 +20495,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         ggml_graph_compute_thread(&threadpool->workers[0]);
     }
 #else
+    if (n_threads > threadpool->n_threads_max) {
+        GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
+        n_threads = threadpool->n_threads_max;
+    }
+
     // Kick all threads to start the new graph
-    ggml_graph_compute_kickoff(threadpool);
+    ggml_graph_compute_kickoff(threadpool, n_threads);
 
     // This is a work thread too
     ggml_graph_compute_thread(&threadpool->workers[0]);

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -119,6 +119,7 @@ llama_target_and_test(test-grammar-parser.cpp)
 llama_target_and_test(test-llama-grammar.cpp)
 llama_target_and_test(test-grammar-integration.cpp)
 llama_target_and_test(test-grad0.cpp)
+llama_target_and_test(test-barrier.cpp)
 # llama_target_and_test(test-opt.cpp) # SLOW
 llama_target_and_test(test-backend-ops.cpp)

tests/test-barrier.cpp

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <chrono>
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <vector>
+
+#define MAX_NARGS 2
+
+int main(int argc, char *argv[]) {
+
+    int n_threads = 4;
+    int n_rounds  = 100;
+
+    if (argc > 1) {
+        n_threads = std::atoi(argv[1]);
+    }
+
+    if (argc > 2) {
+        n_rounds  = std::atoi(argv[2]);
+    }
+
+    struct ggml_init_params params = {
+        /* .mem_size   = */ 1024*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ false,
+    };
+
+    struct ggml_context * ctx = ggml_init(params);
+
+    // Create graph
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+
+    // Lots of small, parallel ops where barriers in between will dominate
+    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
+    for (int i = 0; i < 1000; i++) {
+        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
+        out = ggml_mul_mat(ctx, a, out);
+
+        struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
+        out = ggml_mul_mat(ctx, d, out);
+    }
+
+    ggml_build_forward_expand(gf, out);
+    int n_nodes = ggml_graph_n_nodes(gf);
+
+    // Create threadpool
+    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
+    struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
+        exit(1);
+    }
+
+    // Create compute plan
+    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
+
+    std::vector<uint8_t> work_data(cplan.work_size);
+    cplan.work_data = work_data.data();
+
+    std::cerr << "graph-compute with"
+              << "\n    n_threads: " << n_threads
+              << "\n      n_nodes: " << n_nodes
+              << "\n     n_rounds: " << n_rounds
+              << "\n";
+    // ggml_graph_print(gf);
+
+    // Warmup
+    ggml_graph_compute(gf, &cplan);
+
+    auto t0 = std::chrono::high_resolution_clock::now();
+
+    for (int i=0; i < n_rounds; i++) {
+        ggml_graph_compute(gf, &cplan);
+    }
+
+    auto t1 = std::chrono::high_resolution_clock::now();
+
+    auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
+    auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
+    std::cerr << "graph-compute took " << usec << " usec "
+              << "\n    " << (float) usec / n_rounds << " usec per-iter"
+              << "\n    " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
+              << "\n";
+
+    ggml_threadpool_free(threadpool);
+    ggml_free(ctx);
+
+    return 0;
+}
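
Assuming the standard llama.cpp CMake build, the benchmark would be run either directly (the binary path depends on the generator) or via ctest, for example:

    ./bin/test-barrier 8 200   # argv[1] = n_threads (default 4), argv[2] = n_rounds (default 100)
    ctest -R test-barrier

It prints the total wall time plus per-iteration and per-node averages, which makes barrier overhead directly visible as the thread count grows.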
