Skip to content

Commit cbab212

Browse files
author
fmz
committed
Restrict threadpool to CPU backend
1 parent 1d9d39a commit cbab212

15 files changed

+61
-93
lines changed

examples/llava/clip.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1915,7 +1915,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
19151915
}
19161916
#endif
19171917

1918-
ggml_backend_graph_compute(ctx->backend, gf, NULL);
1918+
ggml_backend_graph_compute(ctx->backend, gf);
19191919

19201920
// the last node is the embedding tensor
19211921
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];

ggml-backend-impl.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,14 +92,14 @@ extern "C" {
9292
void (*GGML_CALL synchronize)(ggml_backend_t backend);
9393

9494
// compute graph with a plan (not used currently)
95-
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool);
95+
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
9696
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
9797

9898
// compute graph with a plan
9999
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
100100

101101
// compute graph without a plan (async)
102-
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool);
102+
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
103103

104104
// check if the backend supports an operation
105105
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

ggml-backend.c

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -255,13 +255,12 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
255255
}
256256

257257
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(
258-
ggml_backend_t backend,
259-
const struct ggml_cgraph * cgraph,
260-
ggml_compute_threadpool_t threadpool
258+
ggml_backend_t backend,
259+
const struct ggml_cgraph * cgraph
261260
) {
262261
GGML_ASSERT(backend->iface.graph_plan_create != NULL);
263262

264-
return backend->iface.graph_plan_create(backend, cgraph, threadpool);
263+
return backend->iface.graph_plan_create(backend, cgraph);
265264
}
266265

267266
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
@@ -281,20 +280,18 @@ enum ggml_status ggml_backend_graph_plan_compute(
281280

282281
enum ggml_status ggml_backend_graph_compute(
283282
ggml_backend_t backend,
284-
struct ggml_cgraph * cgraph,
285-
ggml_compute_threadpool_t threadpool
283+
struct ggml_cgraph * cgraph
286284
) {
287-
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, threadpool);
285+
enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
288286
ggml_backend_synchronize(backend);
289287
return err;
290288
}
291289

292290
enum ggml_status ggml_backend_graph_compute_async(
293-
ggml_backend_t backend,
294-
struct ggml_cgraph * cgraph,
295-
ggml_compute_threadpool_t threadpool
291+
ggml_backend_t backend,
292+
struct ggml_cgraph * cgraph
296293
) {
297-
return backend->iface.graph_compute(backend, cgraph, threadpool);
294+
return backend->iface.graph_compute(backend, cgraph);
298295
}
299296

300297
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -741,7 +738,9 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
741738
#endif
742739

743740
struct ggml_backend_cpu_context {
744-
int n_threads;
741+
int n_threads;
742+
ggml_compute_threadpool_t threadpool;
743+
745744
void * work_data;
746745
size_t work_size;
747746

@@ -774,15 +773,14 @@ struct ggml_backend_plan_cpu {
774773
};
775774

776775
GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(
777-
ggml_backend_t backend,
778-
const struct ggml_cgraph * cgraph,
779-
ggml_compute_threadpool_t threadpool
776+
ggml_backend_t backend,
777+
const struct ggml_cgraph * cgraph
780778
) {
781779
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
782780

783781
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
784782

785-
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool);
783+
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
786784
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
787785

788786
if (cpu_plan->cplan.work_size > 0) {
@@ -817,13 +815,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
817815
}
818816

819817
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(
820-
ggml_backend_t backend,
821-
struct ggml_cgraph * cgraph,
822-
ggml_compute_threadpool_t threadpool
818+
ggml_backend_t backend,
819+
struct ggml_cgraph * cgraph
823820
) {
824821
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
825822

826-
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool);
823+
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
827824

828825
if (cpu_ctx->work_size < cplan.work_size) {
829826
free(cpu_ctx->work_data);
@@ -892,6 +889,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
892889
}
893890

894891
ctx->n_threads = GGML_DEFAULT_N_THREADS;
892+
ctx->threadpool = NULL;
895893
ctx->work_data = NULL;
896894
ctx->work_size = 0;
897895
ctx->abort_callback = NULL;
@@ -922,6 +920,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
922920
ctx->n_threads = n_threads;
923921
}
924922

923+
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) {
924+
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
925+
926+
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
927+
ctx->threadpool = threadpool;
928+
}
929+
925930
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
926931
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
927932

@@ -1653,10 +1658,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
16531658
return true;
16541659
}
16551660

1656-
static enum ggml_status ggml_backend_sched_compute_splits(
1657-
ggml_backend_sched_t sched,
1658-
ggml_compute_threadpool_t threadpool
1659-
) {
1661+
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
16601662
struct ggml_backend_sched_split * splits = sched->splits;
16611663

16621664
for (int i = 0; i < sched->n_splits; i++) {
@@ -1690,7 +1692,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(
16901692
}
16911693

16921694
if (!sched->callback_eval) {
1693-
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, threadpool);
1695+
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
16941696
if (ec != GGML_STATUS_SUCCESS) {
16951697
return ec;
16961698
}
@@ -1712,7 +1714,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(
17121714

17131715
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
17141716

1715-
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, threadpool);
1717+
enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
17161718
if (ec != GGML_STATUS_SUCCESS) {
17171719
return ec;
17181720
}
@@ -1852,19 +1854,17 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
18521854
}
18531855

18541856
enum ggml_status ggml_backend_sched_graph_compute(
1855-
ggml_backend_sched_t sched,
1856-
struct ggml_cgraph * graph,
1857-
ggml_compute_threadpool_t threadpool
1857+
ggml_backend_sched_t sched,
1858+
struct ggml_cgraph * graph
18581859
) {
1859-
enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph, threadpool);
1860+
enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
18601861
ggml_backend_sched_synchronize(sched);
18611862
return err;
18621863
}
18631864

18641865
enum ggml_status ggml_backend_sched_graph_compute_async(
1865-
ggml_backend_sched_t sched,
1866-
struct ggml_cgraph * graph,
1867-
ggml_compute_threadpool_t threadpool
1866+
ggml_backend_sched_t sched,
1867+
struct ggml_cgraph * graph
18681868
) {
18691869
if (!sched->is_reset && !sched->is_alloc) {
18701870
ggml_backend_sched_reset(sched);
@@ -1876,7 +1876,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(
18761876
}
18771877
}
18781878

1879-
return ggml_backend_sched_compute_splits(sched, threadpool);
1879+
return ggml_backend_sched_compute_splits(sched);
18801880
}
18811881

18821882
void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
@@ -2115,8 +2115,8 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
21152115
struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
21162116
struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
21172117

2118-
ggml_backend_graph_compute(backend1, &g1v, NULL);
2119-
ggml_backend_graph_compute(backend2, &g2v, NULL);
2118+
ggml_backend_graph_compute(backend1, &g1v);
2119+
ggml_backend_graph_compute(backend2, &g2v);
21202120

21212121
if (ggml_is_view_op(t1->op)) {
21222122
continue;

ggml-backend.h

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,7 @@ extern "C" {
6969

7070
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(
7171
ggml_backend_t backend,
72-
const struct ggml_cgraph * cgraph,
73-
ggml_compute_threadpool_t threadpool);
72+
const struct ggml_cgraph * cgraph);
7473

7574
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
7675

@@ -79,12 +78,10 @@ extern "C" {
7978
ggml_backend_graph_plan_t plan);
8079
GGML_API enum ggml_status ggml_backend_graph_compute(
8180
ggml_backend_t backend,
82-
struct ggml_cgraph * cgraph,
83-
ggml_compute_threadpool_t threadpool);
81+
struct ggml_cgraph * cgraph);
8482
GGML_API enum ggml_status ggml_backend_graph_compute_async(
8583
ggml_backend_t backend,
86-
struct ggml_cgraph * cgraph,
87-
ggml_compute_threadpool_t threadpool);
84+
struct ggml_cgraph * cgraph);
8885
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
8986
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
9087

@@ -112,6 +109,7 @@ extern "C" {
112109

113110
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
114111
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
112+
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool);
115113
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
116114

117115
// Create a backend buffer from an existing pointer
@@ -205,8 +203,8 @@ extern "C" {
205203

206204
// Allocate and compute graph on the backend scheduler
207205
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
208-
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool);
209-
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool);
206+
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
207+
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
210208
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
211209

212210
// Reset all assignments and allocators - must be called before changing the node backends

ggml-cuda.cu

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2495,13 +2495,9 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
24952495
return true;
24962496
}
24972497

2498-
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(
2499-
ggml_backend_t backend,
2500-
ggml_cgraph * cgraph,
2501-
ggml_compute_threadpool_t threadpool) {
2502-
2503-
GGML_UNUSED(threadpool);
2498+
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
25042499
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2500+
25052501
ggml_cuda_set_device(cuda_ctx->device);
25062502

25072503
#ifdef USE_CUDA_GRAPH

ggml-kompute.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1948,12 +1948,7 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
19481948
return ggml_backend_kompute_buffer_type(ctx->device);
19491949
}
19501950

1951-
static ggml_status ggml_backend_kompute_graph_compute(
1952-
ggml_backend_t backend,
1953-
struct ggml_cgraph * cgraph
1954-
ggml_compute_threadpool_t threadpool) {
1955-
1956-
GGML_UNUSED(threadpool);
1951+
static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
19571952
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
19581953
ggml_vk_graph_compute(ctx, cgraph);
19591954
return GGML_STATUS_SUCCESS;

ggml-metal.m

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3103,12 +3103,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
31033103
UNUSED(backend);
31043104
}
31053105

3106-
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(
3107-
ggml_backend_t backend,
3108-
struct ggml_cgraph * cgraph,
3109-
ggml_compute_threadpool_t threadpool) {
3110-
3111-
UNUSED(threadpool);
3106+
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
31123107
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
31133108

31143109
return ggml_metal_graph_compute(metal_ctx, cgraph);

ggml-opencl.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2235,12 +2235,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
22352235
GGML_UNUSED(backend);
22362236
}
22372237

2238-
static ggml_status ggml_backend_opencl_graph_compute(
2239-
ggml_backend_t backend,
2240-
ggml_cgraph * graph,
2241-
ggml_compute_threadpool_t threadpool) {
2242-
2243-
GGML_UNUSED(threadpool);
2238+
static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
22442239
for (int i = 0; i < graph->n_nodes; ++i) {
22452240
ggml_tensor * node = graph->nodes[i];
22462241

ggml-rpc.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -585,8 +585,7 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
585585
memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
586586
}
587587

588-
GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, ggml_compute_threadpool * tp) {
589-
UNUSED(tp);
588+
GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
590589
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
591590
std::vector<uint8_t> input;
592591
serialize_graph(cgraph, input);
@@ -1021,7 +1020,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
10211020
for (uint32_t i = 0; i < n_nodes; i++) {
10221021
graph->nodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map);
10231022
}
1024-
ggml_status status = ggml_backend_graph_compute(backend, graph, NULL);
1023+
ggml_status status = ggml_backend_graph_compute(backend, graph);
10251024
// output serialization format: | status (1 byte) |
10261025
output.resize(1, 0);
10271026
output[0] = status;

ggml-sycl.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17022,13 +17022,7 @@ catch (sycl::exception const &exc) {
1702217022
std::exit(1);
1702317023
}
1702417024

17025-
GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(
17026-
ggml_backend_t backend,
17027-
ggml_cgraph * cgraph,
17028-
ggml_compute_threadpool_t threadpool) {
17029-
17030-
GGML_UNUSED(threadpool);
17031-
17025+
GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
1703217026
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
1703317027
ggml_sycl_set_main_device(sycl_ctx->device);
1703417028

ggml-vulkan.cpp

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6225,12 +6225,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
62256225
return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
62266226
}
62276227

6228-
GGML_CALL static ggml_status ggml_backend_vk_graph_compute(
6229-
ggml_backend_t backend,
6230-
ggml_cgraph * cgraph,
6231-
ggml_compute_threadpool_t threadpool) {
6232-
6233-
GGML_UNUSED(threadpool);
6228+
GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
62346229
#ifdef GGML_VULKAN_DEBUG
62356230
std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
62366231
#endif

ggml.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19501,7 +19501,7 @@ static void __cpumask_next(const bool * global_mask, bool * local_mask, bool str
1950119501
int32_t base_idx = *iter;
1950219502
for (int32_t i = 0; i < GGML_N_CORES_MAX; i++) {
1950319503
int32_t idx = base_idx + i;
19504-
if (idx > GGML_N_CORES_MAX) {
19504+
if (idx >= GGML_N_CORES_MAX) {
1950519505
// Just a cheaper modulo
1950619506
idx -= GGML_N_CORES_MAX;
1950719507
}

ggml.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2051,7 +2051,7 @@ extern "C" {
20512051
const struct ggml_cgraph * cgraph,
20522052
int n_threads,
20532053
struct ggml_compute_threadpool * threadpool);
2054-
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
2054+
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
20552055
// same as ggml_graph_compute() but the work data is allocated as a part of the context
20562056
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
20572057
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

0 commit comments

Comments
 (0)