Restrict threadpool to CPU backend

fmz · fmz · commit cbab212a322b · 2024-05-28T12:35:20.000-07:00
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
@@ -1915,7 +1915,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 #endif
 
-    ggml_backend_graph_compute(ctx->backend, gf, NULL);
+    ggml_backend_graph_compute(ctx->backend, gf);
 
     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
@@ -92,14 +92,14 @@ extern "C" {
         void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan (not used currently)
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph, ggml_compute_threadpool_t  threadpool);
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph with a plan
         enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph without a plan (async)
-        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph, ggml_compute_threadpool_t threadpool);
+        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
diff --git a/ggml-backend.c b/ggml-backend.c
@@ -255,13 +255,12 @@ void ggml_backend_synchronize(ggml_backend_t backend) {
 }
 
 ggml_backend_graph_plan_t ggml_backend_graph_plan_create(
-               ggml_backend_t   backend,
-     const struct ggml_cgraph * cgraph,
-    ggml_compute_threadpool_t   threadpool
+              ggml_backend_t   backend,
+    const struct ggml_cgraph * cgraph
 ) {
     GGML_ASSERT(backend->iface.graph_plan_create != NULL);
 
-    return backend->iface.graph_plan_create(backend, cgraph, threadpool);
+    return backend->iface.graph_plan_create(backend, cgraph);
 }
 
 void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
@@ -281,20 +280,18 @@ enum ggml_status ggml_backend_graph_plan_compute(
 
 enum ggml_status ggml_backend_graph_compute(
                ggml_backend_t   backend,
-           struct ggml_cgraph * cgraph,
-    ggml_compute_threadpool_t   threadpool
+           struct ggml_cgraph * cgraph
 ) {
-    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph, threadpool);
+    enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
     ggml_backend_synchronize(backend);
     return err;
 }
 
 enum ggml_status ggml_backend_graph_compute_async(
-               ggml_backend_t   backend,
-           struct ggml_cgraph * cgraph,
-    ggml_compute_threadpool_t   threadpool
+        ggml_backend_t   backend,
+    struct ggml_cgraph * cgraph
 ) {
-    return backend->iface.graph_compute(backend, cgraph, threadpool);
+    return backend->iface.graph_compute(backend, cgraph);
 }
 
 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -741,7 +738,9 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 #endif
 
 struct ggml_backend_cpu_context {
-    int n_threads;
+    int                       n_threads;
+    ggml_compute_threadpool_t threadpool;
+
     void * work_data;
     size_t work_size;
 
@@ -774,15 +773,14 @@ struct ggml_backend_plan_cpu {
 };
 
 GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(
-               ggml_backend_t   backend,
-     const struct ggml_cgraph * cgraph,
-    ggml_compute_threadpool_t   threadpool
+              ggml_backend_t   backend,
+    const struct ggml_cgraph * cgraph
 ) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool);
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
@@ -817,13 +815,12 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
 }
 
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(
-               ggml_backend_t   backend,
-           struct ggml_cgraph * cgraph,
-    ggml_compute_threadpool_t   threadpool
+        ggml_backend_t   backend,
+    struct ggml_cgraph * cgraph
 ) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, threadpool);
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
     if (cpu_ctx->work_size < cplan.work_size) {
         free(cpu_ctx->work_data);
@@ -892,6 +889,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     }
 
     ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->threadpool          = NULL;
     ctx->work_data           = NULL;
     ctx->work_size           = 0;
     ctx->abort_callback      = NULL;
@@ -922,6 +920,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->threadpool = threadpool;
+}
+
 void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
     GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
 
@@ -1653,10 +1658,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
     return true;
 }
 
-static enum ggml_status ggml_backend_sched_compute_splits(
-         ggml_backend_sched_t sched,
-    ggml_compute_threadpool_t threadpool
-) {
+static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
     struct ggml_backend_sched_split * splits = sched->splits;
 
     for (int i = 0; i < sched->n_splits; i++) {
@@ -1690,7 +1692,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(
         }
 
         if (!sched->callback_eval) {
-            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph, threadpool);
+            enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
             if (ec != GGML_STATUS_SUCCESS) {
                 return ec;
             }
@@ -1712,7 +1714,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(
 
                 struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
 
-                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv, threadpool);
+                enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
                 if (ec != GGML_STATUS_SUCCESS) {
                     return ec;
                 }
@@ -1852,19 +1854,17 @@ bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgra
 }
 
 enum ggml_status ggml_backend_sched_graph_compute(
-         ggml_backend_sched_t   sched,
-           struct ggml_cgraph * graph,
-    ggml_compute_threadpool_t   threadpool
+    ggml_backend_sched_t   sched,
+      struct ggml_cgraph * graph
 ) {
-    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph, threadpool);
+    enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
     ggml_backend_sched_synchronize(sched);
     return err;
 }
 
 enum ggml_status ggml_backend_sched_graph_compute_async(
-         ggml_backend_sched_t   sched,
-           struct ggml_cgraph * graph,
-    ggml_compute_threadpool_t   threadpool
+    ggml_backend_sched_t   sched,
+      struct ggml_cgraph * graph
 ) {
     if (!sched->is_reset && !sched->is_alloc) {
         ggml_backend_sched_reset(sched);
@@ -1876,7 +1876,7 @@ enum ggml_status ggml_backend_sched_graph_compute_async(
         }
     }
 
-    return ggml_backend_sched_compute_splits(sched, threadpool);
+    return ggml_backend_sched_compute_splits(sched);
 }
 
 void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
@@ -2115,8 +2115,8 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
         struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
         struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
 
-        ggml_backend_graph_compute(backend1, &g1v, NULL);
-        ggml_backend_graph_compute(backend2, &g2v, NULL);
+        ggml_backend_graph_compute(backend1, &g1v);
+        ggml_backend_graph_compute(backend2, &g2v);
 
         if (ggml_is_view_op(t1->op)) {
             continue;
diff --git a/ggml-backend.h b/ggml-backend.h
@@ -69,8 +69,7 @@ extern "C" {
 
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(
                    ggml_backend_t   backend,
-         const struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t   threadpool);
+         const struct ggml_cgraph * cgraph);
 
     GGML_API void ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
@@ -79,12 +78,10 @@ extern "C" {
         ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_compute(
                    ggml_backend_t   backend,
-               struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t   threadpool);
+               struct ggml_cgraph * cgraph);
     GGML_API enum ggml_status ggml_backend_graph_compute_async(
                    ggml_backend_t   backend,
-               struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t   threadpool);
+               struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
     GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
 
@@ -112,6 +109,7 @@ extern "C" {
 
     GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
     GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool);
     GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
@@ -205,8 +203,8 @@ extern "C" {
 
     // Allocate and compute graph on the backend scheduler
     GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool);
-    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph, ggml_compute_threadpool_t threadpool);
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
     GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
 
     // Reset all assignments and allocators - must be called before changing the node backends
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -2495,13 +2495,9 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     return true;
 }
 
-GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(
-                   ggml_backend_t   backend,
-                      ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t   threadpool) {
-
-    GGML_UNUSED(threadpool);
+GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
     ggml_cuda_set_device(cuda_ctx->device);
 
 #ifdef USE_CUDA_GRAPH
diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
@@ -1948,12 +1948,7 @@ static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(g
     return ggml_backend_kompute_buffer_type(ctx->device);
 }
 
-static ggml_status ggml_backend_kompute_graph_compute(
-                  ggml_backend_t   backend,
-              struct ggml_cgraph * cgraph
-       ggml_compute_threadpool_t   threadpool) {
-
-    GGML_UNUSED(threadpool);
+static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
     ggml_vk_graph_compute(ctx, cgraph);
     return GGML_STATUS_SUCCESS;
diff --git a/ggml-metal.m b/ggml-metal.m
@@ -3103,12 +3103,7 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
     UNUSED(backend);
 }
 
-GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(
-                   ggml_backend_t   backend,
-               struct ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t   threadpool) {
-
-    UNUSED(threadpool);
+GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
     return ggml_metal_graph_compute(metal_ctx, cgraph);
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
@@ -2235,12 +2235,7 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
     GGML_UNUSED(backend);
 }
 
-static ggml_status ggml_backend_opencl_graph_compute(
-                   ggml_backend_t   backend,
-                      ggml_cgraph * graph,
-        ggml_compute_threadpool_t   threadpool) {
-
-    GGML_UNUSED(threadpool);
+static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
     for (int i = 0; i < graph->n_nodes; ++i) {
         ggml_tensor * node = graph->nodes[i];
 
diff --git a/ggml-rpc.cpp b/ggml-rpc.cpp
@@ -585,8 +585,7 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
     memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
 }
 
-GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph, ggml_compute_threadpool * tp) {
-    UNUSED(tp);
+GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
     std::vector<uint8_t> input;
     serialize_graph(cgraph, input);
@@ -1021,7 +1020,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
     for (uint32_t i = 0; i < n_nodes; i++) {
         graph->nodes[i] = create_node(nodes[i], ctx, tensor_ptrs, tensor_map);
     }
-    ggml_status status = ggml_backend_graph_compute(backend, graph, NULL);
+    ggml_status status = ggml_backend_graph_compute(backend, graph);
     // output serialization format: | status (1 byte) |
     output.resize(1, 0);
     output[0] = status;
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
@@ -17022,13 +17022,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(
-                   ggml_backend_t   backend,
-                      ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t   threadpool) {
-
-    GGML_UNUSED(threadpool);
-
+GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     ggml_sycl_set_main_device(sycl_ctx->device);
 
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
@@ -6225,12 +6225,7 @@ static bool ggml_vk_is_empty(ggml_tensor * node) {
     return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
 }
 
-GGML_CALL static ggml_status ggml_backend_vk_graph_compute(
-                   ggml_backend_t   backend,
-                      ggml_cgraph * cgraph,
-        ggml_compute_threadpool_t   threadpool) {
-
-    GGML_UNUSED(threadpool);
+GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
 #endif
diff --git a/ggml.c b/ggml.c
@@ -19501,7 +19501,7 @@ static void __cpumask_next(const bool * global_mask, bool * local_mask, bool str
         int32_t base_idx = *iter;
         for (int32_t i = 0; i < GGML_N_CORES_MAX; i++) {
             int32_t idx = base_idx + i;
-            if (idx > GGML_N_CORES_MAX) {
+            if (idx >= GGML_N_CORES_MAX) {
                 // Just a cheaper modulo
                 idx -= GGML_N_CORES_MAX;
             }
diff --git a/ggml.h b/ggml.h
@@ -2051,7 +2051,7 @@ extern "C" {
                   const struct ggml_cgraph * cgraph,
                                        int   n_threads,
             struct ggml_compute_threadpool * threadpool);
-    GGML_API enum ggml_status  ggml_graph_compute         (      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
     GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
diff --git a/llama.cpp b/llama.cpp
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

Original file line number	Diff line number	Diff line change
`@@ -1915,7 +1915,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima`
`1915`	`1915`	`}`
`1916`	`1916`	`#endif`
`1917`	`1917`
`1918`		`- ggml_backend_graph_compute(ctx->backend, gf, NULL);`
	`1918`	`+ ggml_backend_graph_compute(ctx->backend, gf);`
`1919`	`1919`
`1920`	`1920`	`// the last node is the embedding tensor`
`1921`	`1921`	`struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];`
Original file line number	Diff line number	Diff line change
`@@ -19501,7 +19501,7 @@ static void __cpumask_next(const bool * global_mask, bool * local_mask, bool str`
`19501`	`19501`	`int32_t base_idx = *iter;`
`19502`	`19502`	`for (int32_t i = 0; i < GGML_N_CORES_MAX; i++) {`
`19503`	`19503`	`int32_t idx = base_idx + i;`
`19504`		`- if (idx > GGML_N_CORES_MAX) {`
	`19504`	`+ if (idx >= GGML_N_CORES_MAX) {`
`19505`	`19505`	`// Just a cheaper modulo`
`19506`	`19506`	`idx -= GGML_N_CORES_MAX;`
`19507`	`19507`	`}`