@@ -19334,8 +19334,12 @@ typedef int ggml_lock_t;

#endif

+#ifdef GGML_NO_OMP
+
+
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__gnu_linux__)
+
static void set_numa_thread_affinity(int thread_n) {
    if (!ggml_is_numa()) {
        return;
@@ -19401,11 +19405,16 @@ static void clear_numa_thread_affinity(void) {

    CPU_FREE(cpus);
}
+
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
static void clear_numa_thread_affinity(void) {}
+
+#endif
+
+
#endif

static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -19713,7 +19722,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

    const int n_threads = state->shared->n_threads;

+#ifdef GGML_NO_OMP
    set_numa_thread_affinity(state->ith);
+#endif

    int node_n = -1;
    int task_phase = GGML_TASK_TYPE_FINALIZE;
@@ -20086,44 +20097,50 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
    };
    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);

-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd = 0,
-                .ith = j,
-                .shared = &state_shared,
-                .ec = GGML_STATUS_SUCCESS,
-            };
+    const int64_t perf_start_cycles = ggml_perf_cycles();
+    const int64_t perf_start_time_us = ggml_perf_time_us();

+    /* Loop is reversed as in the NO_OMP case we want threads to start
+       before the main thread (j==0) */
+    #pragma omp parallel for shared(workers,state_shared)
+    for (int j = n_threads - 1; 0 <= j; j--) {
+        workers[j] = (struct ggml_compute_state) {
+            .ith = j,
+            .shared = &state_shared,
+            .ec = GGML_STATUS_SUCCESS,
+        };
+
+#ifdef GGML_NO_OMP
+        if(j == 0)
+        {
+            /* No need to spawn a thread for main */
+            ggml_graph_compute_thread(&workers[j]);
+        }
+        else
+        {
            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
            GGML_ASSERT(rc == 0);
            UNUSED(rc);
        }
+#else
+        ggml_graph_compute_thread(&workers[j]);
+#endif
    }

-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
-    const int64_t perf_start_cycles = ggml_perf_cycles();
-    const int64_t perf_start_time_us = ggml_perf_time_us();
+#ifdef GGML_NO_OMP
+    clear_numa_thread_affinity();
+#endif

-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
    enum ggml_status compute_status = workers[0].ec;

-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
    // join or kill thread pool
-    if (n_threads > 1 ) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
+    for (int j = 1; j < n_threads; j++ ) {
+#ifdef GGML_NO_OMP
+        const int rc = ggml_thread_join(workers[j].thrd, NULL);
+        GGML_ASSERT(rc == 0);
+#endif
+        if ( workers[j].ec != GGML_STATUS_SUCCESS)
+            compute_status = workers[j].ec;
    }

    // performance stats (graph)
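
For context, the dispatch pattern this patch introduces in ggml_graph_compute() can be sketched in isolation. The snippet below is not part of the patch: the macro NO_OMP, the worker() function and N_WORKERS are placeholders standing in for GGML_NO_OMP, ggml_graph_compute_thread() and n_threads. It is a minimal sketch of the idea only: let an OpenMP parallel-for both create and join the worker team, and fall back to explicit pthread create/join when OpenMP is not used.

#include <stdio.h>

#ifdef NO_OMP
#include <pthread.h>
#endif

#define N_WORKERS 4

/* stand-in for ggml_graph_compute_thread(): each worker just reports its id */
static void * worker(void * arg) {
    const int id = *(const int *) arg;
    printf("worker %d running\n", id);
    return NULL;
}

int main(void) {
    int ids[N_WORKERS];

#ifdef NO_OMP
    /* fallback path: spawn explicit threads for ids 1..N-1, run id 0 on the
       main thread, then join -- mirroring the GGML_NO_OMP branches above */
    pthread_t thrd[N_WORKERS];
    for (int j = N_WORKERS - 1; 0 <= j; j--) {
        ids[j] = j;
        if (j == 0) {
            worker(&ids[j]);   /* main thread is a worker too */
        } else {
            pthread_create(&thrd[j], NULL, worker, &ids[j]);
        }
    }
    for (int j = 1; j < N_WORKERS; j++) {
        pthread_join(thrd[j], NULL);
    }
#else
    /* OpenMP path: the parallel-for creates the team and runs the work; the
       implicit barrier at the end of the loop replaces the explicit joins */
    #pragma omp parallel for
    for (int j = 0; j < N_WORKERS; j++) {
        ids[j] = j;
        worker(&ids[j]);
    }
#endif

    return 0;
}

Assuming a typical toolchain, building with -fopenmp exercises the OpenMP path, while -DNO_OMP -pthread exercises the pthread fallback; in the OpenMP case the implicit barrier at the end of the parallel loop is what makes a separate join loop unnecessary.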