@@ -7810,6 +7810,9 @@ static void ggml_compute_forward_acc_f32(
     bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
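
This hunk and the similar ones below all apply the same fix: with the scheduler change at the end of this patch, GGML_TASK_INIT is entered by every worker thread instead of only thread 0, so ops whose INIT work is inherently serial (a memcpy/memset of a shared buffer) must bail out on all threads but the first. A minimal sketch of the pattern, with a hypothetical op body:

    // INIT-guard pattern: INIT now runs on all threads, so serial setup
    // must execute on thread 0 only; the rest fall through to the barrier.
    if (params->type == GGML_TASK_INIT) {
        if (params->ith != 0) {
            return;
        }
        // ... serial setup of shared buffers (memcpy/memset) ...
        return;
    }
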
@@ -9952,21 +9955,45 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
-            return;
-        }
+        const int64_t ne_plane      = ne01*ne00;
+        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
 
         if (params->type == GGML_TASK_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void    *       x        = (char *)  src0->data + i02*nb02 + i03*nb03;
+                        float         * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                        ggml_to_float_t const to_float = type_traits[type].to_float;
+
+                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                        }
+                    }
+                }
+            }
             return;
         }
 
         if (params->type == GGML_TASK_FINALIZE) {
            return;
         }
 
+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
+        const int64_t tgemm0 = ggml_perf_time_us();
         for (int64_t i13 = 0; i13 < ne13; i13++) {
             for (int64_t i12 = 0; i12 < ne12; i12++) {
-                // broadcast src0 into src1 across 2nd,3rd dimension
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
@@ -9975,17 +10002,7 @@ static void ggml_compute_forward_mul_mat(
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
-                    float * const wdata    = params->wdata;
-                    ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                    x = wdata;
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -9995,6 +10012,7 @@ static void ggml_compute_forward_mul_mat(
                         0.0f, d, ne01);
             }
         }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
 
         //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
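
The new INIT path above dequantizes all of src0 up front, split across threads by rows: thread `ith` of `nth` handles rows ith, ith+nth, ith+2*nth, ... of each plane, so no two threads ever write the same row. A self-contained sketch of that partitioning (`to_float_t` stands in for ggml's `type_traits[type].to_float`):

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*to_float_t)(const void * src, float * dst, int64_t n);

    // Strided row split: each of nth threads converts a disjoint set of rows,
    // writing row i01 of the plane to wdata + i01*ne00.
    static void dequant_plane(const char * x, size_t nb01, float * wdata,
                              int64_t ne00, int64_t ne01,
                              int ith, int nth, to_float_t to_float) {
        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
            to_float(x + i01*nb01, wdata + i01*ne00, ne00);
        }
    }
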
@@ -10003,6 +10021,9 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10167,6 +10188,9 @@ static void ggml_compute_forward_mul_mat_id(
     #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         char * wdata = params->wdata;
         if (src1->type != vec_dot_type) {
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10352,6 +10376,9 @@ static void ggml_compute_forward_out_prod_f32(
             return;
         }
 #endif
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10535,6 +10562,9 @@ static void ggml_compute_forward_out_prod_q_f32(
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10719,6 +10749,9 @@ static void ggml_compute_forward_set_f32(
     bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -11043,6 +11076,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11077,6 +11113,9 @@ static void ggml_compute_forward_get_rows_back_f32(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11214,6 +11253,9 @@ static void ggml_compute_forward_diag_mask_f32(
     GGML_ASSERT(n_past >= 0);
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -12184,6 +12226,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12278,6 +12323,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12502,6 +12550,9 @@ static void ggml_compute_forward_conv_transpose_2d(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -14116,6 +14167,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
 
     const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
     if (!inplace && params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
         return;
     }
@@ -16409,8 +16463,9 @@ struct ggml_compute_state_shared {
     const int n_threads;
 
     // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n;   // active graph node
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
 
     bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
     void * abort_callback_data;
@@ -16658,6 +16713,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
     return n_tasks;
 }
 
+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_node_n = * node_n;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * node_n = atomic_load(&state->shared->node_n);
+        if (* node_n != last_node_n) break;
+    }
+}
+
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_task_phase = * task_phase;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * task_phase = atomic_load(&state->shared->node_task);
+        if (* task_phase != last_task_phase) break;
+    }
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
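
Both helpers are the same spin-wait: remember the last observed value, then poll the shared atomic until it changes, optionally calling sched_yield() between polls. A standalone C11 version of the idiom (assuming POSIX sched_yield):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <sched.h>

    // Spin until *flag differs from the value this thread last saw.
    // do_yield trades wakeup latency for CPU time on oversubscribed systems.
    static int spin_until_changed(atomic_int * flag, int last_seen, bool do_yield) {
        int cur;
        do {
            if (do_yield) {
                sched_yield();
            }
            cur = atomic_load(flag);
        } while (cur == last_seen);
        return cur;
    }
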
@@ -16668,7 +16751,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     set_numa_thread_affinity(state->ith, n_threads);
 
-    int node_n = -1;
+    int node_n     = -1;
+    int task_phase = GGML_TASK_FINALIZE;
 
     while (true) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16708,13 +16792,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
                 params.nth = n_tasks;
 
-                /* INIT */
-                if (GGML_OP_HAS_INIT[node->op]) {
-                    params.type = GGML_TASK_INIT;
-                    ggml_compute_forward(&params, node);
-                }
-
                 if (n_tasks == 1) {
+                    /* INIT */
+                    if (GGML_OP_HAS_INIT[node->op]) {
+                        params.type = GGML_TASK_INIT;
+                        ggml_compute_forward(&params, node);
+                    }
+
                     // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                     // they do something more efficient than spinning (?)
                     params.type = GGML_TASK_COMPUTE;
@@ -16735,47 +16819,64 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 }
             }
 
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_n,   node_n);
+            task_phase = GGML_TASK_INIT;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_n,    node_n);
+            atomic_store(&state->shared->node_task, task_phase);
         } else {
-            // wait for other threads to finish
-            const int last = node_n;
-
-            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
-            while (true) {
-                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-                //       depending on the workload and the operating system.
-                //       since it is not clear what is the best approach, it should potentially become user-configurable
-                //       ref: https://github.com/ggerganov/ggml/issues/291
-                // UPD:  adding the do_yield flag seems to resolve the issue universally
-                if (do_yield) {
-                    sched_yield();
-                }
-
-                node_n = atomic_load(&state->shared->node_n);
-                if (node_n != last) break;
-            };
+            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
         }
 
         // check if we should stop
         if (node_n >= cgraph->n_nodes) break;
 
-        /* COMPUTE */
+        /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
         const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
         struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.type  =*/ GGML_TASK_INIT,
             /*.ith   =*/ state->ith,
             /*.nth   =*/ n_tasks,
             /*.wsize =*/ cplan->work_size,
             /*.wdata =*/ cplan->work_data,
         };
 
         if (state->ith < n_tasks) {
+            if (GGML_OP_HAS_INIT[node->op]) {
+                ggml_compute_forward(&params, node);
+            }
+        }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_COMPUTE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+            //       depending on the workload and the operating system.
+            //       since it is not clear what is the best approach, it should potentially become user-configurable
+            //       ref: https://github.com/ggerganov/ggml/issues/291
+            // UPD:  adding the do_yield flag seems to resolve the issue universally
+            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+        }
+
+        if (state->ith < n_tasks) {
+            params.type = GGML_TASK_COMPUTE;
             ggml_compute_forward(&params, node);
         }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_FINALIZE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        }
     }
 
     return GGML_EXIT_SUCCESS;
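
The INIT -> COMPUTE -> FINALIZE hand-off above is a countdown barrier: each thread decrements n_active, and the last one through (atomic_fetch_sub returning 1) re-arms the counter and publishes the next phase via node_task, which releases the spinners. A minimal sketch of the pattern in isolation:

    #include <stdatomic.h>

    // Countdown barrier as used between task phases: the last arriving thread
    // re-arms the counter and advances the shared phase; the others spin on it.
    static void phase_barrier(atomic_int * n_active, atomic_int * phase,
                              int n_threads, int next_phase) {
        if (atomic_fetch_sub(n_active, 1) == 1) {
            atomic_store(n_active, n_threads);
            atomic_store(phase, next_phase);
        } else {
            while (atomic_load(phase) != next_phase) {
                // spin (the real code optionally sched_yield()s here)
            }
        }
    }
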
@@ -16832,8 +16933,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
                        if (node->src[0]->type != GGML_TYPE_F32) {
-                            // here we need memory just for single 2D matrix from src0
-                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                            // here we need memory for fully dequantized matrix from src0
+                            cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]);
                        }
                    } else
#endif
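
This follows from the parallelized INIT loop: the work buffer must now hold every dequantized src0 plane at once rather than a single 2D matrix, matching the `assert(params->wsize >= desired_wsize)` in the INIT path. Assuming no broadcast (ne12 == ne02, ne13 == ne03), a 4096x4096 quantized src0 with two planes now reserves 2*4096*4096*4 bytes = 128 MiB of scratch instead of 64 MiB.
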
@@ -16987,6 +17088,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.n_threads           =*/ n_threads,
         /*.n_active            =*/ n_threads,
         /*.node_n              =*/ -1,
+        /*.node_task           =*/ GGML_TASK_FINALIZE,
         /*.abort_callback      =*/ NULL,
         /*.abort_callback_data =*/ NULL,
     };