
Commit 0cee572

ReinForce-II authored and ggerganov committed
ggml : parallelize FP32 conversion when using BLAS (ggml-org#5045)
* allow the GGML_TASK_INIT phase to run multithreaded
* multithreaded dequantize in mul_mat when using a BLAS library
* minor fixes
* update outdated comment
* fix coding style
* simplify code

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 8cdc4f4 commit 0cee572
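
The change builds on the row-striping pattern ggml already uses in the COMPUTE phase: worker ith of nth handles rows ith, ith+nth, ith+2*nth, and so on, so no locking is needed. A minimal, self-contained sketch of that pattern applied to dequantization (the names dequantize_row_t and dequantize_striped are illustrative placeholders, not ggml APIs):

#include <stddef.h>
#include <stdint.h>

// hypothetical row dequantizer, analogous to ggml's type_traits[type].to_float
typedef void (*dequantize_row_t)(const void * src, float * dst, int64_t n);

// thread ith of nth converts rows ith, ith+nth, ith+2*nth, ... of a quantized
// matrix into the float work buffer; every thread writes to disjoint rows,
// so no synchronization is needed inside the loop
static void dequantize_striped(const char * src, size_t row_bytes,
                               float * dst, int64_t n_rows, int64_t n_cols,
                               int ith, int nth, dequantize_row_t dequantize_row) {
    for (int64_t i = ith; i < n_rows; i += nth) {
        dequantize_row(src + i*row_bytes, dst + i*n_cols, n_cols);
    }
}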

File tree

1 file changed: +150 -48 lines changed


ggml.c

Lines changed: 150 additions & 48 deletions
@@ -7810,6 +7810,9 @@ static void ggml_compute_forward_acc_f32(
     bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -9952,21 +9955,45 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
-            return;
-        }
+        const int64_t ne_plane      = ne01*ne00;
+        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
 
         if (params->type == GGML_TASK_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                // parallelize by src0 rows
+                for (int64_t i13 = 0; i13 < ne13; i13++) {
+                    for (int64_t i12 = 0; i12 < ne12; i12++) {
+                        // broadcast src0 into src1 across 2nd,3rd dimension
+                        const int64_t i03 = i13/r3;
+                        const int64_t i02 = i12/r2;
+
+                        const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                        float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                        ggml_to_float_t const to_float = type_traits[type].to_float;
+
+                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                        }
+                    }
+                }
+            }
             return;
         }
 
         if (params->type == GGML_TASK_FINALIZE) {
             return;
         }
 
+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
+        const int64_t tgemm0 = ggml_perf_time_us();
         for (int64_t i13 = 0; i13 < ne13; i13++) {
             for (int64_t i12 = 0; i12 < ne12; i12++) {
-                // broadcast src0 into src1 across 2nd,3rd dimension
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
@@ -9975,17 +10002,7 @@ static void ggml_compute_forward_mul_mat(
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
-                    float * const wdata = params->wdata;
-                    ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                    x = wdata;
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -9995,6 +10012,7 @@ static void ggml_compute_forward_mul_mat(
                     0.0f, d, ne01);
             }
         }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
 
         //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
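For reference, a hedged sketch of the work-buffer layout the hunks above assume: the INIT phase writes one dequantized F32 plane of ne01*ne00 floats per (i12, i13) slice, and the sgemm loop later reads the same plane back at the same offset. The shape values below are invented for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // hypothetical src0 shape and src1 broadcast dimensions
    const int64_t ne00 = 4096, ne01 = 4096;   // src0 row length and row count
    const int64_t ne12 = 4,    ne13 = 2;      // broadcast dims taken from src1

    const int64_t ne_plane = ne01*ne00;       // floats per dequantized plane

    // slice (i12, i13) starts at this float index inside params->wdata
    const int64_t i12 = 2, i13 = 1;
    const int64_t offset = i13*ne12*ne_plane + i12*ne_plane;

    printf("plane (%lld, %lld) starts at float %lld\n",
           (long long) i12, (long long) i13, (long long) offset);
    return 0;
}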

@@ -10003,6 +10021,9 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         if (src1->type != vec_dot_type) {
             char * wdata = params->wdata;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10167,6 +10188,9 @@ static void ggml_compute_forward_mul_mat_id(
 #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         char * wdata = params->wdata;
         if (src1->type != vec_dot_type) {
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10352,6 +10376,9 @@ static void ggml_compute_forward_out_prod_f32(
             return;
         }
 #endif
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10535,6 +10562,9 @@ static void ggml_compute_forward_out_prod_q_f32(
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -10719,6 +10749,9 @@ static void ggml_compute_forward_set_f32(
     bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (params->ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         memcpy(
@@ -11043,6 +11076,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11077,6 +11113,9 @@ static void ggml_compute_forward_get_rows_back_f32(
     // ggml_compute_forward_dup_same_cont(params, opt0, dst);
 
     if (params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memset(dst->data, 0, ggml_nbytes(dst));
     }
 
@@ -11214,6 +11253,9 @@ static void ggml_compute_forward_diag_mask_f32(
     GGML_ASSERT(n_past >= 0);
 
     if (!inplace && (params->type == GGML_TASK_INIT)) {
+        if (ith != 0) {
+            return;
+        }
         // memcpy needs to be synchronized across threads to avoid race conditions.
         // => do it in INIT phase
         GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -12184,6 +12226,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12278,6 +12323,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12502,6 +12550,9 @@ static void ggml_compute_forward_conv_transpose_2d(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
+        if (ith != 0) {
+            return;
+        }
         memset(params->wdata, 0, params->wsize);
 
         // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -14116,6 +14167,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
 
     const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
     if (!inplace && params->type == GGML_TASK_INIT) {
+        if (params->ith != 0) {
+            return;
+        }
         memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
         return;
     }
@@ -16409,8 +16463,9 @@ struct ggml_compute_state_shared {
     const int n_threads;
 
     // synchronization primitives
-    atomic_int n_active; // num active threads
-    atomic_int node_n;   // active graph node
+    atomic_int n_active;  // num active threads
+    atomic_int node_n;    // active graph node
+    atomic_int node_task; // active graph node task phase
 
     bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
     void * abort_callback_data;
@@ -16658,6 +16713,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
     return n_tasks;
 }
 
+static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_node_n = * node_n;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * node_n = atomic_load(&state->shared->node_n);
+        if (* node_n != last_node_n) break;
+    }
+}
+
+static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
+    // wait for other threads to finish
+    const int last_task_phase = * task_phase;
+
+    while (true) {
+        if (do_yield) {
+            sched_yield();
+        }
+
+        * task_phase = atomic_load(&state->shared->node_task);
+        if (* task_phase != last_task_phase) break;
+    }
+}
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
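Both helpers are plain spin-waits on a C11 atomic: re-read the shared value until it differs from the last value this thread observed, optionally calling sched_yield() between reads. A stand-alone sketch of the same idea with hypothetical names:

#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

// spin until *shared differs from the caller's last observed value;
// optionally yield the CPU on each iteration to be kinder to the scheduler
static void spin_wait_for_change(atomic_int * shared, int * last_seen, bool do_yield) {
    const int last = *last_seen;
    while (true) {
        if (do_yield) {
            sched_yield();
        }
        *last_seen = atomic_load(shared);
        if (*last_seen != last) {
            break;
        }
    }
}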

@@ -16668,7 +16751,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     set_numa_thread_affinity(state->ith, n_threads);
 
-    int node_n = -1;
+    int node_n     = -1;
+    int task_phase = GGML_TASK_FINALIZE;
 
     while (true) {
         if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16708,13 +16792,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
             params.nth = n_tasks;
 
-            /* INIT */
-            if (GGML_OP_HAS_INIT[node->op]) {
-                params.type = GGML_TASK_INIT;
-                ggml_compute_forward(&params, node);
-            }
-
             if (n_tasks == 1) {
+                /* INIT */
+                if (GGML_OP_HAS_INIT[node->op]) {
+                    params.type = GGML_TASK_INIT;
+                    ggml_compute_forward(&params, node);
+                }
+
                 // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                 // they do something more efficient than spinning (?)
                 params.type = GGML_TASK_COMPUTE;
@@ -16735,47 +16819,64 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 }
             }
 
-            atomic_store(&state->shared->n_active, n_threads);
-            atomic_store(&state->shared->node_n,   node_n);
+            task_phase = GGML_TASK_INIT;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_n,    node_n);
+            atomic_store(&state->shared->node_task, task_phase);
         } else {
-            // wait for other threads to finish
-            const int last = node_n;
-
-            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
-
-            while (true) {
-                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-                //       depending on the workload and the operating system.
-                //       since it is not clear what is the best approach, it should potentially become user-configurable
-                //       ref: https://github.com/ggerganov/ggml/issues/291
-                // UPD: adding the do_yield flag seems to resolve the issue universally
-                if (do_yield) {
-                    sched_yield();
-                }
-
-                node_n = atomic_load(&state->shared->node_n);
-                if (node_n != last) break;
-            };
+            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
         }
 
         // check if we should stop
         if (node_n >= cgraph->n_nodes) break;
 
-        /* COMPUTE */
+        /* INIT & COMPUTE */
         struct ggml_tensor * node = cgraph->nodes[node_n];
         const int n_tasks = ggml_get_n_tasks(node, n_threads);
 
         struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_COMPUTE,
+            /*.type  =*/ GGML_TASK_INIT,
             /*.ith   =*/ state->ith,
             /*.nth   =*/ n_tasks,
             /*.wsize =*/ cplan->work_size,
             /*.wdata =*/ cplan->work_data,
         };
 
         if (state->ith < n_tasks) {
+            if (GGML_OP_HAS_INIT[node->op]) {
+                ggml_compute_forward(&params, node);
+            }
+        }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_COMPUTE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+            //       depending on the workload and the operating system.
+            //       since it is not clear what is the best approach, it should potentially become user-configurable
+            //       ref: https://github.com/ggerganov/ggml/issues/291
+            // UPD: adding the do_yield flag seems to resolve the issue universally
+            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
+            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
+        }
+
+        if (state->ith < n_tasks) {
+            params.type = GGML_TASK_COMPUTE;
             ggml_compute_forward(&params, node);
         }
+
+        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+            task_phase = GGML_TASK_FINALIZE;
+            atomic_store(&state->shared->n_active,  n_threads);
+            atomic_store(&state->shared->node_task, task_phase);
+        }
+        else {
+            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+        }
     }
 
     return GGML_EXIT_SUCCESS;
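
In plainer terms, each graph node now goes through a two-stage barrier: every thread decrements n_active when it finishes its share of the current phase, and the thread that observes the counter reach one (atomic_fetch_sub returns the previous value) re-arms the counter and publishes the next phase, while the others spin in ggml_graph_compute_thread_sync_task. A compressed sketch of that hand-off with illustrative names, assuming consecutive phases always differ:

#include <stdatomic.h>
#include <sched.h>

// last thread to arrive re-arms the counter and publishes next_phase;
// the others spin until they observe the phase change
static void phase_barrier(atomic_int * n_active, atomic_int * shared_phase,
                          int n_threads, int next_phase, int * local_phase) {
    if (atomic_fetch_sub(n_active, 1) == 1) {
        atomic_store(n_active, n_threads);
        atomic_store(shared_phase, next_phase);
        *local_phase = next_phase;
    } else {
        const int last = *local_phase;
        do {
            sched_yield();
            *local_phase = atomic_load(shared_phase);
        } while (*local_phase == last);
    }
}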
@@ -16832,8 +16933,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                 if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     if (node->src[0]->type != GGML_TYPE_F32) {
-                        // here we need memory just for single 2D matrix from src0
-                        cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
+                        // here we need memory for fully dequantized matrix from src0
+                        cur = ggml_type_size(GGML_TYPE_F32)*ggml_nelements(node->src[0]);
                     }
                 } else
 #endif
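
This planner change trades memory for parallelism: instead of a single 2D plane that one thread reused per slice, the work buffer must now hold the entire dequantized src0, i.e. ggml_type_size(GGML_TYPE_F32)*ggml_nelements(src0) bytes. A rough worked example with made-up dimensions:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // hypothetical quantized src0 shape: 4096 x 4096 x 4 x 2
    const int64_t ne[4] = {4096, 4096, 4, 2};
    const int64_t nelements = ne[0]*ne[1]*ne[2]*ne[3];

    const int64_t old_cur = (int64_t) sizeof(float)*ne[0]*ne[1]; // one 2D plane (old sizing)
    const int64_t new_cur = (int64_t) sizeof(float)*nelements;   // full tensor  (new sizing)

    printf("old work size: %lld bytes (%.1f MiB)\n", (long long) old_cur, old_cur/1048576.0);
    printf("new work size: %lld bytes (%.1f MiB)\n", (long long) new_cur, new_cur/1048576.0);
    return 0;
}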
@@ -16987,6 +17088,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.n_threads           =*/ n_threads,
         /*.n_active            =*/ n_threads,
         /*.node_n              =*/ -1,
+        /*.node_task           =*/ GGML_TASK_FINALIZE,
         /*.abort_callback      =*/ NULL,
         /*.abort_callback_data =*/ NULL,
     };
