Skip to content

Commit 086c5f6

Browse files
hodlen authored and ggerganov committed
ggml : do not sched_yield when calling BLAS (ggml-org#4761)
* ggml : do not sched_yield when calling BLAS (ggml-ci)
* ggml : fix do_yield logic (ggml-ci)
* ggml : simplify do_yield logic (ggml-ci)
1 parent fb7bb4c commit 086c5f6

File tree

1 file changed: +14 additions, -27 deletions

ggml.c

Lines changed: 14 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -9704,10 +9704,10 @@ static void ggml_compute_forward_group_norm(
97049704
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
97059705
// helper function to determine if it is better to use BLAS or not
97069706
// for large matrices, BLAS is faster
9707-
static bool ggml_compute_forward_mul_mat_use_blas(
9708-
const struct ggml_tensor * src0,
9709-
const struct ggml_tensor * src1,
9710-
struct ggml_tensor * dst) {
9707+
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
9708+
const struct ggml_tensor * src0 = dst->src[0];
9709+
const struct ggml_tensor * src1 = dst->src[1];
9710+
97119711
//const int64_t ne00 = src0->ne[0];
97129712
//const int64_t ne01 = src0->ne[1];
97139713

@@ -9787,7 +9787,7 @@ static void ggml_compute_forward_mul_mat(
97879787
#endif
97889788

97899789
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
9790-
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
9790+
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
97919791
if (params->ith != 0) {
97929792
return;
97939793
}
@@ -16301,24 +16301,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
1630116301

1630216302
//n_tasks = MIN(n_threads, MAX(1, nr0/128));
1630316303
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
16304-
16305-
#if defined(GGML_USE_CUBLAS)
16306-
if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
16307-
n_tasks = 1; // TODO: this actually is doing nothing
16308-
// the threads are still spinning
16309-
}
16310-
#elif defined(GGML_USE_CLBLAST)
16311-
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
16312-
n_tasks = 1; // TODO: this actually is doing nothing
16313-
// the threads are still spinning
16314-
}
16315-
#endif
16316-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16317-
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
16318-
n_tasks = 1; // TODO: this actually is doing nothing
16319-
// the threads are still spinning
16320-
}
16321-
#endif
1632216304
} break;
1632316305
case GGML_OP_MUL_MAT_ID:
1632416306
{
@@ -16491,6 +16473,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1649116473
state->shared->node_n += 1;
1649216474
return (thread_ret_t) GGML_EXIT_ABORTED;
1649316475
}
16476+
1649416477
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
1649516478
// all other threads are finished and spinning
1649616479
// do finalize and init here so we don't have synchronize again
@@ -16556,14 +16539,18 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
1655616539
} else {
1655716540
// wait for other threads to finish
1655816541
const int last = node_n;
16542+
16543+
const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
16544+
1655916545
while (true) {
1656016546
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
1656116547
// depending on the workload and the operating system.
1656216548
// since it is not clear what is the best approach, it should potentially become user-configurable
1656316549
// ref: https://github.com/ggerganov/ggml/issues/291
16564-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16565-
sched_yield();
16566-
#endif
16550+
// UPD: adding the do_yield flag seems to resolve the issue universally
16551+
if (do_yield) {
16552+
sched_yield();
16553+
}
1656716554

1656816555
node_n = atomic_load(&state->shared->node_n);
1656916556
if (node_n != last) break;
@@ -16642,7 +16629,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
1664216629
} else
1664316630
#endif
1664416631
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
16645-
if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
16632+
if (ggml_compute_forward_mul_mat_use_blas(node)) {
1664616633
if (node->src[0]->type != GGML_TYPE_F32) {
1664716634
// here we need memory just for single 2D matrix from src0
1664816635
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);

0 commit comments

Comments (0)