Commit cb828c8

ggml: parallelize to_float when using blas
* converting fp16 to fp32, or dequantizing, on a single thread can be the bottleneck rather than the gemm itself.
1 parent 862f5e4 commit cb828c8
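
The change below stripes this per-row conversion across all of the compute threads instead of leaving it on thread 0. As a rough illustration of the striping pattern the diff applies, here is a minimal standalone sketch; the names convert_rows_striped and to_float_t are illustrative and not part of the commit, and the byte/float strides stand in for ggml's nb01 and ne00.

#include <stdint.h>
#include <stddef.h>

// Convert one quantized/fp16 row of n elements into fp32
// (signature mirrors ggml_to_float_t).
typedef void (*to_float_t)(const void * src_row, float * dst_row, int64_t n);

// Thread ith of nth converts rows ith, ith+nth, ith+2*nth, ...
// so the conversion cost is split across threads instead of
// being serialized on thread 0.
static void convert_rows_striped(
        const char * src, size_t src_row_stride,   // source rows, nb01-style byte stride
        float      * dst, int64_t n_per_row,       // destination fp32 plane, ne00 floats per row
        int64_t n_rows, to_float_t to_float,
        int ith, int nth) {
    for (int64_t i01 = ith; i01 < n_rows; i01 += nth) {
        to_float(src + i01*src_row_stride, dst + i01*n_per_row, n_per_row);
    }
}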


1 file changed: +40 −15 lines changed

ggml.c

Lines changed: 40 additions & 15 deletions
@@ -1883,6 +1883,8 @@ struct ggml_state {
 static struct ggml_state g_state;
 static atomic_int g_state_barrier = 0;
 
+static atomic_int g_blas_pending = 0;
+
 // barrier via spin lock
 inline static void ggml_critical_section_start(void) {
     int processing = atomic_fetch_add(&g_state_barrier, 1);
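
The new g_blas_pending counter is used as an ad-hoc countdown barrier: the init pass stores params->nth into it, each thread decrements it after converting its share of rows, and every thread spins until it reaches zero so that the converted data is complete before thread 0 runs the GEMM. Below is a self-contained sketch of that pattern under the assumption of C11 atomics plus pthreads; pending, run_worker, do_partial_work and N_THREADS are illustrative names, not code from the commit.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define N_THREADS 4

static atomic_int pending = 0;   // plays the role of g_blas_pending

static void do_partial_work(int ith, int nth) {
    // stand-in for each thread's slice of the to_float conversion
    (void) ith; (void) nth;
}

static void * run_worker(void * p) {
    const int ith = (int)(intptr_t) p;

    do_partial_work(ith, N_THREADS);

    // signal completion, then spin until every thread has finished
    atomic_fetch_sub(&pending, 1);
    while (atomic_load(&pending) != 0) {
        // busy-wait (the commit leaves a sched_yield() call commented out here)
    }

    if (ith == 0) {
        // only thread 0 would go on to call cblas_sgemm
        printf("all threads done, thread 0 continues\n");
    }
    return NULL;
}

int main(void) {
    pthread_t th[N_THREADS];
    atomic_store(&pending, N_THREADS);   // like atomic_store(&g_blas_pending, params->nth)
    for (int i = 0; i < N_THREADS; i++) {
        pthread_create(&th[i], NULL, run_worker, (void *)(intptr_t) i);
    }
    for (int i = 0; i < N_THREADS; i++) {
        pthread_join(th[i], NULL);
    }
    return 0;
}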
@@ -9835,21 +9837,53 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        if (params->ith != 0) {
-            return;
-        }
+        const int64_t ne_plane      = ne01*ne00;
+        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
+        UNUSED(desired_wsize);
 
         if (params->type == GGML_TASK_INIT) {
+            if (type != GGML_TYPE_F32) {
+                assert(params->wsize >= desired_wsize);
+                atomic_store(&g_blas_pending, params->nth);
+            }
             return;
         }
 
         if (params->type == GGML_TASK_FINALIZE) {
             return;
         }
 
+        if (type != GGML_TYPE_F32) {
+            // parallelize by src0 rows
+            for (int64_t i13 = 0; i13 < ne13; i13++) {
+                for (int64_t i12 = 0; i12 < ne12; i12++) {
+                    // broadcast src0 into src1 across 2nd,3rd dimension
+                    const int64_t i03 = i13/r3;
+                    const int64_t i02 = i12/r2;
+
+                    const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+                    float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                    ggml_to_float_t const to_float = type_traits[type].to_float;
+
+                    for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
+                        to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
+                    }
+                }
+            }
+            atomic_fetch_sub(&g_blas_pending, 1);
+            while (atomic_load(&g_blas_pending) != 0) {
+                // sched_yield();
+            }
+        }
+
+        // perform sgemm, parallelization controlled by blas lib
+        if (ith != 0) {
+            return;
+        }
+
+        const int64_t tgemm0 = ggml_perf_time_us();
         for (int64_t i13 = 0; i13 < ne13; i13++) {
             for (int64_t i12 = 0; i12 < ne12; i12++) {
-                // broadcast src0 into src1 across 2nd,3rd dimension
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
 
@@ -9858,17 +9892,7 @@ static void ggml_compute_forward_mul_mat(
                 float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
                 if (type != GGML_TYPE_F32) {
-                    float * const wdata = params->wdata;
-                    ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                    x = wdata;
+                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
                 }
 
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -9878,6 +9902,7 @@ static void ggml_compute_forward_mul_mat(
                         0.0f, d, ne01);
             }
         }
+        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
 
         //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
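
A note on the workspace layout the diff relies on: each (i12, i13) plane now owns a contiguous block of ne01*ne00 floats inside params->wdata, which is why the running id offset from the removed code is no longer needed and why the init pass can assert wsize >= ne13*ne12*ne01*ne00*sizeof(float). A small sketch of those two index/size computations, reusing the dimension names from the diff (the helper names are illustrative):

#include <stdint.h>
#include <stddef.h>

// Offset (in floats) of row i01 of plane (i12, i13) inside params->wdata,
// matching wdata + i13*ne12*ne_plane + i12*ne_plane + i01*ne00 in the diff.
static inline int64_t wdata_row_offset(int64_t i01, int64_t i12, int64_t i13,
                                       int64_t ne00, int64_t ne01, int64_t ne12) {
    const int64_t ne_plane = ne01*ne00;
    return i13*ne12*ne_plane + i12*ne_plane + i01*ne00;
}

// Total scratch size in bytes, matching desired_wsize in the diff.
static inline size_t wdata_size(int64_t ne00, int64_t ne01, int64_t ne12, int64_t ne13) {
    return (size_t)(ne13*ne12*ne01*ne00)*sizeof(float);
}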