From 47edd35d7841e02623d4c0e7ba9fbf72971f58de Mon Sep 17 00:00:00 2001
From: Jean-Baptiste BESNARD
Date: Mon, 20 May 2024 00:10:54 +0200
Subject: [PATCH] OpenMP: remove repetitive thread creation using OpenMP

- Add OpenMP to dependency detection
- Add GGML_NO_OMP if OpenMP is disabled or not found
- Keep the previous pthread-based approach when OpenMP is unavailable
---
 CMakeLists.txt | 20 ++++++++++++++
 Makefile       |  8 ++++++
 ggml.c         | 71 +++++++++++++++++++++++++++++++-------------------
 3 files changed, 72 insertions(+), 27 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cbeb2ee37500e..9c325e8bf464a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
+option(LLAMA_DISABLE_OMP "llama: disable OpenMP support" OFF)
 option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
@@ -1242,6 +1243,25 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS ggml_shared LIBRARY)
 endif()
 
+
+if (NOT LLAMA_DISABLE_OMP)
+    find_package(OpenMP)
+    if (OpenMP_CXX_FOUND)
+        message(STATUS "OpenMP found")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+        target_link_libraries(ggml PUBLIC OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    else()
+        message(STATUS "OpenMP not found, falling back to standard threading")
+        add_compile_definitions(GGML_NO_OMP)
+    endif()
+else()
+    message(STATUS "OpenMP disabled, falling back to standard threading")
+    add_compile_definitions(GGML_NO_OMP)
+endif()
+
+
 # llama
 
 add_library(llama
diff --git a/Makefile b/Makefile
index 22d5218565d23..c61d732fed982 100644
--- a/Makefile
+++ b/Makefile
@@ -406,6 +406,14 @@ ifdef LLAMA_MPI
 	OBJS += ggml-mpi.o
 endif # LLAMA_MPI
 
+ifndef LLAMA_NO_OMP
+	MK_CPPFLAGS += -fopenmp
+	MK_CFLAGS   += -fopenmp
+	MK_LDFLAGS  += -fopenmp
+else
+	MK_CPPFLAGS += -DGGML_NO_OMP
+endif # LLAMA_NO_OMP
+
 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS  += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS    += $(shell pkg-config --cflags-only-other openblas)
diff --git a/ggml.c b/ggml.c
index 3a104c486339e..276a319bd6d08 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19334,8 +19334,12 @@ typedef int ggml_lock_t;
 
 #endif
 
+#ifdef GGML_NO_OMP
+
+
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
+
 static void set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
@@ -19401,11 +19405,16 @@ static void clear_numa_thread_affinity(void) {
 
     CPU_FREE(cpus);
 }
+
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
 static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
+
+#endif
+
+
 #endif
 
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -19713,7 +19722,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     const int n_threads = state->shared->n_threads;
 
+#ifdef GGML_NO_OMP
     set_numa_thread_affinity(state->ith);
+#endif
 
     int node_n = -1;
     int task_phase = GGML_TASK_TYPE_FINALIZE;
@@ -20086,44 +20097,50 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .ith = j,
-                .shared = &state_shared,
-                .ec = GGML_STATUS_SUCCESS,
-            };
+    const int64_t perf_start_cycles  = ggml_perf_cycles();
+    const int64_t perf_start_time_us = ggml_perf_time_us();
 
+    /* reversed loop: with GGML_NO_OMP the worker threads must be created
+       before the main thread (j == 0) starts computing; schedule(static, 1)
+       with num_threads(n_threads) pins one iteration to each OpenMP thread */
+    #pragma omp parallel for schedule(static, 1) num_threads(n_threads) shared(workers, state_shared)
+    for (int j = n_threads - 1; 0 <= j; j--) {
+        workers[j] = (struct ggml_compute_state) {
+            .ith    = j,
+            .shared = &state_shared,
+            .ec     = GGML_STATUS_SUCCESS,
+        };
+
+#ifdef GGML_NO_OMP
+        if (j == 0)
+        {
+            /* no need to spawn a thread for the main thread */
+            ggml_graph_compute_thread(&workers[j]);
+        }
+        else
+        {
             const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
             GGML_ASSERT(rc == 0);
             UNUSED(rc);
         }
+#else
+        ggml_graph_compute_thread(&workers[j]);
+#endif
     }
 
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
-    const int64_t perf_start_cycles  = ggml_perf_cycles();
-    const int64_t perf_start_time_us = ggml_perf_time_us();
-
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
+#ifdef GGML_NO_OMP
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+#endif
 
     enum ggml_status compute_status = workers[0].ec;
 
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
     // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
+    for (int j = 1; j < n_threads; j++) {
+#ifdef GGML_NO_OMP
+        const int rc = ggml_thread_join(workers[j].thrd, NULL);
+        GGML_ASSERT(rc == 0);
+#endif
+        if (workers[j].ec != GGML_STATUS_SUCCESS)
+            compute_status = workers[j].ec;
     }
 
     // performance stats (graph)
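
Note (reviewer sketch, not part of the patch): ggml_graph_compute_thread()
spin-waits until all n_threads workers have checked in, so the parallel loop
above only works if every iteration lands on its own OpenMP thread; that is
what the schedule(static, 1) and num_threads(n_threads) clauses guarantee.
Without -fopenmp the pragma is ignored and the loop degrades to a sequential
one, which is why the GGML_NO_OMP path must keep creating pthreads itself.
Below is a minimal standalone C sketch of the dispatch pattern, assuming
nothing from ggml (worker_state, worker_fn and N_WORKERS are illustrative
names, not ggml symbols):

    #include <stdio.h>

    #define N_WORKERS 4                    /* stand-in for n_threads */

    /* per-worker state, loosely mirroring struct ggml_compute_state */
    struct worker_state {
        int ith;                           /* worker index            */
        int ec;                            /* stand-in for the status */
    };

    /* every worker, including index 0, runs the same function,
       as ggml_graph_compute_thread() does in the patch */
    static void worker_fn(struct worker_state * s) {
        s->ec = 0;                         /* placeholder for graph work */
    }

    int main(void) {
        struct worker_state workers[N_WORKERS];

        /* schedule(static, 1) hands iteration j to team thread j and
           num_threads() requests exactly N_WORKERS threads, so no
           thread ever runs two iterations back to back */
        #pragma omp parallel for schedule(static, 1) num_threads(N_WORKERS) shared(workers)
        for (int j = N_WORKERS - 1; 0 <= j; j--) {
            workers[j] = (struct worker_state) { .ith = j, .ec = -1 };
            worker_fn(&workers[j]);
        }

        /* the parallel region joins implicitly: no pthread_join() here */
        for (int j = 0; j < N_WORKERS; j++) {
            printf("worker %d finished with ec=%d\n", j, workers[j].ec);
        }
        return 0;
    }

Compiled with "cc -fopenmp sketch.c" the loop runs on N_WORKERS threads;
without -fopenmp the pragma is ignored and the loop runs sequentially, which
this sketch tolerates because its workers never wait on one another, unlike
ggml's.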