From 47edd35d7841e02623d4c0e7ba9fbf72971f58de Mon Sep 17 00:00:00 2001
From: Jean-Baptiste BESNARD
Date: Mon, 20 May 2024 00:10:54 +0200
Subject: [PATCH] OpenMP: remove repetitive thread creation using OpenMP

- Add OpenMP to dependency detection
- Add GGML_NO_OMP if OpenMP is disabled or not found
- Keep the previous pthread-based approach when OpenMP is unavailable
---
 CMakeLists.txt | 20 ++++++++++++++
 Makefile       |  8 ++++++
 ggml.c         | 71 +++++++++++++++++++++++++++++++-------------------
 3 files changed, 72 insertions(+), 27 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cbeb2ee37500e..9c325e8bf464a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
+option(LLAMA_DISABLE_OMP "llama: disable OpenMP support" OFF)
 option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
@@ -1242,6 +1243,25 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS ggml_shared LIBRARY)
 endif()
 
+
+if (NOT LLAMA_DISABLE_OMP)
+    find_package(OpenMP)
+    if (OpenMP_CXX_FOUND)
+        message(STATUS "OpenMP found")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+        target_link_libraries(ggml PUBLIC OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    else()
+        message(STATUS "OpenMP not found, falling back to standard threading")
+        add_compile_definitions(GGML_NO_OMP)
+    endif()
+else()
+    message(STATUS "OpenMP disabled, falling back to standard threading")
+    add_compile_definitions(GGML_NO_OMP)
+endif()
+
+
 # llama
 
 add_library(llama
diff --git a/Makefile b/Makefile
index 22d5218565d23..c61d732fed982 100644
--- a/Makefile
+++ b/Makefile
@@ -406,6 +406,14 @@ ifdef LLAMA_MPI
 	OBJS += ggml-mpi.o
 endif # LLAMA_MPI
 
+ifndef LLAMA_NO_OMP
+	MK_CPPFLAGS += -fopenmp
+	MK_CFLAGS   += -fopenmp
+	MK_LDFLAGS  += -fopenmp
+else
+	MK_CPPFLAGS += -DGGML_NO_OMP
+endif # LLAMA_NO_OMP
+
 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS  += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS    += $(shell pkg-config --cflags-only-other openblas)
diff --git a/ggml.c b/ggml.c
index 3a104c486339e..276a319bd6d08 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19334,8 +19334,12 @@ typedef int ggml_lock_t;
 
 #endif
 
+#ifdef GGML_NO_OMP
+
+
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
+
 static void set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
@@ -19401,11 +19405,16 @@ static void clear_numa_thread_affinity(void) {
 
     CPU_FREE(cpus);
 }
+
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
 static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
+
+#endif
+
+
 #endif
 
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -19713,7 +19722,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     const int n_threads = state->shared->n_threads;
 
+#ifdef GGML_NO_OMP
     set_numa_thread_affinity(state->ith);
+#endif
 
     int node_n = -1;
     int task_phase = GGML_TASK_TYPE_FINALIZE;
@@ -20086,44 +20097,50 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .ith = j,
-                .shared = &state_shared,
-                .ec = GGML_STATUS_SUCCESS,
-            };
+    const int64_t perf_start_cycles  = ggml_perf_cycles();
+    const int64_t perf_start_time_us = ggml_perf_time_us();
 
+    /* reversed loop: with GGML_NO_OMP the worker threads must be created
+       before the main thread (j == 0) starts computing; schedule(static, 1)
+       with num_threads(n_threads) pins one iteration to each OpenMP thread */
+    #pragma omp parallel for schedule(static, 1) num_threads(n_threads) shared(workers, state_shared)
+    for (int j = n_threads - 1; 0 <= j; j--) {
+        workers[j] = (struct ggml_compute_state) {
+            .ith    = j,
+            .shared = &state_shared,
+            .ec     = GGML_STATUS_SUCCESS,
+        };
+
+#ifdef GGML_NO_OMP
+        if (j == 0)
+        {
+            /* no need to spawn a thread for the main thread */
+            ggml_graph_compute_thread(&workers[j]);
+        }
+        else
+        {
             const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
             GGML_ASSERT(rc == 0);
             UNUSED(rc);
         }
+#else
+        ggml_graph_compute_thread(&workers[j]);
+#endif
     }
 
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
-    const int64_t perf_start_cycles  = ggml_perf_cycles();
-    const int64_t perf_start_time_us = ggml_perf_time_us();
-
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
+#ifdef GGML_NO_OMP
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+#endif
 
     enum ggml_status compute_status = workers[0].ec;
 
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
     // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
+    for (int j = 1; j < n_threads; j++) {
+#ifdef GGML_NO_OMP
+        const int rc = ggml_thread_join(workers[j].thrd, NULL);
+        GGML_ASSERT(rc == 0);
+#endif
+        if (workers[j].ec != GGML_STATUS_SUCCESS)
+            compute_status = workers[j].ec;
     }
 
     // performance stats (graph)
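
Note (reviewer sketch, not part of the patch): ggml_graph_compute_thread()
spin-waits until all n_threads workers have checked in, so the parallel loop
above only works if every iteration lands on its own OpenMP thread; that is
what the schedule(static, 1) and num_threads(n_threads) clauses guarantee.
Without -fopenmp the pragma is ignored and the loop degrades to a sequential
one, which is why the GGML_NO_OMP path must keep creating pthreads itself.
Below is a minimal standalone C sketch of the dispatch pattern, assuming
nothing from ggml (worker_state, worker_fn and N_WORKERS are illustrative
names, not ggml symbols):

    #include <stdio.h>

    #define N_WORKERS 4                    /* stand-in for n_threads */

    /* per-worker state, loosely mirroring struct ggml_compute_state */
    struct worker_state {
        int ith;                           /* worker index            */
        int ec;                            /* stand-in for the status */
    };

    /* every worker, including index 0, runs the same function,
       as ggml_graph_compute_thread() does in the patch */
    static void worker_fn(struct worker_state * s) {
        s->ec = 0;                         /* placeholder for graph work */
    }

    int main(void) {
        struct worker_state workers[N_WORKERS];

        /* schedule(static, 1) hands iteration j to team thread j and
           num_threads() requests exactly N_WORKERS threads, so no
           thread ever runs two iterations back to back */
        #pragma omp parallel for schedule(static, 1) num_threads(N_WORKERS) shared(workers)
        for (int j = N_WORKERS - 1; 0 <= j; j--) {
            workers[j] = (struct worker_state) { .ith = j, .ec = -1 };
            worker_fn(&workers[j]);
        }

        /* the parallel region joins implicitly: no pthread_join() here */
        for (int j = 0; j < N_WORKERS; j++) {
            printf("worker %d finished with ec=%d\n", j, workers[j].ec);
        }
        return 0;
    }

Compiled with "cc -fopenmp sketch.c" the loop runs on N_WORKERS threads;
without -fopenmp the pragma is ignored and the loop runs sequentially, which
this sketch tolerates because its workers never wait on one another, unlike
ggml's.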