
Commit a5735e4

msy-kato, slaren, and ggerganov
authored
ggml : use OpenMP as a thread pool (#7606)
* ggml: Added OpenMP for multi-threaded processing
* ggml: Limit the number of threads used to avoid deadlock
* Update shared state n_threads in parallel region
* Clear NUMA affinity for the main thread even with OpenMP
* Enable OpenMP by default
* Fix MSVC build
* Disable OpenMP on macOS
* ci: Disable OpenMP with thread sanitizer
* Update ggml.c

Co-authored-by: Georgi Gerganov <[email protected]>

---------

Co-authored-by: slaren <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 0b832d5 commit a5735e4
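
The change replaces per-graph thread creation with an OpenMP parallel region that acts as the thread pool. Below is a minimal, self-contained C sketch of that pattern; it is an illustration under assumed names, not code from the commit (worker_state, worker_fn, and compute_with_pool are hypothetical stand-ins for ggml_compute_state, ggml_graph_compute_thread, and the new ggml_graph_compute_parallel):

// Minimal sketch of the OpenMP-as-thread-pool pattern (hypothetical names).
// Build with: cc -fopenmp sketch.c
#include <omp.h>
#include <stdio.h>

typedef struct {
    int ith;        // this worker's index
    int n_threads;  // team size, shared by convention across all workers
} worker_state;

static void worker_fn(worker_state * w) {
    // in ggml this would process the worker's share of the graph
    printf("worker %d of %d\n", w->ith, w->n_threads);
}

static void compute_with_pool(worker_state * workers, int n_requested) {
    #pragma omp parallel num_threads(n_requested)
    {
        #pragma omp single
        {
            // OpenMP may grant fewer threads than requested; publish the
            // actual team size so the workers agree on it (the implicit
            // barrier at the end of "single" orders this write)
            int n = omp_get_num_threads();
            for (int j = 0; j < n; ++j) {
                workers[j].n_threads = n;
            }
        }
        // every member of the team acts as a pool thread
        worker_fn(&workers[omp_get_thread_num()]);
    }
}

int main(void) {
    worker_state workers[8];
    for (int j = 0; j < 8; ++j) {
        workers[j].ith = j;
        workers[j].n_threads = 8;
    }
    compute_with_pool(workers, 8);
    return 0;
}

Compiled with -fopenmp, the runtime reuses the region's threads across calls, which removes the ggml_thread_create/ggml_thread_join cost previously paid on every graph evaluation.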

4 files changed: +103 additions, −38 deletions


.github/workflows/build.yml

Lines changed: 10 additions & 0 deletions

@@ -294,12 +294,22 @@ jobs:
 
     - name: Build
       id: cmake_build
+      if: ${{ matrix.sanitizer != 'THREAD' }}
       run: |
         mkdir build
         cd build
         cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
         cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
 
+    - name: Build (no OpenMP)
+      id: cmake_build_no_openmp
+      if: ${{ matrix.sanitizer == 'THREAD' }}
+      run: |
+        mkdir build
+        cd build
+        cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
+        cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
 
     - name: Test
       id: cmake_test
       run: |
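
A note on why this step exists: when the matrix sanitizer is THREAD, the regular build is skipped and the project is configured with -DLLAMA_OPENMP=OFF, falling back to the pthread pool. Thread sanitizer generally cannot see the synchronization inside an uninstrumented OpenMP runtime and tends to flood the run with false-positive race reports, hence the "ci : disable openmp with thread sanitizer" item in the commit message.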

CMakeLists.txt

Lines changed: 12 additions & 0 deletions

@@ -126,6 +126,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_RPC "llama: use RPC" OFF)
+option(LLAMA_OPENMP "llama: use OpenMP" ON)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
 set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")

@@ -296,6 +297,17 @@ if (LLAMA_METAL)
     )
 endif()
 
+if (LLAMA_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+        add_compile_definitions(GGML_USE_OPENMP)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+    else()
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
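
With the option defaulting to ON, a plain cmake configure now defines GGML_USE_OPENMP and links OpenMP::OpenMP_C and OpenMP::OpenMP_CXX whenever find_package(OpenMP) succeeds; if the package is missing, the build degrades to the pthread pool with only a warning. Passing -DLLAMA_OPENMP=OFF, as the thread-sanitizer CI job above does, opts out explicitly.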

Makefile

Lines changed: 8 additions & 0 deletions

@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
 		LLAMA_METAL := 1
 	endif
 
+	LLAMA_NO_OPENMP := 1
+
 	ifneq ($(UNAME_P),arm)
 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 		ifeq ($(SYSCTL_M),1)

@@ -405,6 +407,12 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 endif # LLAMA_NO_ACCELERATE
 
+ifndef LLAMA_NO_OPENMP
+	MK_CPPFLAGS += -DGGML_USE_OPENMP
+	MK_CFLAGS   += -fopenmp
+	MK_CXXFLAGS += -fopenmp
+endif # LLAMA_NO_OPENMP
+
 ifdef LLAMA_OPENBLAS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
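
The Makefile mirrors the CMake default: -fopenmp and -DGGML_USE_OPENMP are added unless LLAMA_NO_OPENMP is defined, so "make LLAMA_NO_OPENMP=1" opts out. On Darwin the Makefile defines LLAMA_NO_OPENMP := 1 itself (the "disable openmp on macos" item from the commit message), since Apple's default Clang toolchain does not enable -fopenmp out of the box.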

ggml.c

Lines changed: 73 additions & 38 deletions

@@ -5,6 +5,7 @@
 #include "ggml-quants.h"
 #include "ggml.h"
 
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)

@@ -28,6 +29,10 @@
 #include <syscall.h>
 #endif
 
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
 #ifdef GGML_USE_METAL
 #include <unistd.h>
 #endif

@@ -1756,7 +1761,7 @@ struct ggml_compute_state_shared {
     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;
 
-    const int n_threads;
+    int n_threads;
 
     // synchronization primitives
     atomic_int n_active; // num active threads

@@ -19670,6 +19675,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
     return cplan;
 }
 
+static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
+    enum ggml_status compute_status = GGML_STATUS_SUCCESS;
+
+#ifdef GGML_USE_OPENMP
+    if (n_threads > 1) {
+        #pragma omp parallel num_threads(n_threads)
+        {
+            #pragma omp single
+            {
+                // update the number of threads from the actual number of threads that we got from OpenMP
+                n_threads = omp_get_num_threads();
+                workers[0].shared->n_threads = n_threads;
+                workers[0].shared->n_active  = n_threads;
+            }
+            ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
+        }
+    } else {
+        ggml_graph_compute_thread(&workers[0]);
+    }
+#else
+    // create thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; ++j) {
+            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+
+    // this is a work thread too
+    ggml_graph_compute_thread(&workers[0]);
+
+    // join or kill thread pool
+    if (n_threads > 1) {
+        for (int j = 1; j < n_threads; j++) {
+            const int rc = ggml_thread_join(workers[j].thrd, NULL);
+            GGML_ASSERT(rc == 0);
+            UNUSED(rc);
+        }
+    }
+#endif
+    // don't leave affinity set on the main thread
+    clear_numa_thread_affinity();
+
+    for (int j = 0; j < n_threads; j++) {
+        if (workers[j].ec != GGML_STATUS_SUCCESS) {
+            compute_status = workers[j].ec;
+            break;
+        }
+    }
+    return compute_status;
+}
+
 enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
         GGML_ASSERT(cplan);

@@ -19680,7 +19738,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
-    const int n_threads = cplan->n_threads;
+    int n_threads = cplan->n_threads;
+
+#if defined(GGML_USE_OPENMP)
+    n_threads = MIN(n_threads, omp_get_max_threads());
+#endif
 
     struct ggml_compute_state_shared state_shared = {
         /*.cgraph =*/ cgraph,

@@ -19696,47 +19758,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.current_chunk; =*/ 0,
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .ith    = j,
-                .shared = &state_shared,
-                .ec     = GGML_STATUS_SUCCESS,
-            };
-
-            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
-
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
     const int64_t perf_start_cycles  = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
 
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
-    enum ggml_status compute_status = workers[0].ec;
-
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
-    // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
+    for (int j = 0; j < n_threads; ++j) {
+        workers[j] = (struct ggml_compute_state) {
+            .thrd   = 0,
+            .ith    = j,
+            .shared = &state_shared,
+            .ec     = GGML_STATUS_SUCCESS,
+        };
     }
 
+    enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
+
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
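
Two details in this diff are easy to miss. First, n_threads in ggml_compute_state_shared loses its const qualifier so the team size can be corrected at runtime: ggml_graph_compute_parallel re-reads omp_get_num_threads() inside the omp single block, because the runtime may grant fewer threads than requested, and the n_active countdown would otherwise never reach zero (the "Limit the number of threads used to avoid deadlock" item). Second, clear_numa_thread_affinity() now runs in both the OpenMP and pthread paths, so affinity is never left set on the main thread. A hedged, standalone C sketch of the clamp against omp_get_max_threads() (the requested value of 64 is an arbitrary example, not from the commit):

#include <omp.h>
#include <stdio.h>

int main(void) {
    int requested = 64;                    // e.g. cplan->n_threads
    int max_avail = omp_get_max_threads(); // respects OMP_NUM_THREADS etc.
    int n_threads = requested < max_avail ? requested : max_avail; // the patch's MIN()

    #pragma omp parallel num_threads(n_threads)
    {
        #pragma omp single
        {
            // even after the clamp, re-check the team size actually granted
            printf("requested %d, capped to %d, got %d\n",
                   requested, n_threads, omp_get_num_threads());
        }
    }
    return 0;
}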
