Commit 47edd35

OpenMP: remove repetitive thread creation using OpenMP
- Add OpenMP to dependency detection
- Add GGML_NO_OMP if OpenMP is disabled or not found
- Keep previous approach if there is no OMP
Parent: 1ea2a00

3 files changed, 72 insertions(+), 27 deletions(-)
CMakeLists.txt

Lines changed: 20 additions & 0 deletions
@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
+option(LLAMA_DISABLE_OMP "Disable OpenMP support" OFF)
 option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
@@ -1242,6 +1243,25 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS ggml_shared LIBRARY)
 endif()
 
+
+if (NOT LLAMA_DISABLE_OMP)
+    find_package(OpenMP)
+    if(OpenMP_CXX_FOUND)
+        message(STATUS "OpenMP found")
+        set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+        target_link_libraries(ggml PUBLIC OpenMP::OpenMP_CXX)
+    else()
+        message(STATUS "OpenMP NOT found activating standard threading")
+        add_compile_definitions(GGML_NO_OMP)
+    endif()
+else()
+    message(STATUS "OpenMP disabled activating standard threading")
+    add_compile_definitions(GGML_NO_OMP)
+endif()
+
+
 # llama
 
 add_library(llama
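
With this in place, OpenMP is picked up automatically at configure time, and a build can still opt out explicitly. A typical invocation of each path might look like the following; only LLAMA_DISABLE_OMP comes from the diff above, the rest is standard CMake usage:

    # default: detect OpenMP and link ggml against OpenMP::OpenMP_CXX
    cmake -B build
    cmake --build build

    # force the fallback: skip detection and define GGML_NO_OMP
    cmake -B build -DLLAMA_DISABLE_OMP=ON
    cmake --build build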

Makefile

Lines changed: 8 additions & 0 deletions
@@ -406,6 +406,14 @@ ifdef LLAMA_MPI
 OBJS += ggml-mpi.o
 endif # LLAMA_MPI
 
+ifndef LLAMA_NO_OMP
+MK_CPPFLAGS += -fopenmp
+MK_CFLAGS   += -fopenmp
+MK_LDFLAGS  += -fopenmp
+else
+MK_CPPFLAGS += -DGGML_NO_OMP
+endif
+
 ifdef LLAMA_OPENBLAS
 MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
 MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
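
The Makefile mirrors the CMake logic, but as an opt-out variable rather than an option. A sketch of both invocations, with the variable name taken from the diff above:

    # default: compile and link everything with -fopenmp
    make

    # opt out: define GGML_NO_OMP and keep the original thread pool
    make LLAMA_NO_OMP=1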

ggml.c

Lines changed: 44 additions & 27 deletions
@@ -19334,8 +19334,12 @@ typedef int ggml_lock_t;
 
 #endif
 
+#ifdef GGML_NO_OMP
+
+
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
+
 static void set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
@@ -19401,11 +19405,16 @@ static void clear_numa_thread_affinity(void) {
 
     CPU_FREE(cpus);
 }
+
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
 static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
+
+#endif
+
+
 #endif
 
 static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -19713,7 +19722,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
     const int n_threads = state->shared->n_threads;
 
+#ifdef GGML_NO_OMP
     set_numa_thread_affinity(state->ith);
+#endif
 
     int node_n = -1;
     int task_phase = GGML_TASK_TYPE_FINALIZE;
@@ -20086,44 +20097,50 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     };
     struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
 
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .ith    = j,
-                .shared = &state_shared,
-                .ec     = GGML_STATUS_SUCCESS,
-            };
+    const int64_t perf_start_cycles  = ggml_perf_cycles();
+    const int64_t perf_start_time_us = ggml_perf_time_us();
 
+    /* Loop is reversed as in the NO_OMP case we want threads to start
+       before the main thread (j==0) */
+    #pragma omp parallel for shared(workers,state_shared)
+    for (int j = n_threads - 1; 0 <= j; j--) {
+        workers[j] = (struct ggml_compute_state) {
+            .ith    = j,
+            .shared = &state_shared,
+            .ec     = GGML_STATUS_SUCCESS,
+        };
+
+#ifdef GGML_NO_OMP
+        if(j == 0)
+        {
+            /* No need to spawn a thread for main */
+            ggml_graph_compute_thread(&workers[j]);
+        }
+        else
+        {
             const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
             GGML_ASSERT(rc == 0);
             UNUSED(rc);
         }
+#else
+        ggml_graph_compute_thread(&workers[j]);
+#endif
     }
 
-    workers[0].ith = 0;
-    workers[0].shared = &state_shared;
-    workers[0].ec = GGML_STATUS_SUCCESS;
-
-    const int64_t perf_start_cycles  = ggml_perf_cycles();
-    const int64_t perf_start_time_us = ggml_perf_time_us();
+#ifdef GGML_NO_OMP
+    clear_numa_thread_affinity();
+#endif
 
-    // this is a work thread too
-    ggml_graph_compute_thread(&workers[0]);
     enum ggml_status compute_status = workers[0].ec;
 
-    // don't leave affinity set on the main thread
-    clear_numa_thread_affinity();
-
     // join or kill thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; j++) {
-            const int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            if (workers[j].ec != GGML_STATUS_SUCCESS)
-                compute_status = workers[j].ec;
-        }
+    for (int j = 1; j < n_threads; j++) {
+#ifdef GGML_NO_OMP
+        const int rc = ggml_thread_join(workers[j].thrd, NULL);
+        GGML_ASSERT(rc == 0);
+#endif
+        if (workers[j].ec != GGML_STATUS_SUCCESS)
+            compute_status = workers[j].ec;
     }
 
     // performance stats (graph)
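
The reworked dispatch loop is easier to follow outside the diff. Below is a minimal, self-contained sketch of the same pattern; state_t, worker_fn, and N_THREADS are illustrative stand-ins, not ggml symbols. Compiled with -fopenmp, the pragma turns the loop into a parallel region and every iteration runs its worker inline; compiled with -DGGML_NO_OMP and without -fopenmp (so the pragma is ignored), iterations with j > 0 spawn pthreads and j == 0 runs on the calling thread, which is why the loop counts downward: the helpers are created before the main thread starts its own share of the work.

    #include <pthread.h>
    #include <stdio.h>

    #define N_THREADS 4

    typedef struct {
        int ith;     // thread index
        int result;  // stand-in for the real computation's output
    } state_t;

    static void * worker_fn(void * arg) {
        state_t * s = (state_t *) arg;
        s->result = s->ith * s->ith;  // placeholder for graph work
        return NULL;
    }

    int main(void) {
    #ifdef GGML_NO_OMP
        pthread_t thrd[N_THREADS];
    #endif
        state_t workers[N_THREADS];

        // Reversed loop, as in the commit: under GGML_NO_OMP the helper
        // threads (j > 0) are created before the main thread (j == 0)
        // does its share; under OpenMP each iteration is a parallel task.
        #pragma omp parallel for shared(workers)
        for (int j = N_THREADS - 1; 0 <= j; j--) {
            workers[j] = (state_t) { .ith = j, .result = 0 };
    #ifdef GGML_NO_OMP
            if (j == 0) {
                worker_fn(&workers[j]);  // no need to spawn a thread for main
            } else {
                pthread_create(&thrd[j], NULL, worker_fn, &workers[j]);
            }
    #else
            worker_fn(&workers[j]);      // runs on one of the OpenMP threads
    #endif
        }

        // Join only in the pthread case; the OpenMP parallel-for has an
        // implicit barrier, so all results are ready here either way.
        for (int j = 0; j < N_THREADS; j++) {
    #ifdef GGML_NO_OMP
            if (j > 0) { pthread_join(thrd[j], NULL); }
    #endif
            printf("worker %d -> %d\n", j, workers[j].result);
        }
        return 0;
    }

To compare the two paths, build the same file both ways, mirroring what the build-system changes select: gcc -fopenmp sketch.c for the OpenMP variant, and gcc -DGGML_NO_OMP sketch.c -lpthread for the fallback.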
