
Another threadpool: Avoid creating hundreds of threads in GGML #7342


Closed · wants to merge 2 commits
21 changes: 21 additions & 0 deletions CMakeLists.txt
@@ -122,6 +122,8 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"llama: metal minimum macOS version")
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_DISABLE_OMP "llama: disable OpenMP support" OFF)
option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
@@ -1211,6 +1213,25 @@ if (BUILD_SHARED_LIBS)
install(TARGETS ggml_shared LIBRARY)
endif()


if (NOT LLAMA_DISABLE_OMP)
    find_package(OpenMP)
    if (OpenMP_CXX_FOUND)
        message(STATUS "OpenMP found")
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
        target_link_libraries(ggml PUBLIC OpenMP::OpenMP_CXX)
    else()
        message(STATUS "OpenMP not found, falling back to standard threading")
        add_compile_definitions(GGML_NO_OMP)
    endif()
else()
    message(STATUS "OpenMP disabled, falling back to standard threading")
    add_compile_definitions(GGML_NO_OMP)
endif()
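For context, a minimal compile-time sketch (my illustration, not code from the PR; the helper name threading_backend is made up) of how a source file can branch on the GGML_NO_OMP define that this block sets when OpenMP is missing or disabled:

#ifndef GGML_NO_OMP
#include <omp.h>
#endif
#include <stdio.h>

/* hypothetical helper: report which threading backend this build uses */
static const char * threading_backend(void) {
#ifdef GGML_NO_OMP
    return "standard threads";
#else
    return "OpenMP";
#endif
}

int main(void) {
#ifndef GGML_NO_OMP
    /* when OpenMP is active, the runtime reports its default team size */
    printf("omp_get_max_threads() = %d\n", omp_get_max_threads());
#endif
    printf("threading backend: %s\n", threading_backend());
    return 0;
}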


# llama

add_library(llama
15 changes: 15 additions & 0 deletions Makefile
@@ -399,6 +399,21 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_MPI
MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual
OBJS += ggml-mpi.o
endif # LLAMA_MPI

ifndef LLAMA_NO_OMP
MK_CPPFLAGS += -fopenmp
MK_CFLAGS += -fopenmp
MK_LDFLAGS += -fopenmp
else
MK_CPPFLAGS += -DGGML_NO_OMP
endif # LLAMA_NO_OMP
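As an aside, a tiny sanity-check sketch (mine, not part of the PR): conforming OpenMP compilers define _OPENMP whenever OpenMP codegen is enabled, so a translation unit can verify that the -fopenmp and -DGGML_NO_OMP settings above stay in sync:

/* fails the build if the OpenMP path is selected (GGML_NO_OMP unset)
   but the compiler was not actually invoked with -fopenmp */
#if !defined(GGML_NO_OMP) && !defined(_OPENMP)
#error "OpenMP threading selected but -fopenmp is not in effect"
#endif

int main(void) { return 0; }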

ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
71 changes: 44 additions & 27 deletions ggml.c
@@ -19334,8 +19334,12 @@ typedef int ggml_lock_t;

#endif

#ifdef GGML_NO_OMP

// Android's libc implementation "bionic" does not support setting affinity
#if defined(__gnu_linux__)

static void set_numa_thread_affinity(int thread_n) {
if (!ggml_is_numa()) {
return;
@@ -19401,11 +19405,16 @@ static void clear_numa_thread_affinity(void) {

CPU_FREE(cpus);
}

#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
static void clear_numa_thread_affinity(void) {}

#endif // __gnu_linux__

#endif // GGML_NO_OMP
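The function bodies are collapsed in this diff view; for orientation, a simplified sketch of the underlying Linux call (the real ggml code allocates a dynamically sized CPU set, as the CPU_FREE(cpus) above shows, and selects CPUs per NUMA node):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/* pin the calling thread to a single CPU; bionic (Android's libc)
   lacks pthread_setaffinity_np, hence the __gnu_linux__ guard */
static int pin_current_thread(int cpu) {
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(cpu, &set);
    return pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &set);
}

int main(void) {
    return pin_current_thread(0); /* e.g. pin the main thread to CPU 0 */
}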

static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) {
@@ -19713,7 +19722,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

const int n_threads = state->shared->n_threads;

#ifdef GGML_NO_OMP
set_numa_thread_affinity(state->ith);
#endif // GGML_NO_OMP

int node_n = -1;
int task_phase = GGML_TASK_TYPE_FINALIZE;
@@ -20086,44 +20097,50 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
};
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);

// create thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; ++j) {
workers[j] = (struct ggml_compute_state) {
.thrd = 0,
.ith = j,
.shared = &state_shared,
.ec = GGML_STATUS_SUCCESS,
};
const int64_t perf_start_cycles = ggml_perf_cycles();
const int64_t perf_start_time_us = ggml_perf_time_us();

/* Loop is reversed because in the GGML_NO_OMP case we want the worker
   threads to start before the main thread (j == 0) runs its share */
#pragma omp parallel for shared(workers,state_shared)
Member:
Might need to add a num_threads(n_threads) here to make sure that omp always launches all the threads, otherwise it will deadlock.

@besnardjb (Author), May 20, 2024:
You are right, it is indeed fragile. I'm not even sure setting the thread count covers all cases (https://www.openmp.org/spec-html/5.0/openmpsu35.html#x55-880002.6.1).
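For concreteness, a self-contained sketch (not from the PR) of the suggested clause. num_threads(n_threads) asks the runtime for exactly n_threads workers, but as the linked spec section notes it is a request rather than a guarantee, so code that blocks until all n_threads workers check in can still deadlock under a thread limit. Compile with cc -fopenmp.

#include <omp.h>
#include <stdio.h>

int main(void) {
    enum { n_threads = 4 };
    int ran_on[n_threads] = {0};

    /* mirrors the PR's pragma, with the reviewer's suggested clause added */
    #pragma omp parallel for num_threads(n_threads) shared(ran_on)
    for (int j = n_threads - 1; 0 <= j; j--) {
        ran_on[j] = omp_get_thread_num(); /* which worker ran iteration j */
    }

    for (int j = 0; j < n_threads; j++) {
        printf("iteration %d ran on OpenMP thread %d\n", j, ran_on[j]);
    }
    return 0;
}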

for (int j = n_threads - 1; 0 <= j; j--) {
workers[j] = (struct ggml_compute_state) {
.ith = j,
.shared = &state_shared,
.ec = GGML_STATUS_SUCCESS,
};

#ifdef GGML_NO_OMP
if (j == 0) {
/* no need to spawn a thread for the main thread */
ggml_graph_compute_thread(&workers[j]);
} else {
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
GGML_ASSERT(rc == 0);
UNUSED(rc);
}
#else
ggml_graph_compute_thread(&workers[j]);
#endif
}

workers[0].ith = 0;
workers[0].shared = &state_shared;
workers[0].ec = GGML_STATUS_SUCCESS;

const int64_t perf_start_cycles = ggml_perf_cycles();
const int64_t perf_start_time_us = ggml_perf_time_us();
#ifdef GGML_NO_OMP
clear_numa_thread_affinity();
#endif

// this is a work thread too
ggml_graph_compute_thread(&workers[0]);
enum ggml_status compute_status = workers[0].ec;

// don't leave affinity set on the main thread
clear_numa_thread_affinity();

// join or kill thread pool
if (n_threads > 1) {
for (int j = 1; j < n_threads; j++) {
const int rc = ggml_thread_join(workers[j].thrd, NULL);
GGML_ASSERT(rc == 0);
if (workers[j].ec != GGML_STATUS_SUCCESS)
compute_status = workers[j].ec;
}
for (int j = 1; j < n_threads; j++) {
#ifdef GGML_NO_OMP
const int rc = ggml_thread_join(workers[j].thrd, NULL);
GGML_ASSERT(rc == 0);
#endif
if (workers[j].ec != GGML_STATUS_SUCCESS) {
compute_status = workers[j].ec;
}
}

// performance stats (graph)
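Taken together, the hunk produces two execution paths. A condensed standalone sketch (worker_fn stands in for ggml_graph_compute_thread; error handling omitted): under OpenMP the parallel-for supplies the threads and every iteration runs the worker inline; under GGML_NO_OMP the reversed loop spawns pthreads for j > 0 so workers start before the main thread takes j == 0, and only that path needs the join loop.

#include <stdio.h>
#ifdef GGML_NO_OMP
#include <pthread.h>
#endif

/* build either path:
     cc -fopenmp dualpath.c               (OpenMP)
     cc -DGGML_NO_OMP -pthread dualpath.c (pthread fallback) */

static void * worker_fn(void * arg) {
    printf("worker %d running\n", *(int *) arg);
    return NULL;
}

int main(void) {
    enum { n_threads = 4 };
    int ids[n_threads];
#ifdef GGML_NO_OMP
    pthread_t thrd[n_threads];
#endif

#ifndef GGML_NO_OMP
    #pragma omp parallel for
#endif
    for (int j = n_threads - 1; 0 <= j; j--) {
        ids[j] = j;
#ifdef GGML_NO_OMP
        if (j == 0) {
            worker_fn(&ids[j]);       /* main thread is a worker too */
        } else {
            pthread_create(&thrd[j], NULL, worker_fn, &ids[j]);
        }
#else
        worker_fn(&ids[j]);           /* OpenMP supplies this thread */
#endif
    }

#ifdef GGML_NO_OMP
    for (int j = 1; j < n_threads; j++) {
        pthread_join(thrd[j], NULL);  /* join only in the fallback path */
    }
#endif
    return 0;
}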