From 21e88c8b0ff2979f416c5b2e240c65acadcf5fb2 Mon Sep 17 00:00:00 2001
From: Vladimir
Date: Sat, 1 Apr 2023 20:16:36 +0200
Subject: [PATCH 01/14] run sanitizers in release, otherwise too slow (#5)

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 88e70e4959914..482fc64578586 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -326,7 +326,7 @@ jobs:
 #        sudo apt-get install cmake
 #
 #      - name: Configure
-#        run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
+#        run: cmake . -DCMAKE_BUILD_TYPE=RelWithDebInfo -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
 #
 #      - name: Build
 #        run: |

From a65d37ad36f8c5be36ccb8f0d1ddddfd67cab777 Mon Sep 17 00:00:00 2001
From: Vladimir
Date: Fri, 31 Mar 2023 21:03:48 +0200
Subject: [PATCH 02/14] using github Pithikos/C-Thread-Pool for threading

---
 CMakeLists.txt |   5 +-
 Makefile       |  17 +-
 ggml.c         | 234 +++++++++-------------
 thpool.c       | 553 +++++++++++++++++++++++++++++++++++++++++++++++++
 thpool.h       | 187 +++++++++++++++++
 5 files changed, 855 insertions(+), 141 deletions(-)
 create mode 100644 thpool.c
 create mode 100644 thpool.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 37f22700bc9aa..e3316f5b0133e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -227,7 +227,10 @@ endif()
 add_library(ggml OBJECT
             ggml.c
-            ggml.h)
+            ggml.h
+            thpool.c
+            thpool.h
+            )
 
 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump

diff --git a/Makefile b/Makefile
index 83a4514ef7177..dd0c9e45a2365 100644
--- a/Makefile
+++ b/Makefile
@@ -225,6 +225,9 @@ default: main quantize perplexity embedding
 # Build library
 #
 
+thpool.o: thpool.c thpool.h
+	$(CC)  $(CFLAGS)   -c thpool.c -o thpool.o
+
 ggml.o: ggml.c ggml.h
 	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
 
@@ -237,20 +240,20 @@ common.o: examples/common.cpp examples/common.h
 clean:
 	rm -vf *.o main quantize perplexity embedding
 
-main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+main: examples/main/main.cpp thpool.o ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp thpool.o ggml.o llama.o common.o -o main $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
 
-quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp thpool.o ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp thpool.o ggml.o llama.o -o quantize $(LDFLAGS)
 
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp thpool.o ggml.o llama.o common.o -o perplexity $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+embedding: examples/embedding/embedding.cpp thpool.o ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp thpool.o ggml.o llama.o common.o -o embedding $(LDFLAGS)
 
 #
 # Tests

diff --git a/ggml.c b/ggml.c
index 25fa726320df2..58e51159c195f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3,6 +3,8 @@
 
 #include "ggml.h"
 
+#include "thpool.h"
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
 #include <alloca.h>
@@ -72,6 +74,59 @@ static int sched_yield (void) {
     Sleep (0);
     return 0;
 }
+
+typedef struct pthread_mutex_tag {
+    CRITICAL_SECTION critical_section;
+} pthread_mutex_t;
+
+typedef struct pthread_mutexattr_tag {
+    int attr;
+} pthread_mutexattr_t;
+
+int pthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) {
+    InitializeCriticalSection (&mutex->critical_section);
+    return 0;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t * mutex) {
+    DeleteCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+
+int pthread_mutex_lock(pthread_mutex_t * mutex) {
+    EnterCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+int pthread_mutex_unlock(pthread_mutex_t * mutex) {
+    LeaveCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+typedef struct pthread_cond_tag {
+    CONDITION_VARIABLE cond;
+} pthread_cond_t;
+
+int pthread_cond_init(pthread_cond_t * cond, void * unused) {
+    InitializeConditionVariable (&cond->cond);
+    return 0;
+}
+
+int pthread_cond_destroy(pthread_cond_t * cond) {
+    return 0;
+}
+
+int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) {
+    SleepConditionVariableCS(&cond->cond, &mutex->critical_section, INFINITE);
+    return 0;
+}
+
+int pthread_cond_broadcast(pthread_cond_t * cond) {
+    WakeAllConditionVariable(&cond->cond);
+    return 0;
+}
+
 #else
 #include <pthread.h>
 #include <stdatomic.h>
@@ -2538,6 +2593,7 @@ struct ggml_context {
 
     struct ggml_scratch scratch;
     struct ggml_scratch scratch_save;
+    threadpool tpool;
 };
 
 struct ggml_context_container {
@@ -2822,6 +2878,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.objects_end   =*/ NULL,
         /*.scratch       =*/ { 0, 0, NULL, },
         /*.scratch_save  =*/ { 0, 0, NULL, },
+        /*.thpool        =*/ NULL,
     };
 
     GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
@@ -8954,6 +9011,19 @@ typedef pthread_t ggml_thread_t;
 
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
+typedef pthread_mutex_t ggml_mutex_t;
+typedef pthread_cond_t  ggml_cond_t;
+
+#define ggml_mutex_init     pthread_mutex_init
+#define ggml_mutex_destroy  pthread_mutex_destroy
+#define ggml_cond_init      pthread_cond_init
+#define ggml_cond_destroy   pthread_cond_destroy
+
+#define ggml_mutex_lock     pthread_mutex_lock
+#define ggml_mutex_unlock   pthread_mutex_unlock
+#define ggml_cond_broadcast pthread_cond_broadcast
+#define ggml_cond_wait      pthread_cond_wait
+
 #else
 
 //typedef pthread_spinlock_t ggml_lock_t;
@@ -8977,17 +9047,31 @@ typedef pthread_t ggml_thread_t;
 
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
+typedef pthread_mutex_t ggml_mutex_t;
+typedef pthread_cond_t  ggml_cond_t;
+
+#define ggml_mutex_init     pthread_mutex_init
+#define ggml_mutex_destroy  pthread_mutex_destroy
+#define ggml_cond_init      pthread_cond_init
+#define ggml_cond_destroy   pthread_cond_destroy
+
+#define ggml_mutex_lock     pthread_mutex_lock
+#define ggml_mutex_unlock   pthread_mutex_unlock
+#define ggml_cond_broadcast pthread_cond_broadcast
+#define ggml_cond_wait      pthread_cond_wait
+
 #endif
 
 struct ggml_compute_state_shared {
-    ggml_lock_t spin;
-
     int n_threads;
 
     // synchronization primitives
-    atomic_int  n_ready;
-    atomic_bool has_work;
-    atomic_bool stop; // stop all threads
+    int  n_ready;
+    bool has_work;
+    bool stop; // stop all threads
+    ggml_mutex_t mutex;
+    ggml_cond_t  cond;
 };
 
 struct ggml_compute_state {
@@ -8999,72 +9083,31 @@ struct ggml_compute_state {
 
     struct ggml_compute_state_shared * shared;
 };
 
-static thread_ret_t ggml_graph_compute_thread(void * data) {
+static void ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-
-    const int n_threads = state->shared->n_threads;
-
-    while (true) {
-        if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
-            atomic_store(&state->shared->has_work, false);
-        } else {
-            while (atomic_load(&state->shared->has_work)) {
-                if (atomic_load(&state->shared->stop)) {
-                    return 0;
-                }
-                ggml_lock_lock  (&state->shared->spin);
-                ggml_lock_unlock(&state->shared->spin);
-            }
-        }
-
-        atomic_fetch_sub(&state->shared->n_ready, 1);
-
-        // wait for work
-        while (!atomic_load(&state->shared->has_work)) {
-            if (atomic_load(&state->shared->stop)) {
-                return 0;
-            }
-            ggml_lock_lock  (&state->shared->spin);
-            ggml_lock_unlock(&state->shared->spin);
-        }
-
-        // check if we should stop
-        if (atomic_load(&state->shared->stop)) {
-            break;
-        }
-
-        if (state->node) {
-            if (state->params.ith < state->params.nth) {
-                ggml_compute_forward(&state->params, state->node);
-            }
-
-            state->node = NULL;
-        } else {
-            break;
+    if (state->node) {
+        if (state->params.ith < state->params.nth) {
+            ggml_compute_forward(&state->params, state->node);
         }
+        state->node = NULL;
     }
-
-    return 0;
 }
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
     const int n_threads = cgraph->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
-        /*.spin      =*/ GGML_LOCK_INITIALIZER,
         /*.n_threads =*/ n_threads,
         /*.n_ready   =*/ 0,
         /*.has_work  =*/ false,
         /*.stop      =*/ false,
+        /*.mutex     =*/ {0},
+        /*.cond      =*/ {0},
     };
 
     struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
 
     // create thread pool
     if (n_threads > 1) {
-        ggml_lock_init(&state_shared.spin);
-
-        atomic_store(&state_shared.has_work, true);
-
+        ctx->tpool = thpool_init(n_threads);
         for (int j = 0; j < n_threads - 1; j++) {
             workers[j] = (struct ggml_compute_state) {
                 .thrd   = 0,
@@ -9078,10 +9121,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 .node   = NULL,
                 .shared = &state_shared,
             };
-
-            int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
         }
     }
 
@@ -9319,15 +9358,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // COMPUTE
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
             // launch thread pool
             for (int j = 0; j < n_threads - 1; j++) {
                 workers[j].params = (struct ggml_compute_params) {
@@ -9338,16 +9368,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     .wdata = cgraph->work ? cgraph->work->data : NULL,
                 };
                 workers[j].node = node;
+                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]);
             }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_store(&state_shared.has_work, true);
         }
 
         params.type = GGML_TASK_COMPUTE;
@@ -9355,34 +9377,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // wait for thread pool
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
+            thpool_wait(ctx->tpool);
         }
 
         // FINALIZE
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
             // launch thread pool
             for (int j = 0; j < n_threads - 1; j++) {
                 workers[j].params = (struct ggml_compute_params) {
@@ -9393,16 +9392,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     .wdata = cgraph->work ? cgraph->work->data : NULL,
                 };
                 workers[j].node = node;
+                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]);
             }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_store(&state_shared.has_work, true);
         }
 
         params.type = GGML_TASK_FINALIZE;
@@ -9410,21 +9401,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // wait for thread pool
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
+            thpool_wait(ctx->tpool);
        }
 
         // performance stats (node)
@@ -9440,16 +9417,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
     // join thread pool
     if (n_threads > 1) {
-        atomic_store(&state_shared.stop, true);
-        atomic_store(&state_shared.has_work, true);
-
-        for (int j = 0; j < n_threads - 1; j++) {
-            int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-
-        ggml_lock_destroy(&state_shared.spin);
+        thpool_destroy(ctx->tpool);
     }
 
     // performance stats (graph)

diff --git a/thpool.c b/thpool.c
new file mode 100644
index 0000000000000..59e2e5556780d
--- /dev/null
+++ b/thpool.c
@@ -0,0 +1,553 @@
+/* ********************************
+ * Author:       Johan Hanssen Seferidis
+ * License:      MIT
+ * Description:  Library providing a threading pool where you can add
+ *               work. For usage, check the thpool.h file or README.md
+ *
+ *//** @file thpool.h *//*
+ *
+ ********************************/
+
+#if defined(__APPLE__)
+#include <AvailabilityMacros.h>
+#else
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif
+#endif
+#include <unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <errno.h>
+#include <time.h>
+#if defined(__linux__)
+#include <sys/prctl.h>
+#endif
+
+#include "thpool.h"
+
+#ifdef THPOOL_DEBUG
+#define THPOOL_DEBUG 1
+#else
+#define THPOOL_DEBUG 0
+#endif
+
+#if !defined(DISABLE_PRINT) || defined(THPOOL_DEBUG)
+#define err(str) fprintf(stderr, str)
+#else
+#define err(str)
+#endif
+
+static volatile int threads_keepalive;
+static volatile int threads_on_hold;
+
+
+
+/* ========================== STRUCTURES ============================ */
+
+
+/* Binary semaphore */
+typedef struct bsem {
+	pthread_mutex_t mutex;
+	pthread_cond_t  cond;
+	int v;
+} bsem;
+
+
+/* Job */
+typedef struct job{
+	struct job*  prev;                   /* pointer to previous job   */
+	void   (*function)(void* arg);       /* function pointer          */
+	void*  arg;                          /* function's argument       */
+} job;
+
+
+/* Job queue */
+typedef struct jobqueue{
+	pthread_mutex_t rwmutex;             /* used for queue r/w access */
+	job  *front;                         /* pointer to front of queue */
+	job  *rear;                          /* pointer to rear  of queue */
+	bsem *has_jobs;                      /* flag as binary semaphore  */
+	int   len;                           /* number of jobs in queue   */
+} jobqueue;
+
+
+/* Thread */
+typedef struct thread{
+	int       id;                        /* friendly id               */
+	pthread_t pthread;                   /* pointer to actual thread  */
+	struct thpool_* thpool_p;            /* access to thpool          */
+} thread;
+
+
+/* Threadpool */
+typedef struct thpool_{
+	thread**   threads;                  /* pointer to threads        */
+	volatile int num_threads_alive;      /* threads currently alive   */
+	volatile int num_threads_working;    /* threads currently working */
+	pthread_mutex_t  thcount_lock;       /* used for thread count etc */
+	pthread_cond_t  threads_all_idle;    /* signal to thpool_wait     */
+	jobqueue  jobqueue;                  /* job queue                 */
+} thpool_;
+
+
+
+
+
+/* ========================== PROTOTYPES ============================ */
+
+
+static int   thread_init(thpool_* thpool_p, struct thread** thread_p, int id);
+static void* thread_do(struct thread* thread_p);
+static void  thread_hold(int sig_id);
+static void  thread_destroy(struct thread* thread_p);
+
+static int   jobqueue_init(jobqueue* jobqueue_p);
+static void  jobqueue_clear(jobqueue* jobqueue_p);
+static void  jobqueue_push(jobqueue* jobqueue_p, struct job* newjob_p);
+static struct job* jobqueue_pull(jobqueue* jobqueue_p);
+static void  jobqueue_destroy(jobqueue* jobqueue_p);
+
+static void  bsem_init(struct bsem *bsem_p, int value);
+static void  bsem_reset(struct bsem *bsem_p);
+static void  bsem_post(struct bsem *bsem_p);
+static void  bsem_post_all(struct bsem *bsem_p);
+static void  bsem_wait(struct bsem *bsem_p);
+
+
+
+
+
+/* ========================== THREADPOOL ============================ */
+
+
+/* Initialise thread pool */
+struct thpool_* thpool_init(int num_threads){
+
+	threads_on_hold   = 0;
+	threads_keepalive = 1;
+
+	if (num_threads < 0){
+		num_threads = 0;
+	}
+
+	/* Make new thread pool */
+	thpool_* thpool_p;
+	thpool_p = (struct thpool_*)malloc(sizeof(struct thpool_));
+	if (thpool_p == NULL){
+		err("thpool_init(): Could not allocate memory for thread pool\n");
+		return NULL;
+	}
+	thpool_p->num_threads_alive   = 0;
+	thpool_p->num_threads_working = 0;
+
+	/* Initialise the job queue */
+	if (jobqueue_init(&thpool_p->jobqueue) == -1){
+		err("thpool_init(): Could not allocate memory for job queue\n");
+		free(thpool_p);
+		return NULL;
+	}
+
+	/* Make threads in pool */
+	thpool_p->threads = (struct thread**)malloc(num_threads * sizeof(struct thread *));
+	if (thpool_p->threads == NULL){
+		err("thpool_init(): Could not allocate memory for threads\n");
+		jobqueue_destroy(&thpool_p->jobqueue);
+		free(thpool_p);
+		return NULL;
+	}
+
+	pthread_mutex_init(&(thpool_p->thcount_lock), NULL);
+	pthread_cond_init(&thpool_p->threads_all_idle, NULL);
+
+	/* Thread init */
+	int n;
+	for (n=0; n<num_threads; n++){
+		thread_init(thpool_p, &thpool_p->threads[n], n);
+#if THPOOL_DEBUG
+		printf("THPOOL_DEBUG: Created thread %d in pool \n", n);
+#endif
+	}
+
+	/* Wait for threads to initialize */
+	while (thpool_p->num_threads_alive != num_threads) {}
+
+	return thpool_p;
+}
+
+
+/* Add work to the thread pool */
+int thpool_add_work(thpool_* thpool_p, void (*function_p)(void*), void* arg_p){
+	job* newjob;
+
+	newjob=(struct job*)malloc(sizeof(struct job));
+	if (newjob==NULL){
+		err("thpool_add_work(): Could not allocate memory for new job\n");
+		return -1;
+	}
+
+	/* add function and argument */
+	newjob->function=function_p;
+	newjob->arg=arg_p;
+
+	/* add job to queue */
+	jobqueue_push(&thpool_p->jobqueue, newjob);
+
+	return 0;
+}
+
+
+/* Wait until all jobs have finished */
+void thpool_wait(thpool_* thpool_p){
+	pthread_mutex_lock(&thpool_p->thcount_lock);
+	while (thpool_p->jobqueue.len || thpool_p->num_threads_working) {
+		pthread_cond_wait(&thpool_p->threads_all_idle, &thpool_p->thcount_lock);
+	}
+	pthread_mutex_unlock(&thpool_p->thcount_lock);
+}
+
+
+/* Destroy the threadpool */
+void thpool_destroy(thpool_* thpool_p){
+	/* No need to destroy if it's NULL */
+	if (thpool_p == NULL) return ;
+
+	volatile int threads_total = thpool_p->num_threads_alive;
+
+	/* End each thread's infinite loop */
+	threads_keepalive = 0;
+
+	/* Give one second to kill idle threads */
+	double TIMEOUT = 1.0;
+	time_t start, end;
+	double tpassed = 0.0;
+	time (&start);
+	while (tpassed < TIMEOUT && thpool_p->num_threads_alive){
+		bsem_post_all(thpool_p->jobqueue.has_jobs);
+		time (&end);
+		tpassed = difftime(end,start);
+	}
+
+	/* Poll remaining threads */
+	while (thpool_p->num_threads_alive){
+		bsem_post_all(thpool_p->jobqueue.has_jobs);
+		sleep(1);
+	}
+
+	/* Job queue cleanup */
+	jobqueue_destroy(&thpool_p->jobqueue);
+	/* Deallocs */
+	int n;
+	for (n=0; n < threads_total; n++){
+		thread_destroy(thpool_p->threads[n]);
+	}
+	free(thpool_p->threads);
+	free(thpool_p);
+}
+
+
+/* Pause all threads in threadpool */
+void thpool_pause(thpool_* thpool_p) {
+	int n;
+	for (n=0; n < thpool_p->num_threads_alive; n++){
+		pthread_kill(thpool_p->threads[n]->pthread, SIGUSR1);
+	}
+}
+
+
+/* Resume all threads in threadpool */
+void thpool_resume(thpool_* thpool_p) {
+    // resuming a single threadpool hasn't been
+    // implemented yet, meanwhile this suppresses
+    // the warnings
+    (void)thpool_p;
+
+	threads_on_hold = 0;
+}
+
+
+int thpool_num_threads_working(thpool_* thpool_p){
+	return thpool_p->num_threads_working;
+}
+
+
+
+
+
+/* ============================ THREAD ============================== */
+
+
+/* Initialize a thread in the thread pool
+ *
+ * @param thread        address to the pointer of the thread to be created
+ * @param id            id to be given to the thread
+ * @return 0 on success, -1 otherwise.
+ */
+static int thread_init (thpool_* thpool_p, struct thread** thread_p, int id){
+
+	*thread_p = (struct thread*)malloc(sizeof(struct thread));
+	if (*thread_p == NULL){
+		err("thread_init(): Could not allocate memory for thread\n");
+		return -1;
+	}
+
+	(*thread_p)->thpool_p = thpool_p;
+	(*thread_p)->id       = id;
+
+	pthread_create(&(*thread_p)->pthread, NULL, (void * (*)(void *)) thread_do, (*thread_p));
+	pthread_detach((*thread_p)->pthread);
+	return 0;
+}
+
+
+/* Sets the calling thread on hold */
+static void thread_hold(int sig_id) {
+	(void)sig_id;
+	threads_on_hold = 1;
+	while (threads_on_hold){
+		sleep(1);
+	}
+}
+
+
+/* What each thread is doing
+*
+* In principle this is an endless loop. The only time this loop gets interrupted is once
+* thpool_destroy() is invoked or the program exits.
+*
+* @param  thread        thread that will run this function
+* @return nothing
+*/
+static void* thread_do(struct thread* thread_p){
+
+	/* Set thread name for profiling and debugging */
+	char thread_name[16] = {0};
+	snprintf(thread_name, 16, "thpool-%d", thread_p->id);
+
+#if defined(__linux__)
+	/* Use prctl instead to prevent using _GNU_SOURCE flag and implicit declaration */
+	prctl(PR_SET_NAME, thread_name);
+#elif defined(__APPLE__) && defined(__MACH__)
+	pthread_setname_np(thread_name);
+#else
+	err("thread_do(): pthread_setname_np is not supported on this system");
+#endif
+
+	/* Assure all threads have been created before starting serving */
+	thpool_* thpool_p = thread_p->thpool_p;
+
+	/* Register signal handler */
+	struct sigaction act;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	act.sa_handler = thread_hold;
+	if (sigaction(SIGUSR1, &act, NULL) == -1) {
+		err("thread_do(): cannot handle SIGUSR1");
+	}
+
+	/* Mark thread as alive (initialized) */
+	pthread_mutex_lock(&thpool_p->thcount_lock);
+	thpool_p->num_threads_alive += 1;
+	pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+	while(threads_keepalive){
+
+		bsem_wait(thpool_p->jobqueue.has_jobs);
+
+		if (threads_keepalive){
+
+			pthread_mutex_lock(&thpool_p->thcount_lock);
+			thpool_p->num_threads_working++;
+			pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+			/* Read job from queue and execute it */
+			void (*func_buff)(void*);
+			void*  arg_buff;
+			job* job_p = jobqueue_pull(&thpool_p->jobqueue);
+			if (job_p) {
+				func_buff = job_p->function;
+				arg_buff  = job_p->arg;
+				func_buff(arg_buff);
+				free(job_p);
+			}
+
+			pthread_mutex_lock(&thpool_p->thcount_lock);
+			thpool_p->num_threads_working--;
+			if (!thpool_p->num_threads_working) {
+				pthread_cond_signal(&thpool_p->threads_all_idle);
+			}
+			pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+		}
+	}
+	pthread_mutex_lock(&thpool_p->thcount_lock);
+	thpool_p->num_threads_alive --;
+	pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+	return NULL;
+}
+
+
+/* Frees a thread */
+static void thread_destroy (thread* thread_p){
+	free(thread_p);
+}
+
+
+
+
+
+/* ============================ JOB QUEUE =========================== */
+
+
+/* Initialize queue */
+static int jobqueue_init(jobqueue* jobqueue_p){
+	jobqueue_p->len = 0;
+	jobqueue_p->front = NULL;
+	jobqueue_p->rear  = NULL;
+
+	jobqueue_p->has_jobs = (struct bsem*)malloc(sizeof(struct bsem));
+	if (jobqueue_p->has_jobs == NULL){
+		return -1;
+	}
+
+	pthread_mutex_init(&(jobqueue_p->rwmutex), NULL);
+	bsem_init(jobqueue_p->has_jobs, 0);
+
+	return 0;
+}
+
+
+/* Clear the queue */
+static void jobqueue_clear(jobqueue* jobqueue_p){
+
+	while(jobqueue_p->len){
+		free(jobqueue_pull(jobqueue_p));
+	}
+
+	jobqueue_p->front = NULL;
+	jobqueue_p->rear  = NULL;
+	bsem_reset(jobqueue_p->has_jobs);
+	jobqueue_p->len = 0;
+
+}
+
+
+/* Add (allocated) job to queue
+ */
+static void jobqueue_push(jobqueue* jobqueue_p, struct job* newjob){
+
+	pthread_mutex_lock(&jobqueue_p->rwmutex);
+	newjob->prev = NULL;
+
+	switch(jobqueue_p->len){
+
+		case 0:  /* if no jobs in queue */
+					jobqueue_p->front = newjob;
+					jobqueue_p->rear  = newjob;
+					break;
+
+		default: /* if jobs in queue */
+					jobqueue_p->rear->prev = newjob;
+					jobqueue_p->rear = newjob;
+
+	}
+	jobqueue_p->len++;
+
+	bsem_post(jobqueue_p->has_jobs);
+	pthread_mutex_unlock(&jobqueue_p->rwmutex);
+}
+
+
+/* Get first job from queue (removes it from queue)
+ * Notice: Caller MUST hold a mutex
+ */
+static struct job* jobqueue_pull(jobqueue* jobqueue_p){
+
+	pthread_mutex_lock(&jobqueue_p->rwmutex);
+	job* job_p = jobqueue_p->front;
+
+	switch(jobqueue_p->len){
+
+		case 0:  /* if no jobs in queue */
+					break;
+
+		case 1:  /* if one job in queue */
+					jobqueue_p->front = NULL;
+					jobqueue_p->rear  = NULL;
+					jobqueue_p->len = 0;
+					break;
+
+		default: /* if >1 jobs in queue */
+					jobqueue_p->front = job_p->prev;
+					jobqueue_p->len--;
+					/* more than one job in queue -> post it */
+					bsem_post(jobqueue_p->has_jobs);
+
+	}
+
+	pthread_mutex_unlock(&jobqueue_p->rwmutex);
+	return job_p;
+}
+
+
+/* Free all queue resources back to the system */
+static void jobqueue_destroy(jobqueue* jobqueue_p){
+	jobqueue_clear(jobqueue_p);
+	free(jobqueue_p->has_jobs);
+}
+
+
+
+
+
+/* ======================== SYNCHRONISATION ========================= */
+
+
+/* Init semaphore to 1 or 0 */
+static void bsem_init(bsem *bsem_p, int value) {
+	if (value < 0 || value > 1) {
+		err("bsem_init(): Binary semaphore can take only values 1 or 0");
+		exit(1);
+	}
+	pthread_mutex_init(&(bsem_p->mutex), NULL);
+	pthread_cond_init(&(bsem_p->cond), NULL);
+	bsem_p->v = value;
+}
+
+
+/* Reset semaphore to 0 */
+static void bsem_reset(bsem *bsem_p) {
+	bsem_init(bsem_p, 0);
+}
+
+
+/* Post to at least one thread */
+static void bsem_post(bsem *bsem_p) {
+	pthread_mutex_lock(&bsem_p->mutex);
+	bsem_p->v = 1;
+	pthread_cond_signal(&bsem_p->cond);
+	pthread_mutex_unlock(&bsem_p->mutex);
+}
+
+
+/* Post to all threads */
+static void bsem_post_all(bsem *bsem_p) {
+	pthread_mutex_lock(&bsem_p->mutex);
+	bsem_p->v = 1;
+	pthread_cond_broadcast(&bsem_p->cond);
+	pthread_mutex_unlock(&bsem_p->mutex);
+}
+
+
+/* Wait on semaphore until semaphore has value 0 */
+static void bsem_wait(bsem* bsem_p) {
+	pthread_mutex_lock(&bsem_p->mutex);
+	while (bsem_p->v != 1) {
+		pthread_cond_wait(&bsem_p->cond, &bsem_p->mutex);
+	}
+	bsem_p->v = 0;
+	pthread_mutex_unlock(&bsem_p->mutex);
+}

diff --git a/thpool.h b/thpool.h
new file mode 100644
index 0000000000000..af3e68d165fb0
--- /dev/null
+++ b/thpool.h
@@ -0,0 +1,187 @@
+/**********************************
+ * @author      Johan Hanssen Seferidis
+ * License:     MIT
+ *
+ **********************************/
+
+#ifndef _THPOOL_
+#define _THPOOL_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* =================================== API ======================================= */
+
+
+typedef struct thpool_* threadpool;
+
+
+/**
+ * @brief  Initialize threadpool
+ *
+ * Initializes a threadpool. This function will not return until all
+ * threads have initialized successfully.
+ *
+ * @example
+ *
+ *    ..
+ *    threadpool thpool;                     //First we declare a threadpool
+ *    thpool = thpool_init(4);               //then we initialize it to 4 threads
+ *    ..
+ *
+ * @param  num_threads   number of threads to be created in the threadpool
+ * @return threadpool    created threadpool on success,
+ *                       NULL on error
+ */
+threadpool thpool_init(int num_threads);
+
+
+/**
+ * @brief Add work to the job queue
+ *
+ * Takes an action and its argument and adds it to the threadpool's job queue.
+ * If you want to add a function with more than one argument, a way to
+ * implement this is by passing a pointer to a structure.
+ *
+ * NOTICE: You have to cast both the function and argument to not get warnings.
+ *
+ * @example
+ *
+ *    void print_num(int num){
+ *       printf("%d\n", num);
+ *    }
+ *
+ *    int main() {
+ *       ..
+ *       int a = 10;
+ *       thpool_add_work(thpool, (void*)print_num, (void*)a);
+ *       ..
+ *    }
+ *
+ * @param  threadpool    threadpool to which the work will be added
+ * @param  function_p    pointer to function to add as work
+ * @param  arg_p         pointer to an argument
+ * @return 0 on success, -1 otherwise.
+ */
+int thpool_add_work(threadpool, void (*function_p)(void*), void* arg_p);
+
+
+/**
+ * @brief Wait for all queued jobs to finish
+ *
+ * Will wait for all jobs - both queued and currently running to finish.
+ * Once the queue is empty and all work has completed, the calling thread
+ * (probably the main program) will continue.
+ *
+ * Smart polling is used in wait. The polling is initially 0 - meaning that
+ * there is virtually no polling at all. If after 1 second the threads
+ * haven't finished, the polling interval starts growing exponentially
+ * until it reaches max_secs seconds. Then it jumps down to a maximum polling
+ * interval assuming that heavy processing is being used in the threadpool.
+ *
+ * @example
+ *
+ *    ..
+ *    threadpool thpool = thpool_init(4);
+ *    ..
+ *    // Add a bunch of work
+ *    ..
+ *    thpool_wait(thpool);
+ *    puts("All added work has finished");
+ *    ..
+ *
+ * @param threadpool     the threadpool to wait for
+ * @return nothing
+ */
+void thpool_wait(threadpool);
+
+
+/**
+ * @brief Pauses all threads immediately
+ *
+ * The threads will be paused no matter if they are idle or working.
+ * The threads return to their previous states once thpool_resume
+ * is called.
+ *
+ * While the thread is being paused, new work can be added.
+ *
+ * @example
+ *
+ *    threadpool thpool = thpool_init(4);
+ *    thpool_pause(thpool);
+ *    ..
+ *    // Add a bunch of work
+ *    ..
+ *    thpool_resume(thpool); // Let the threads start their magic
+ *
+ * @param threadpool    the threadpool where the threads should be paused
+ * @return nothing
+ */
+void thpool_pause(threadpool);
+
+
+/**
+ * @brief Unpauses all threads if they are paused
+ *
+ * @example
+ *    ..
+ *    thpool_pause(thpool);
+ *    sleep(10);              // Delay execution 10 seconds
+ *    thpool_resume(thpool);
+ *    ..
+ *
+ * @param threadpool     the threadpool where the threads should be unpaused
+ * @return nothing
+ */
+void thpool_resume(threadpool);
+
+
+/**
+ * @brief Destroy the threadpool
+ *
+ * This will wait for the currently active threads to finish and then 'kill'
+ * the whole threadpool to free up memory.
+ *
+ * @example
+ *    int main() {
+ *       threadpool thpool1 = thpool_init(2);
+ *       threadpool thpool2 = thpool_init(2);
+ *       ..
+ *       thpool_destroy(thpool1);
+ *       ..
+ *       return 0;
+ *    }
+ *
+ * @param threadpool     the threadpool to destroy
+ * @return nothing
+ */
+void thpool_destroy(threadpool);
+
+
+/**
+ * @brief Show currently working threads
+ *
+ * Working threads are the threads that are performing work (not idle).
+ *
+ * @example
+ *    int main() {
+ *       threadpool thpool1 = thpool_init(2);
+ *       threadpool thpool2 = thpool_init(2);
+ *       ..
+ *       printf("Working threads: %d\n", thpool_num_threads_working(thpool1));
+ *       ..
+ *       return 0;
+ *    }
+ *
+ * @param threadpool     the threadpool of interest
+ * @return integer       number of threads working
+ */
+int thpool_num_threads_working(threadpool);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
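A minimal, standalone sketch of how the vendored pool is driven, pieced together from the thpool.h documentation above; the worker function and the task count are invented for illustration, and it assumes a build along the lines of `cc example.c thpool.c -pthread`:

    #include <stdio.h>
    #include "thpool.h"

    /* toy task: report which slot this job carried (illustrative only) */
    static void work_fn(void * arg) {
        printf("task %d done\n", (int)(size_t) arg);
    }

    int main(void) {
        threadpool pool = thpool_init(4);             /* spin up 4 workers        */
        for (size_t i = 0; i < 8; i++) {
            thpool_add_work(pool, work_fn, (void *) i);
        }
        thpool_wait(pool);                            /* block until queue drains */
        thpool_destroy(pool);
        return 0;
    }

This is exactly the lifecycle ggml_graph_compute adopts in the ggml.c hunks of this patch: init once, push per-node work items, wait at each synchronization point, destroy at the end.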
From d3bc4df97db470d4e719bab387bb546bc5dabc70 Mon Sep 17 00:00:00 2001
From: Vladimir
Date: Sat, 1 Apr 2023 19:45:33 +0200
Subject: [PATCH 03/14] fix windows build

---
 ggml.c   |  89 -----------------------------------------
 thpool.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 105 insertions(+), 102 deletions(-)

diff --git a/ggml.c b/ggml.c
index 58e51159c195f..1c3e28d37c4ec 100644
--- a/ggml.c
+++ b/ggml.c
@@ -52,81 +52,11 @@ static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
 
-typedef HANDLE pthread_t;
-
-typedef DWORD thread_ret_t;
-static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
-    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
-    if (handle == NULL)
-    {
-        return EAGAIN;
-    }
-
-    *out = handle;
-    return 0;
-}
-
-static int pthread_join(pthread_t thread, void* unused) {
-    return (int) WaitForSingleObject(thread, INFINITE);
-}
-
 static int sched_yield (void) {
     Sleep (0);
     return 0;
 }
 
-typedef struct pthread_mutex_tag {
-    CRITICAL_SECTION critical_section;
-} pthread_mutex_t;
-
-typedef struct pthread_mutexattr_tag {
-    int attr;
-} pthread_mutexattr_t;
-
-int pthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) {
-    InitializeCriticalSection (&mutex->critical_section);
-    return 0;
-}
-
-int pthread_mutex_destroy(pthread_mutex_t * mutex) {
-    DeleteCriticalSection(&mutex->critical_section);
-    return 0;
-}
-
-
-int pthread_mutex_lock(pthread_mutex_t * mutex) {
-    EnterCriticalSection(&mutex->critical_section);
-    return 0;
-}
-
-int pthread_mutex_unlock(pthread_mutex_t * mutex) {
-    LeaveCriticalSection(&mutex->critical_section);
-    return 0;
-}
-
-typedef struct pthread_cond_tag {
-    CONDITION_VARIABLE cond;
-} pthread_cond_t;
-
-int pthread_cond_init(pthread_cond_t * cond, void * unused) {
-    InitializeConditionVariable (&cond->cond);
-    return 0;
-}
-
-int pthread_cond_destroy(pthread_cond_t * cond) {
-    return 0;
-}
-
-int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) {
-    SleepConditionVariableCS(&cond->cond, &mutex->critical_section, INFINITE);
-    return 0;
-}
-
-int pthread_cond_broadcast(pthread_cond_t * cond) {
-    WakeAllConditionVariable(&cond->cond);
-    return 0;
-}
-
 #else
 #include <pthread.h>
 #include <stdatomic.h>
@@ -9042,14 +8972,10 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
-typedef pthread_mutex_t ggml_mutex_t;
-typedef pthread_cond_t  ggml_cond_t;
-
 #define ggml_mutex_init     pthread_mutex_init
 #define ggml_mutex_destroy  pthread_mutex_destroy
 #define ggml_cond_init      pthread_cond_init
@@ -9063,19 +8989,10 @@ typedef pthread_cond_t ggml_cond_t;
 
 #endif
 
 struct ggml_compute_state_shared {
     int n_threads;
-
-    // synchronization primitives
-    int  n_ready;
-    bool has_work;
-    bool stop; // stop all threads
-    ggml_mutex_t mutex;
-    ggml_cond_t  cond;
 };
 
 struct ggml_compute_state {
-    ggml_thread_t thrd;
-
     struct ggml_compute_params params;
 
     struct ggml_tensor * node;
@@ -9097,11 +9014,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     const int n_threads = cgraph->n_threads;
     struct ggml_compute_state_shared state_shared = {
         /*.n_threads =*/ n_threads,
-        /*.n_ready   =*/ 0,
-        /*.has_work  =*/ false,
-        /*.stop      =*/ false,
-        /*.mutex     =*/ {0},
-        /*.cond      =*/ {0},
     };
     struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
 
@@ -9110,7 +9022,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         ctx->tpool = thpool_init(n_threads);
         for (int j = 0; j < n_threads - 1; j++) {
             workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
                 .params = {
                     .type = GGML_TASK_COMPUTE,
                     .ith  = j + 1,

diff --git a/thpool.c b/thpool.c
index 59e2e5556780d..76cf5fc3fc97f 100644
--- a/thpool.c
+++ b/thpool.c
@@ -15,11 +15,111 @@
 #define _POSIX_C_SOURCE 200809L
 #endif
 #endif
+/*
+this is not part of original thpool, that's me hacking it to work on windows
+*/
+#if defined _MSC_VER || defined(__MINGW32__)
+#if !defined(__MINGW32__)
+#include <Windows.h>
+#else
+// ref: https://github.com/ggerganov/whisper.cpp/issues/168
+#include <windows.h>
+#endif
+
+unsigned int sleep(unsigned int seconds) {
+    Sleep(seconds * 1000);
+    return 0;
+}
+
+typedef HANDLE pthread_t;
+
+typedef DWORD thread_ret_t;
+static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
+    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
+    if (handle == NULL)
+    {
+        return EAGAIN;
+    }
+
+    *out = handle;
+    return 0;
+}
+
+static int pthread_join(pthread_t thread, void* unused) {
+    return (int) WaitForSingleObject(thread, INFINITE);
+}
+
+static int pthread_detach(pthread_t thread) {
+    CloseHandle(thread);
+    return 0;
+}
+
+typedef struct pthread_mutex_tag {
+    CRITICAL_SECTION critical_section;
+} pthread_mutex_t;
+
+typedef struct pthread_mutexattr_tag {
+    int attr;
+} pthread_mutexattr_t;
+
+int pthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) {
+    InitializeCriticalSection (&mutex->critical_section);
+    return 0;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t * mutex) {
+    DeleteCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+
+int pthread_mutex_lock(pthread_mutex_t * mutex) {
+    EnterCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+int pthread_mutex_unlock(pthread_mutex_t * mutex) {
+    LeaveCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+typedef struct pthread_cond_tag {
+    CONDITION_VARIABLE cond;
+} pthread_cond_t;
+
+int pthread_cond_init(pthread_cond_t * cond, void * unused) {
+    InitializeConditionVariable (&cond->cond);
+    return 0;
+}
+
+int pthread_cond_destroy(pthread_cond_t * cond) {
+    return 0;
+}
+
+int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) {
+    SleepConditionVariableCS(&cond->cond, &mutex->critical_section, INFINITE);
+    return 0;
+}
+
+int pthread_cond_broadcast(pthread_cond_t * cond) {
+    WakeAllConditionVariable(&cond->cond);
+    return 0;
+}
+
+int pthread_cond_signal(pthread_cond_t * cond) {
+    WakeConditionVariable(&cond->cond);
+    return 0;
+}
+#else
+#include <pthread.h>
+#include <unistd.h>
+#endif
+
-#include <unistd.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <pthread.h>
+
+
 #include <errno.h>
 #include <time.h>
 #if defined(__linux__)
@@ -247,16 +347,6 @@ void thpool_destroy(thpool_* thpool_p){
 	free(thpool_p);
 }
 
-
-/* Pause all threads in threadpool */
-void thpool_pause(thpool_* thpool_p) {
-	int n;
-	for (n=0; n < thpool_p->num_threads_alive; n++){
-		pthread_kill(thpool_p->threads[n]->pthread, SIGUSR1);
-	}
-}
-
-
 /* Resume all threads in threadpool */
 void thpool_resume(thpool_* thpool_p) {
     // resuming a single threadpool hasn't been
@@ -332,20 +422,22 @@ static void* thread_do(struct thread* thread_p){
 #elif defined(__APPLE__) && defined(__MACH__)
 	pthread_setname_np(thread_name);
 #else
-	err("thread_do(): pthread_setname_np is not supported on this system");
+	// err("thread_do(): pthread_setname_np is not supported on this system");
 #endif
 
 	/* Assure all threads have been created before starting serving */
 	thpool_* thpool_p = thread_p->thpool_p;
 
 	/* Register signal handler */
+	/*
+	///// HACK
 	struct sigaction act;
 	sigemptyset(&act.sa_mask);
 	act.sa_flags = 0;
 	act.sa_handler = thread_hold;
 	if (sigaction(SIGUSR1, &act, NULL) == -1) {
 		err("thread_do(): cannot handle SIGUSR1");
-	}
+	}*/
 
 	/* Mark thread as alive (initialized) */
 	pthread_mutex_lock(&thpool_p->thcount_lock);
 	thpool_p->num_threads_alive += 1;
 	pthread_mutex_unlock(&thpool_p->thcount_lock);

From 997c749065bad8f93bfcc92e8220f0c8acbba633 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sat, 1 Apr 2023 16:32:14 +0800
Subject: [PATCH 04/14] Add detection code for avx

---
 CMakeLists.txt | 108 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a434f07bba0c..8d68ad183bdf0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,114 @@ option(LLAMA_OPENBLAS "llama: use OpenBLAS"
 option(LLAMA_BUILD_TESTS    "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 
+INCLUDE(CheckCSourceRuns)
+
+SET(AVX_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 a;
+        a = _mm256_set1_ps(0);
+        return 0;
+    }
+")
+
+SET(AVX512_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0);
+        __m512i b = a;
+        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
+        return 0;
+    }
+")
+
+SET(AVX2_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256i a = {0};
+        a = _mm256_abs_epi16(a);
+        __m256i x;
+        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
+        return 0;
+    }
+")
+
+SET(FMA_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 acc = _mm256_setzero_ps();
+        const __m256 d = _mm256_setzero_ps();
+        const __m256 p = _mm256_setzero_ps();
+        acc = _mm256_fmadd_ps( d, p, acc );
+        return 0;
+    }
+")
+
+MACRO(CHECK_SSE type flags)
+    SET(__FLAG_I 1)
+    SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+    FOREACH(__FLAG ${flags})
+        IF(NOT ${type}_FOUND)
+            SET(CMAKE_REQUIRED_FLAGS ${__FLAG})
+            CHECK_C_SOURCE_RUNS("${${type}_CODE}" HAS_${type}_${__FLAG_I})
+            IF(HAS_${type}_${__FLAG_I})
+                SET(${type}_FOUND TRUE CACHE BOOL "${type} support")
+                SET(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
+            ENDIF()
+            MATH(EXPR __FLAG_I "${__FLAG_I}+1")
+        ENDIF()
+    ENDFOREACH()
+    SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+    IF(NOT ${type}_FOUND)
+        SET(${type}_FOUND FALSE CACHE BOOL "${type} support")
+        SET(${type}_FLAGS "" CACHE STRING "${type} flags")
+    ENDIF()
+
+    MARK_AS_ADVANCED(${type}_FOUND ${type}_FLAGS)
+
+ENDMACRO()
+
+CHECK_SSE("AVX" " ;-mavx;/arch:AVX")
+CHECK_SSE("AVX2" " ;-mavx2 -mfma;/arch:AVX2")
+CHECK_SSE("AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512")
+CHECK_SSE("FMA" " ;-mfma;")
+
+IF(${AVX_FOUND})
+    set(LLAMA_AVX ON)
+ELSE()
+    set(LLAMA_AVX OFF)
+ENDIF()
+
+IF (${FMA_FOUND})
+    set(LLAMA_FMA ON)
+ELSE()
+    set(LLAMA_FMA OFF)
+ENDIF()
+
+IF(${AVX2_FOUND})
+    set(LLAMA_AVX2 ON)
+ELSE()
+    set(LLAMA_AVX2 OFF)
+ENDIF()
+
+IF(${AVX512_FOUND})
+    set(LLAMA_AVX512 ON)
+ELSE()
+    set(LLAMA_AVX512 OFF)
+ENDIF()
+
 #
 # Compile flags
 #

From 5ad9e9531f218dda251c42dde3850ed6d01f5cbc Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Fri, 7 Apr 2023 21:04:47 +0800
Subject: [PATCH 05/14] Only check hardware when option is ON

---
 CMakeLists.txt | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d68ad183bdf0..7a73d72c49e06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,33 +143,32 @@ MACRO(CHECK_SSE type flags)
 
 ENDMACRO()
 
-CHECK_SSE("AVX" " ;-mavx;/arch:AVX")
-CHECK_SSE("AVX2" " ;-mavx2 -mfma;/arch:AVX2")
-CHECK_SSE("AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512")
-CHECK_SSE("FMA" " ;-mfma;")
-
-IF(${AVX_FOUND})
-    set(LLAMA_AVX ON)
-ELSE()
-    set(LLAMA_AVX OFF)
+IF(${LLAMA_AVX})
+    CHECK_SSE("AVX" " ;-mavx;/arch:AVX")
+    IF(NOT ${AVX_FOUND})
+        set(LLAMA_AVX OFF)
+    ENDIF()
 ENDIF()
 
-IF (${FMA_FOUND})
-    set(LLAMA_FMA ON)
-ELSE()
-    set(LLAMA_FMA OFF)
+IF(${LLAMA_AVX2})
+    CHECK_SSE("AVX2" " ;-mavx2 -mfma;/arch:AVX2")
+    IF(NOT ${AVX2_FOUND})
+        set(LLAMA_AVX2 OFF)
+    ENDIF()
 ENDIF()
 
-IF(${AVX2_FOUND})
-    set(LLAMA_AVX2 ON)
-ELSE()
-    set(LLAMA_AVX2 OFF)
+IF(${LLAMA_AVX512})
+    CHECK_SSE("AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512")
+    IF(NOT ${AVX512_FOUND})
+        set(LLAMA_AVX512 OFF)
+    ENDIF()
 ENDIF()
 
-IF(${AVX512_FOUND})
-    set(LLAMA_AVX512 ON)
-ELSE()
-    set(LLAMA_AVX512 OFF)
+IF(${LLAMA_FMA})
+    CHECK_SSE("FMA" " ;-mfma;")
+    IF (NOT ${FMA_FOUND})
+        set(LLAMA_FMA OFF)
+    ENDIF()
 ENDIF()
 
 #

From c640d2a4bd98666a9f182cdca2d5fd5331f3d818 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Fri, 7 Apr 2023 22:24:14 +0800
Subject: [PATCH 06/14] Remove finalizer

---
 ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index d3d216c611bc8..0808d7ff2bbea 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9550,7 +9550,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         if (node->n_tasks > 1) {
             thpool_wait(ctx->tpool);
         }
-
+#if 0
         // FINALIZE
         if (node->n_tasks > 1) {
             // launch thread pool
@@ -9574,7 +9574,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         if (node->n_tasks > 1) {
             thpool_wait(ctx->tpool);
        }
-
+#endif
         // performance stats (node)
         {
             int64_t perf_cycles_cur  = ggml_perf_cycles() - perf_node_start_cycles;
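For orientation before the next patches reshuffle it: ggml drives every graph node through up to three phases, per the ggml_task_type enum seen in the diffs, INIT (single-threaded preparation), COMPUTE (split across node->n_tasks threads), and FINALIZE (reduction; the #if 0 above simply stops dispatching it). A toy model of that phase dispatch, with an invented node struct and op name rather than ggml's real types:

    #include <stdio.h>

    enum task_type { TASK_INIT, TASK_COMPUTE, TASK_FINALIZE };

    /* stand-in for a graph node; ggml's real ggml_tensor is far larger */
    struct node { const char * name; int n_tasks; };

    static void forward(enum task_type type, const struct node * n) {
        switch (type) {
        case TASK_INIT:     printf("%s: prepare, single thread\n", n->name);              break;
        case TASK_COMPUTE:  printf("%s: main work on %d threads\n", n->name, n->n_tasks); break;
        case TASK_FINALIZE: printf("%s: reduce partial results\n", n->name);              break;
        }
    }

    int main(void) {
        struct node n = { "mul_mat", 4 };
        forward(TASK_INIT, &n);
        forward(TASK_COMPUTE, &n);
        forward(TASK_FINALIZE, &n);   /* the phase patch 06 turns off */
        return 0;
    }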
From 43dde039b05ee8a2c60366a4092856496d3bb4e1 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Fri, 7 Apr 2023 23:51:46 +0800
Subject: [PATCH 07/14] Run second operator when possible

---
 ggml.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index 0808d7ff2bbea..945f907cd7cb8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2725,9 +2725,9 @@ struct ggml_context_container {
 //
 
 enum ggml_task_type {
-    GGML_TASK_INIT = 0,
-    GGML_TASK_COMPUTE,
-    GGML_TASK_FINALIZE,
+    GGML_TASK_INIT     = 1,
+    GGML_TASK_COMPUTE  = 2,
+    GGML_TASK_FINALIZE = 4,
 };
 
 struct ggml_compute_params {
@@ -9262,9 +9262,20 @@ struct ggml_compute_state {
 
 static void ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    int type = state->params.type;
     if (state->node) {
         if (state->params.ith < state->params.nth) {
-            ggml_compute_forward(&state->params, state->node);
+            if (type & GGML_TASK_INIT)
+            {
+                state->params.type = GGML_TASK_INIT;
+                ggml_compute_forward(&state->params, state->node);
+            }
+
+            if (type & GGML_TASK_COMPUTE)
+            {
+                state->params.type = GGML_TASK_COMPUTE;
+                ggml_compute_forward(&state->params, state->node);
+            }
         }
         state->node = NULL;
     }
@@ -9527,6 +9538,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         ggml_compute_forward(&params, node);
 
+        int next_task = 0;
+
         // COMPUTE
         if (node->n_tasks > 1) {
             // launch thread pool
@@ -9542,12 +9555,34 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]);
             }
         }
+        else
+        {
+            int start = i;
+            int end = i;
+            if (i + 1 < cgraph->n_nodes)
+            {
+                struct ggml_tensor * next = cgraph->nodes[i + 1];
+                if (next->src0 != node && next->src1 != node && next->n_tasks == 1)
+                {
+                    workers[next_task].params = (struct ggml_compute_params) {
+                        .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
+                        .ith   = 0,
+                        .nth   = 1,
+                        .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+                        .wdata = cgraph->work ? cgraph->work->data : NULL,
+                    };
+
+                    thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
+                    next_task++;
+                }
+            }
+        }
 
         params.type = GGML_TASK_COMPUTE;
         ggml_compute_forward(&params, node);
 
         // wait for thread pool
-        if (node->n_tasks > 1) {
+        if (node->n_tasks > 1 || next_task != 0) {
             thpool_wait(ctx->tpool);
         }
 #if 0

From 455f6f79bc733a5fb57fc8c3d0e64572b51623f5 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sat, 8 Apr 2023 00:34:05 +0800
Subject: [PATCH 08/14] Try find other single threaded operator to run

---
 ggml.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index 945f907cd7cb8..788023d28536e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9557,8 +9557,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         else
         {
-            int start = i;
-            int end = i;
             if (i + 1 < cgraph->n_nodes)
             {
                 struct ggml_tensor * next = cgraph->nodes[i + 1];
@@ -9568,14 +9566,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
                     .ith   = 0,
                     .nth   = 1,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
+                    .wsize = 0,
+                    .wdata = NULL,
                 };
-
+                workers[next_task].node = next;
                 thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
                 next_task++;
+
+                if (i + 2 < cgraph->n_nodes)
+                {
+                    struct ggml_tensor * prev = cgraph->nodes[i + 1];
+                    struct ggml_tensor * next = cgraph->nodes[i + 2];
+                    if (next->src0 != node && next->src1 != node && next->n_tasks == 1 &&
+                        next->src0 != prev && next->src1 != prev
+                       )
+                    {
+                        workers[next_task].params = (struct ggml_compute_params) {
+                            .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
+                            .ith   = 0,
+                            .nth   = 1,
+                            .wsize = 0,
+                            .wdata = NULL,
+                        };
+                        workers[next_task].node = next;
+                        thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
+                        next_task++;
+                    }
+                }
             }
         }
+
+

From 921296c0d588b933320646bf9513a5be13c749ee Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sat, 8 Apr 2023 00:47:19 +0800
Subject: [PATCH 09/14] avoid malloc/free in critical path

---
 ggml.c   | 11 ++++++-----
 thpool.c | 23 ++---------------------
 thpool.h |  8 +++++++-
 3 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/ggml.c b/ggml.c
index 788023d28536e..93f034d8ff8f7 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2731,6 +2731,7 @@ enum ggml_task_type {
 };
 
 struct ggml_compute_params {
+    job newjob;
     enum ggml_task_type type;
 
     int ith, nth;
@@ -9529,11 +9530,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // INIT
         struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_INIT,
-            /*.ith   =*/ 0,
-            /*.nth   =*/ node->n_tasks,
-            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+            .type  = GGML_TASK_INIT,
+            .ith   = 0,
+            .nth   = node->n_tasks,
+            .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+            .wdata = cgraph->work ? cgraph->work->data : NULL,
         };
 
         ggml_compute_forward(&params, node);

diff --git a/thpool.c b/thpool.c
index 76cf5fc3fc97f..6c9a40e329139 100644
--- a/thpool.c
+++ b/thpool.c
@@ -156,14 +156,6 @@ typedef struct bsem {
 } bsem;
 
 
-/* Job */
-typedef struct job{
-	struct job*  prev;                   /* pointer to previous job   */
-	void   (*function)(void* arg);       /* function pointer          */
-	void*  arg;                          /* function's argument       */
-} job;
-
-
 /* Job queue */
 typedef struct jobqueue{
 	pthread_mutex_t rwmutex;             /* used for queue r/w access */
@@ -279,18 +271,8 @@ struct thpool_* thpool_init(int num_threads){
 
 
 /* Add work to the thread pool */
-int thpool_add_work(thpool_* thpool_p, void (*function_p)(void*), void* arg_p){
-	job* newjob;
-
-	newjob=(struct job*)malloc(sizeof(struct job));
-	if (newjob==NULL){
-		err("thpool_add_work(): Could not allocate memory for new job\n");
-		return -1;
-	}
-
-	/* add function and argument */
+int thpool_add_work(thpool_* thpool_p, void (*function_p)(void*), job * newjob){
 	newjob->function=function_p;
-	newjob->arg=arg_p;
 
 	/* add job to queue */
 	jobqueue_push(&thpool_p->jobqueue, newjob);
 
 	return 0;
 }
@@ -460,9 +442,8 @@ static void* thread_do(struct thread* thread_p){
 			job* job_p = jobqueue_pull(&thpool_p->jobqueue);
 			if (job_p) {
 				func_buff = job_p->function;
-				arg_buff  = job_p->arg;
+				arg_buff  = job_p;
 				func_buff(arg_buff);
-				free(job_p);
 			}
 
 			pthread_mutex_lock(&thpool_p->thcount_lock);

diff --git a/thpool.h b/thpool.h
index af3e68d165fb0..fafbda5d17220 100644
--- a/thpool.h
+++ b/thpool.h
@@ -16,6 +16,12 @@ extern "C" {
 
 typedef struct thpool_* threadpool;
 
+/* Job */
+typedef struct job{
+	struct job*  prev;             /* pointer to previous job   */
+	void   (*function)(void* arg); /* function pointer          */
+} job;
+
 
 /**
  * @brief  Initialize threadpool
@@ -64,7 +70,7 @@ threadpool thpool_init(int num_threads);
  * @param  arg_p         pointer to an argument
  * @return 0 on success, -1 otherwise.
  */
-int thpool_add_work(threadpool, void (*function_p)(void*), void* arg_p);
+int thpool_add_work(threadpool, void (*function_p)(void*), job *newjob);
 
 
 /**
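The trick in the patch above: the queue's job header is now embedded as the first member of the caller-owned argument block, so enqueueing allocates nothing and the job pointer the worker pulls back doubles as the argument pointer. The same pattern in isolation, with invented names (a sketch, not ggml's actual structs):

    #include <stddef.h>
    #include <stdio.h>

    /* mirrors thpool.h's job: intrusive queue link plus callback */
    typedef struct job {
        struct job * prev;
        void (*function)(void * arg);
    } job;

    /* caller-owned state embeds the job as its FIRST member, so a
       job* and a state* are the same address - no per-job malloc */
    struct state {
        job header;    /* must stay first */
        int payload;
    };
    _Static_assert(offsetof(struct state, header) == 0, "job must be first");

    static void run(void * arg) {
        struct state * s = (struct state *) arg;   /* job* == state* */
        printf("payload = %d\n", s->payload);
    }

    int main(void) {
        struct state s = { .payload = 42 };
        s.header.function = run;
        /* what thread_do now does after jobqueue_pull: pass the job
           pointer itself as the argument */
        s.header.function(&s.header);
        return 0;
    }

This is why ggml.c puts `job newjob;` first in ggml_compute_params and can hand `&workers[j]` straight to thpool_add_work.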
From 3b03df5c05b0cabbf923d7ee3bf89ceca9a17b12 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sat, 8 Apr 2023 19:55:29 +0800
Subject: [PATCH 10/14] look forward more

---
 ggml.c | 80 +++++++++++++++++++++++++++-------------------------------
 1 file changed, 37 insertions(+), 43 deletions(-)

diff --git a/ggml.c b/ggml.c
index 93f034d8ff8f7..d5a190f346858 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9249,16 +9249,10 @@ typedef int ggml_lock_t;
 
 #endif
 
-struct ggml_compute_state_shared {
-    int n_threads;
-};
-
 struct ggml_compute_state {
     struct ggml_compute_params params;
 
     struct ggml_tensor * node;
-
-    struct ggml_compute_state_shared * shared;
 };
 
 static void ggml_graph_compute_thread(void * data) {
@@ -9284,9 +9278,6 @@ static void ggml_graph_compute_thread(void * data) {
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
     const int n_threads = cgraph->n_threads;
-    struct ggml_compute_state_shared state_shared = {
-        /*.n_threads =*/ n_threads,
-    };
     struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
 
     // create thread pool
@@ -9302,7 +9293,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     .wdata = cgraph->work ? cgraph->work->data : NULL,
                 },
                 .node   = NULL,
-                .shared = &state_shared,
             };
         }
     }
@@ -9520,6 +9510,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         struct ggml_tensor * node = cgraph->nodes[i];
 
+        if (node->n_tasks == 0)
+        {
+            // no work need to be done.
+            continue;
+        }
         // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
         //if (node->grad == NULL && node->perf_runs > 0) {
         //    continue;
@@ -9558,46 +9553,45 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         }
         else
         {
-            if (i + 1 < cgraph->n_nodes)
+            int start = i;
+            int end = i + 1;
+            while (end < cgraph->n_nodes && next_task < n_threads && (end - start) < n_threads * 2)
             {
-                struct ggml_tensor * next = cgraph->nodes[i + 1];
-                if (next->src0 != node && next->src1 != node && next->n_tasks == 1)
+                struct ggml_tensor * next = cgraph->nodes[end];
+                end++;
+
+                if (next->n_tasks != 1)
+                    continue;
+
+                // check src dependency
+                bool is_dep = false;
+                for (int k = start; k < end; k++)
                 {
-                    workers[next_task].params = (struct ggml_compute_params) {
-                        .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
-                        .ith   = 0,
-                        .nth   = 1,
-                        .wsize = 0,
-                        .wdata = NULL,
-                    };
-                    workers[next_task].node = next;
-                    thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
-                    next_task++;
-
-                    if (i + 2 < cgraph->n_nodes)
+                    struct ggml_tensor * node = cgraph->nodes[k];
+                    if (next->src0 == node || next->src1 == node)
                     {
-                        struct ggml_tensor * prev = cgraph->nodes[i + 1];
-                        struct ggml_tensor * next = cgraph->nodes[i + 2];
-                        if (next->src0 != node && next->src1 != node && next->n_tasks == 1 &&
-                            next->src0 != prev && next->src1 != prev
-                           )
-                        {
-                            workers[next_task].params = (struct ggml_compute_params) {
-                                .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
-                                .ith   = 0,
-                                .nth   = 1,
-                                .wsize = 0,
-                                .wdata = NULL,
-                            };
-                            workers[next_task].node = next;
-                            thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
-                            next_task++;
-                        }
+                        is_dep = true;
+                        break;
                     }
                 }
+                if (is_dep)
+                    continue;
 
+                workers[next_task].params = (struct ggml_compute_params) {
+                    .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
+                    .ith   = 0,
+                    .nth   = 1,
+                    .wsize = 0,
+                    .wdata = NULL,
+                };
+                workers[next_task].node = next;
+
+                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
+                next->n_tasks = 0; // indicate this node is calculated
+                next_task++;
+                //printf("Combine task [%d, %d]\n", start, end);
             }
         }
 
         params.type = GGML_TASK_COMPUTE;

From 2035a3cc29223aacf329a7dd6fe10537a9414290 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sun, 9 Apr 2023 22:11:24 +0800
Subject: [PATCH 11/14] avoid to change ggml_task_type

---
 ggml.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/ggml.c b/ggml.c
index d5a190f346858..c33ad2ca95ef8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2725,9 +2725,9 @@ struct ggml_context_container {
 //
 
 enum ggml_task_type {
-    GGML_TASK_INIT     = 1,
-    GGML_TASK_COMPUTE  = 2,
-    GGML_TASK_FINALIZE = 4,
+    GGML_TASK_INIT = 0,
+    GGML_TASK_COMPUTE,
+    GGML_TASK_FINALIZE,
 };
 
 struct ggml_compute_params {
@@ -9260,13 +9260,14 @@ static void ggml_graph_compute_thread(void * data) {
     int type = state->params.type;
     if (state->node) {
         if (state->params.ith < state->params.nth) {
-            if (type & GGML_TASK_INIT)
+            if (type == GGML_TASK_INIT)
             {
                 state->params.type = GGML_TASK_INIT;
                 ggml_compute_forward(&state->params, state->node);
+                type = GGML_TASK_COMPUTE;
             }
 
-            if (type & GGML_TASK_COMPUTE)
+            if (type == GGML_TASK_COMPUTE)
             {
                 state->params.type = GGML_TASK_COMPUTE;
                 ggml_compute_forward(&state->params, state->node);
@@ -9579,7 +9580,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                 workers[next_task].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
+                    .type  = GGML_TASK_INIT,
                     .ith   = 0,
                     .nth   = 1,
                     .wsize = 0,

From 6f2a61eb4ff433ca413d66b39ce297bb9ba9cbc0 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Mon, 10 Apr 2023 22:24:27 +0800
Subject: [PATCH 12/14] Rework scheduling algorithm.

---
 ggml.c | 119 +++++++++++++++++++++++++++------------------------------
 1 file changed, 56 insertions(+), 63 deletions(-)

diff --git a/ggml.c b/ggml.c
index c33ad2ca95ef8..cebf43d577584 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9279,23 +9279,22 @@ static void ggml_graph_compute_thread(void * data) {
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
     const int n_threads = cgraph->n_threads;
-    struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
+    const int max_requests = n_threads * 5;
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*(max_requests));
 
     // create thread pool
-    if (n_threads > 1) {
-        ctx->tpool = thpool_init(n_threads);
-        for (int j = 0; j < n_threads - 1; j++) {
-            workers[j] = (struct ggml_compute_state) {
-                .params = {
-                    .type  = GGML_TASK_COMPUTE,
-                    .ith   = j + 1,
-                    .nth   = n_threads,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                },
-                .node   = NULL,
-            };
-        }
+    ctx->tpool = thpool_init(n_threads);
+    for (int j = 0; j < n_threads - 1; j++) {
+        workers[j] = (struct ggml_compute_state) {
+            .params = {
+                .type  = GGML_TASK_COMPUTE,
+                .ith   = j + 1,
+                .nth   = n_threads,
+                .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+                .wdata = cgraph->work ? cgraph->work->data : NULL,
+            },
+            .node   = NULL,
+        };
+    }
 
     // initialize tasks + work buffer
@@ -9504,6 +9504,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
     const int64_t perf_start_cycles  = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
+    const size_t wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0;
+    const void* wdata  = cgraph->work ? cgraph->work->data : NULL;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
@@ -9524,52 +9525,31 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         const int64_t perf_node_start_cycles  = ggml_perf_cycles();
         const int64_t perf_node_start_time_us = ggml_perf_time_us();
 
-        // INIT
-        struct ggml_compute_params params = {
-            .type  = GGML_TASK_INIT,
-            .ith   = 0,
-            .nth   = node->n_tasks,
-            .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-            .wdata = cgraph->work ? cgraph->work->data : NULL,
-        };
-
-        ggml_compute_forward(&params, node);
-
         int next_task = 0;
 
         // COMPUTE
-        if (node->n_tasks > 1) {
-            // launch thread pool
-            for (int j = 0; j < n_threads - 1; j++) {
-                workers[j].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_COMPUTE,
-                    .ith   = j + 1,
-                    .nth   = node->n_tasks,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                };
-                workers[j].node = node;
-                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]);
-            }
-        }
-        else
         {
             int start = i;
             int end = i;
-            while (end < cgraph->n_nodes && next_task < n_threads && (end - start) < n_threads * 2)
+            while (end < cgraph->n_nodes && (end - start) < n_threads * 2)
             {
                 struct ggml_tensor * next = cgraph->nodes[end];
                 end++;
 
-                if (next->n_tasks != 1)
-                    continue;
+                // already scheduled
+                if (next->n_tasks == 0)
+                    continue;
+
+                // skip if there are not enough request slots left
+                if (next_task + next->n_tasks > max_requests)
+                    continue;
 
                 // check src dependency
                 bool is_dep = false;
                 for (int k = start; k < end; k++)
                 {
-                    struct ggml_tensor * node = cgraph->nodes[k];
-                    if (next->src0 == node || next->src1 == node)
+                    struct ggml_tensor * prev = cgraph->nodes[k];
+                    if (next->src0 == prev || next->src1 == prev)
                     {
                         is_dep = true;
                         break;
                     }
                 }
                 if (is_dep)
                     continue;
 
-                workers[next_task].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_INIT,
-                    .ith   = 0,
-                    .nth   = 1,
-                    .wsize = 0,
-                    .wdata = NULL,
-                };
-                workers[next_task].node = next;
+                if (next->n_tasks > 1)
+                {
+                    // run INIT in main thread if it is multi thread operator
+                    struct ggml_compute_params params = {
+                        .type  = GGML_TASK_INIT,
+                        .ith   = 0,
+                        .nth   = next->n_tasks,
+                        .wsize = wsize,
+                        .wdata = wdata,
+                    };
+
+                    ggml_compute_forward(&params, next);
+                }
 
-                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
-                next->n_tasks = 0; // indicate this node is calculated
-                next_task++;
-                //printf("Combine task [%d, %d]\n", start, end);
+                for (int j = 0; j < next->n_tasks; j++) {
+                    workers[next_task].params = (struct ggml_compute_params){
+                        // single thread operator runs INIT in worker thread
+                        .type  = next->n_tasks == 1 ? GGML_TASK_INIT : GGML_TASK_COMPUTE,
+                        .ith   = j,
+                        .nth   = next->n_tasks,
+
+                        // TODO: Potential race on wdata
+                        .wsize = wsize,
+                        .wdata = wdata,
+                    };
+                    workers[next_task].node = next;
+
+                    thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
+                    next_task++;
+                }
+                next->n_tasks = 0; // indicate this node is calculated
             }
         }
 
-        params.type = GGML_TASK_COMPUTE;
-        ggml_compute_forward(&params, node);
-
         // wait for thread pool
-        if (node->n_tasks > 1 || next_task != 0) {
-            thpool_wait(ctx->tpool);
-        }
+        thpool_wait(ctx->tpool);
 #if 0
         // FINALIZE
         if (node->n_tasks > 1) {
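Concretely, the reworked scheduler scans a window of upcoming nodes, batches every node whose src0/src1 do not point back into the window scanned so far, and leaves dependent nodes for a later iteration of the outer loop after thpool_wait. A self-contained miniature of that dependency scan (toy node struct and window size, not ggml's real types):

    #include <stdbool.h>
    #include <stdio.h>

    /* toy node: two input edges, like ggml's src0/src1 */
    struct tnode { const char * name; struct tnode * src0, * src1; };

    int main(void) {
        struct tnode a = { "A", NULL, NULL };
        struct tnode b = { "B", NULL, NULL };   /* independent of A     */
        struct tnode c = { "C", &a,   NULL };   /* consumes A: deferred */
        struct tnode * nodes[] = { &a, &b, &c };
        const int n_nodes = 3, window = 4;

        int start = 0, end = start;
        while (end < n_nodes && (end - start) < window) {
            struct tnode * next = nodes[end];
            end++;

            /* same scan as the patch: compare against everything in the window so far */
            bool is_dep = false;
            for (int k = start; k < end; k++) {
                if (next->src0 == nodes[k] || next->src1 == nodes[k]) {
                    is_dep = true;
                    break;
                }
            }
            printf("%s: %s\n", next->name, is_dep ? "deferred" : "batched");
        }
        return 0;   /* prints: A: batched, B: batched, C: deferred */
    }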
From 6d18c6ea3e0097594a67df43bd9bf7dd05c07df3 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Mon, 10 Apr 2023 22:37:10 +0800
Subject: [PATCH 13/14] Fix the number of forward looking nodes

---
 ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index cebf43d577584..3004e9932ace9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9531,7 +9531,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         {
             int start = i;
             int end = i;
-            while (end < cgraph->n_nodes && (end - start) < n_threads * 2)
+            while (end < cgraph->n_nodes && (end - start) < 48)
             {
                 struct ggml_tensor * next = cgraph->nodes[end];
                 end++;

From 94ddd6204ce6e0c0f51adc50e9c3f8211f4038d2 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Mon, 10 Apr 2023 22:37:37 +0800
Subject: [PATCH 14/14] Simplify the logic of scheduling

---
 ggml.c | 37 ++++++++++---------------------------
 1 file changed, 10 insertions(+), 27 deletions(-)

diff --git a/ggml.c b/ggml.c
index 3004e9932ace9..c8a14964576ba 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9258,22 +9258,17 @@ struct ggml_compute_state {
 
 static void ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
     int type = state->params.type;
-    if (state->node) {
-        if (state->params.ith < state->params.nth) {
-            if (type == GGML_TASK_INIT)
-            {
-                state->params.type = GGML_TASK_INIT;
-                ggml_compute_forward(&state->params, state->node);
-                type = GGML_TASK_COMPUTE;
-            }
+    if (type == GGML_TASK_INIT)
+    {
+        state->params.type = GGML_TASK_INIT;
+        ggml_compute_forward(&state->params, state->node);
+        type = GGML_TASK_COMPUTE;
+    }
 
-            if (type == GGML_TASK_COMPUTE)
-            {
-                state->params.type = GGML_TASK_COMPUTE;
-                ggml_compute_forward(&state->params, state->node);
-            }
-        }
-        state->node = NULL;
+    if (type == GGML_TASK_COMPUTE)
+    {
+        state->params.type = GGML_TASK_COMPUTE;
+        ggml_compute_forward(&state->params, state->node);
     }
 }
 
@@ -9284,18 +9279,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
     // create thread pool
     ctx->tpool = thpool_init(n_threads);
-    for (int j = 0; j < n_threads - 1; j++) {
-        workers[j] = (struct ggml_compute_state) {
-            .params = {
-                .type  = GGML_TASK_COMPUTE,
-                .ith   = j + 1,
-                .nth   = n_threads,
-                .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                .wdata = cgraph->work ? cgraph->work->data : NULL,
-            },
-            .node   = NULL,
-        };
-    }
 
     // initialize tasks + work buffer
     {