From 21e88c8b0ff2979f416c5b2e240c65acadcf5fb2 Mon Sep 17 00:00:00 2001
From: Vladimir
Date: Sat, 1 Apr 2023 20:16:36 +0200
Subject: [PATCH 01/14] run sanitizers in release, otherwise too slow (#5)

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 88e70e4959914..482fc64578586 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -326,7 +326,7 @@ jobs:
 #        sudo apt-get install cmake
 #
 #      - name: Configure
-#        run: cmake . -DCMAKE_BUILD_TYPE=Debug -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
+#        run: cmake . -DCMAKE_BUILD_TYPE=RelWithDebInfo -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON
 #
 #      - name: Build
 #        run: |

From a65d37ad36f8c5be36ccb8f0d1ddddfd67cab777 Mon Sep 17 00:00:00 2001
From: Vladimir
Date: Fri, 31 Mar 2023 21:03:48 +0200
Subject: [PATCH 02/14] using github Pithikos/C-Thread-Pool for threading

---
 CMakeLists.txt |   5 +-
 Makefile       |  17 +-
 ggml.c         | 234 +++++++++-------------
 thpool.c       | 553 +++++++++++++++++++++++++++++++++++++++++++++++++
 thpool.h       | 187 +++++++++++++++++
 5 files changed, 855 insertions(+), 141 deletions(-)
 create mode 100644 thpool.c
 create mode 100644 thpool.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 37f22700bc9aa..e3316f5b0133e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -227,7 +227,10 @@ endif()
 add_library(ggml OBJECT
             ggml.c
-            ggml.h)
+            ggml.h
+            thpool.c
+            thpool.h
+            )
 
 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump

diff --git a/Makefile b/Makefile
index 83a4514ef7177..dd0c9e45a2365 100644
--- a/Makefile
+++ b/Makefile
@@ -225,6 +225,9 @@ default: main quantize perplexity embedding
 # Build library
 #
 
+thpool.o: thpool.c thpool.h
+	$(CC)  $(CFLAGS)   -c thpool.c -o thpool.o
+
 ggml.o: ggml.c ggml.h
 	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
 
@@ -237,20 +240,20 @@ common.o: examples/common.cpp examples/common.h
 clean:
 	rm -vf *.o main quantize perplexity embedding
 
-main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+main: examples/main/main.cpp thpool.o ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp thpool.o ggml.o llama.o common.o -o main $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
 
-quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp thpool.o ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp thpool.o ggml.o llama.o -o quantize $(LDFLAGS)
 
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp thpool.o ggml.o llama.o common.o -o perplexity $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+embedding: examples/embedding/embedding.cpp thpool.o ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp thpool.o ggml.o llama.o common.o -o embedding $(LDFLAGS)
 
 #
 # Tests

diff --git a/ggml.c b/ggml.c
index 25fa726320df2..58e51159c195f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3,6 +3,8 @@
 
 #include "ggml.h"
 
+#include "thpool.h"
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
 #include <alloca.h>
@@ -72,6 +74,59 @@ static int sched_yield (void) {
     Sleep (0);
     return 0;
 }
+
+typedef struct pthread_mutex_tag {
+    CRITICAL_SECTION critical_section;
+} pthread_mutex_t;
+
+typedef struct pthread_mutexattr_tag {
+    int attr;
+} pthread_mutexattr_t;
+
+int pthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) {
+    InitializeCriticalSection (&mutex->critical_section);
+    return 0;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t * mutex) {
+    DeleteCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+
+int pthread_mutex_lock(pthread_mutex_t * mutex) {
+    EnterCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+int pthread_mutex_unlock(pthread_mutex_t * mutex) {
+    LeaveCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+typedef struct pthread_cond_tag {
+    CONDITION_VARIABLE cond;
+} pthread_cond_t;
+
+int pthread_cond_init(pthread_cond_t * cond, void * unused) {
+    InitializeConditionVariable (&cond->cond);
+    return 0;
+}
+
+int pthread_cond_destroy(pthread_cond_t * cond) {
+    return 0;
+}
+
+int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) {
+    SleepConditionVariableCS(&cond->cond, &mutex->critical_section, INFINITE);
+    return 0;
+}
+
+int pthread_cond_broadcast(pthread_cond_t * cond) {
+    WakeAllConditionVariable(&cond->cond);
+    return 0;
+}
+
 #else
 #include <pthread.h>
 #include <stdatomic.h>
@@ -2538,6 +2593,7 @@ struct ggml_context {
 
     struct ggml_scratch scratch;
     struct ggml_scratch scratch_save;
+    threadpool tpool;
 };
 
 struct ggml_context_container {
@@ -2822,6 +2878,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.objects_end   =*/ NULL,
         /*.scratch       =*/ { 0, 0, NULL, },
         /*.scratch_save  =*/ { 0, 0, NULL, },
+        /*.thpool        =*/ NULL,
     };
 
     GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
@@ -8954,6 +9011,19 @@ typedef pthread_t ggml_thread_t;
 
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
+typedef pthread_mutex_t ggml_mutex_t;
+typedef pthread_cond_t  ggml_cond_t;
+
+#define ggml_mutex_init     pthread_mutex_init
+#define ggml_mutex_destroy  pthread_mutex_destroy
+#define ggml_cond_init      pthread_cond_init
+#define ggml_cond_destroy   pthread_cond_destroy
+
+#define ggml_mutex_lock     pthread_mutex_lock
+#define ggml_mutex_unlock   pthread_mutex_unlock
+#define ggml_cond_broadcast pthread_cond_broadcast
+#define ggml_cond_wait      pthread_cond_wait
+
 #else
 
 //typedef pthread_spinlock_t ggml_lock_t;
@@ -8977,17 +9047,31 @@ typedef pthread_t ggml_thread_t;
 
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
+typedef pthread_mutex_t ggml_mutex_t;
+typedef pthread_cond_t  ggml_cond_t;
+
+#define ggml_mutex_init     pthread_mutex_init
+#define ggml_mutex_destroy  pthread_mutex_destroy
+#define ggml_cond_init      pthread_cond_init
+#define ggml_cond_destroy   pthread_cond_destroy
+
+#define ggml_mutex_lock     pthread_mutex_lock
+#define ggml_mutex_unlock   pthread_mutex_unlock
+#define ggml_cond_broadcast pthread_cond_broadcast
+#define ggml_cond_wait      pthread_cond_wait
+
 #endif
 
 struct ggml_compute_state_shared {
-    ggml_lock_t spin;
-
     int n_threads;
 
     // synchronization primitives
-    atomic_int  n_ready;
-    atomic_bool has_work;
-    atomic_bool stop; // stop all threads
+    int  n_ready;
+    bool has_work;
+    bool stop; // stop all threads
+    ggml_mutex_t mutex;
+    ggml_cond_t  cond;
 };
 
 struct ggml_compute_state {
@@ -8999,72 +9083,31 @@ struct ggml_compute_state {
 
     struct ggml_compute_state_shared * shared;
 };
 
-static thread_ret_t ggml_graph_compute_thread(void * data) {
+static void ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-
-    const int n_threads = state->shared->n_threads;
-
-    while (true) {
-        if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
-            atomic_store(&state->shared->has_work, false);
-        } else {
-            while (atomic_load(&state->shared->has_work)) {
-                if (atomic_load(&state->shared->stop)) {
-                    return 0;
-                }
-                ggml_lock_lock  (&state->shared->spin);
-                ggml_lock_unlock(&state->shared->spin);
-            }
-        }
-
-        atomic_fetch_sub(&state->shared->n_ready, 1);
-
-        // wait for work
-        while (!atomic_load(&state->shared->has_work)) {
-            if (atomic_load(&state->shared->stop)) {
-                return 0;
-            }
-            ggml_lock_lock  (&state->shared->spin);
-            ggml_lock_unlock(&state->shared->spin);
-        }
-
-        // check if we should stop
-        if (atomic_load(&state->shared->stop)) {
-            break;
-        }
-
-        if (state->node) {
-            if (state->params.ith < state->params.nth) {
-                ggml_compute_forward(&state->params, state->node);
-            }
-
-            state->node = NULL;
-        } else {
-            break;
+    if (state->node) {
+        if (state->params.ith < state->params.nth) {
+            ggml_compute_forward(&state->params, state->node);
         }
+        state->node = NULL;
     }
-
-    return 0;
 }
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
     const int n_threads = cgraph->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
-        /*.spin      =*/ GGML_LOCK_INITIALIZER,
         /*.n_threads =*/ n_threads,
         /*.n_ready   =*/ 0,
         /*.has_work  =*/ false,
         /*.stop      =*/ false,
+        /*.mutex     =*/ {0},
+        /*.cond      =*/ {0},
     };
 
     struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
 
     // create thread pool
     if (n_threads > 1) {
-        ggml_lock_init(&state_shared.spin);
-
-        atomic_store(&state_shared.has_work, true);
-
+        ctx->tpool = thpool_init(n_threads);
         for (int j = 0; j < n_threads - 1; j++) {
             workers[j] = (struct ggml_compute_state) {
                 .thrd   = 0,
@@ -9078,10 +9121,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 .node   = NULL,
                 .shared = &state_shared,
             };
-
-            int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
         }
     }
 
@@ -9319,15 +9358,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // COMPUTE
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
             // launch thread pool
             for (int j = 0; j < n_threads - 1; j++) {
                 workers[j].params = (struct ggml_compute_params) {
@@ -9338,16 +9368,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     .wdata = cgraph->work ? cgraph->work->data : NULL,
                 };
                 workers[j].node = node;
+                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]);
             }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_store(&state_shared.has_work, true);
         }
 
         params.type = GGML_TASK_COMPUTE;
@@ -9355,34 +9377,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // wait for thread pool
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
+            thpool_wait(ctx->tpool);
         }
 
         // FINALIZE
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
             // launch thread pool
             for (int j = 0; j < n_threads - 1; j++) {
                 workers[j].params = (struct ggml_compute_params) {
@@ -9393,16 +9392,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     .wdata = cgraph->work ? cgraph->work->data : NULL,
                 };
                 workers[j].node = node;
+                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]);
             }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_store(&state_shared.has_work, true);
         }
 
         params.type = GGML_TASK_FINALIZE;
@@ -9410,21 +9401,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // wait for thread pool
         if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
+            thpool_wait(ctx->tpool);
        }
 
         // performance stats (node)
@@ -9440,16 +9417,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
     // join thread pool
     if (n_threads > 1) {
-        atomic_store(&state_shared.stop, true);
-        atomic_store(&state_shared.has_work, true);
-
-        for (int j = 0; j < n_threads - 1; j++) {
-            int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-
-        ggml_lock_destroy(&state_shared.spin);
+        thpool_destroy(ctx->tpool);
     }
 
     // performance stats (graph)

diff --git a/thpool.c b/thpool.c
new file mode 100644
index 0000000000000..59e2e5556780d
--- /dev/null
+++ b/thpool.c
@@ -0,0 +1,553 @@
+/* ********************************
+ * Author:       Johan Hanssen Seferidis
+ * License:      MIT
+ * Description:  Library providing a threading pool where you can add
+ *               work. For usage, check the thpool.h file or README.md
+ *
+ *//** @file thpool.h *//*
+ *
+ ********************************/
+
+#if defined(__APPLE__)
+#include <AvailabilityMacros.h>
+#else
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif
+#endif
+#include <unistd.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <errno.h>
+#include <time.h>
+#if defined(__linux__)
+#include <sys/prctl.h>
+#endif
+
+#include "thpool.h"
+
+#ifdef THPOOL_DEBUG
+#define THPOOL_DEBUG 1
+#else
+#define THPOOL_DEBUG 0
+#endif
+
+#if !defined(DISABLE_PRINT) || defined(THPOOL_DEBUG)
+#define err(str) fprintf(stderr, str)
+#else
+#define err(str)
+#endif
+
+static volatile int threads_keepalive;
+static volatile int threads_on_hold;
+
+
+
+/* ========================== STRUCTURES ============================ */
+
+
+/* Binary semaphore */
+typedef struct bsem {
+	pthread_mutex_t mutex;
+	pthread_cond_t  cond;
+	int v;
+} bsem;
+
+
+/* Job */
+typedef struct job{
+	struct job*  prev;                   /* pointer to previous job   */
+	void   (*function)(void* arg);       /* function pointer          */
+	void*  arg;                          /* function's argument       */
+} job;
+
+
+/* Job queue */
+typedef struct jobqueue{
+	pthread_mutex_t rwmutex;             /* used for queue r/w access */
+	job  *front;                         /* pointer to front of queue */
+	job  *rear;                          /* pointer to rear  of queue */
+	bsem *has_jobs;                      /* flag as binary semaphore  */
+	int   len;                           /* number of jobs in queue   */
+} jobqueue;
+
+
+/* Thread */
+typedef struct thread{
+	int       id;                        /* friendly id               */
+	pthread_t pthread;                   /* pointer to actual thread  */
+	struct thpool_* thpool_p;            /* access to thpool          */
+} thread;
+
+
+/* Threadpool */
+typedef struct thpool_{
+	thread**   threads;                  /* pointer to threads        */
+	volatile int num_threads_alive;      /* threads currently alive   */
+	volatile int num_threads_working;    /* threads currently working */
+	pthread_mutex_t  thcount_lock;       /* used for thread count etc */
+	pthread_cond_t  threads_all_idle;    /* signal to thpool_wait     */
+	jobqueue  jobqueue;                  /* job queue                 */
+} thpool_;
+
+
+
+
+
+/* ========================== PROTOTYPES ============================ */
+
+
+static int   thread_init(thpool_* thpool_p, struct thread** thread_p, int id);
+static void* thread_do(struct thread* thread_p);
+static void  thread_hold(int sig_id);
+static void  thread_destroy(struct thread* thread_p);
+
+static int   jobqueue_init(jobqueue* jobqueue_p);
+static void  jobqueue_clear(jobqueue* jobqueue_p);
+static void  jobqueue_push(jobqueue* jobqueue_p, struct job* newjob_p);
+static struct job* jobqueue_pull(jobqueue* jobqueue_p);
+static void  jobqueue_destroy(jobqueue* jobqueue_p);
+
+static void  bsem_init(struct bsem *bsem_p, int value);
+static void  bsem_reset(struct bsem *bsem_p);
+static void  bsem_post(struct bsem *bsem_p);
+static void  bsem_post_all(struct bsem *bsem_p);
+static void  bsem_wait(struct bsem *bsem_p);
+
+
+
+
+
+/* ========================== THREADPOOL ============================ */
+
+
+/* Initialise thread pool */
+struct thpool_* thpool_init(int num_threads){
+
+	threads_on_hold   = 0;
+	threads_keepalive = 1;
+
+	if (num_threads < 0){
+		num_threads = 0;
+	}
+
+	/* Make new thread pool */
+	thpool_* thpool_p;
+	thpool_p = (struct thpool_*)malloc(sizeof(struct thpool_));
+	if (thpool_p == NULL){
+		err("thpool_init(): Could not allocate memory for thread pool\n");
+		return NULL;
+	}
+	thpool_p->num_threads_alive   = 0;
+	thpool_p->num_threads_working = 0;
+
+	/* Initialise the job queue */
+	if (jobqueue_init(&thpool_p->jobqueue) == -1){
+		err("thpool_init(): Could not allocate memory for job queue\n");
+		free(thpool_p);
+		return NULL;
+	}
+
+	/* Make threads in pool */
+	thpool_p->threads = (struct thread**)malloc(num_threads * sizeof(struct thread *));
+	if (thpool_p->threads == NULL){
+		err("thpool_init(): Could not allocate memory for threads\n");
+		jobqueue_destroy(&thpool_p->jobqueue);
+		free(thpool_p);
+		return NULL;
+	}
+
+	pthread_mutex_init(&(thpool_p->thcount_lock), NULL);
+	pthread_cond_init(&thpool_p->threads_all_idle, NULL);
+
+	/* Thread init */
+	int n;
+	for (n=0; n<num_threads; n++){
+		thread_init(thpool_p, &thpool_p->threads[n], n);
+#if THPOOL_DEBUG
+		printf("THPOOL_DEBUG: Created thread %d in pool \n", n);
+#endif
+	}
+
+	/* Wait for threads to initialize */
+	while (thpool_p->num_threads_alive != num_threads) {}
+
+	return thpool_p;
+}
+
+
+/* Add work to the thread pool */
+int thpool_add_work(thpool_* thpool_p, void (*function_p)(void*), void* arg_p){
+	job* newjob;
+
+	newjob=(struct job*)malloc(sizeof(struct job));
+	if (newjob==NULL){
+		err("thpool_add_work(): Could not allocate memory for new job\n");
+		return -1;
+	}
+
+	/* add function and argument */
+	newjob->function=function_p;
+	newjob->arg=arg_p;
+
+	/* add job to queue */
+	jobqueue_push(&thpool_p->jobqueue, newjob);
+
+	return 0;
+}
+
+
+/* Wait until all jobs have finished */
+void thpool_wait(thpool_* thpool_p){
+	pthread_mutex_lock(&thpool_p->thcount_lock);
+	while (thpool_p->jobqueue.len || thpool_p->num_threads_working) {
+		pthread_cond_wait(&thpool_p->threads_all_idle, &thpool_p->thcount_lock);
+	}
+	pthread_mutex_unlock(&thpool_p->thcount_lock);
+}
+
+
+/* Destroy the threadpool */
+void thpool_destroy(thpool_* thpool_p){
+	/* No need to destroy if it's NULL */
+	if (thpool_p == NULL) return ;
+
+	volatile int threads_total = thpool_p->num_threads_alive;
+
+	/* End each thread's infinite loop */
+	threads_keepalive = 0;
+
+	/* Give one second to kill idle threads */
+	double TIMEOUT = 1.0;
+	time_t start, end;
+	double tpassed = 0.0;
+	time (&start);
+	while (tpassed < TIMEOUT && thpool_p->num_threads_alive){
+		bsem_post_all(thpool_p->jobqueue.has_jobs);
+		time (&end);
+		tpassed = difftime(end,start);
+	}
+
+	/* Poll remaining threads */
+	while (thpool_p->num_threads_alive){
+		bsem_post_all(thpool_p->jobqueue.has_jobs);
+		sleep(1);
+	}
+
+	/* Job queue cleanup */
+	jobqueue_destroy(&thpool_p->jobqueue);
+	/* Deallocs */
+	int n;
+	for (n=0; n < threads_total; n++){
+		thread_destroy(thpool_p->threads[n]);
+	}
+	free(thpool_p->threads);
+	free(thpool_p);
+}
+
+
+/* Pause all threads in threadpool */
+void thpool_pause(thpool_* thpool_p) {
+	int n;
+	for (n=0; n < thpool_p->num_threads_alive; n++){
+		pthread_kill(thpool_p->threads[n]->pthread, SIGUSR1);
+	}
+}
+
+
+/* Resume all threads in threadpool */
+void thpool_resume(thpool_* thpool_p) {
+    // resuming a single threadpool hasn't been
+    // implemented yet, meanwhile this suppresses
+    // the warnings
+    (void)thpool_p;
+
+	threads_on_hold = 0;
+}
+
+
+int thpool_num_threads_working(thpool_* thpool_p){
+	return thpool_p->num_threads_working;
+}
+
+
+
+
+
+/* ============================ THREAD ============================== */
+
+
+/* Initialize a thread in the thread pool
+ *
+ * @param thread        address to the pointer of the thread to be created
+ * @param id            id to be given to the thread
+ * @return 0 on success, -1 otherwise.
+ */
+static int thread_init (thpool_* thpool_p, struct thread** thread_p, int id){
+
+	*thread_p = (struct thread*)malloc(sizeof(struct thread));
+	if (*thread_p == NULL){
+		err("thread_init(): Could not allocate memory for thread\n");
+		return -1;
+	}
+
+	(*thread_p)->thpool_p = thpool_p;
+	(*thread_p)->id       = id;
+
+	pthread_create(&(*thread_p)->pthread, NULL, (void * (*)(void *)) thread_do, (*thread_p));
+	pthread_detach((*thread_p)->pthread);
+	return 0;
+}
+
+
+/* Sets the calling thread on hold */
+static void thread_hold(int sig_id) {
+	(void)sig_id;
+	threads_on_hold = 1;
+	while (threads_on_hold){
+		sleep(1);
+	}
+}
+
+
+/* What each thread is doing
+*
+* In principle this is an endless loop. The only time this loop gets interrupted is once
+* thpool_destroy() is invoked or the program exits.
+*
+* @param  thread        thread that will run this function
+* @return nothing
+*/
+static void* thread_do(struct thread* thread_p){
+
+	/* Set thread name for profiling and debugging */
+	char thread_name[16] = {0};
+	snprintf(thread_name, 16, "thpool-%d", thread_p->id);
+
+#if defined(__linux__)
+	/* Use prctl instead to prevent using _GNU_SOURCE flag and implicit declaration */
+	prctl(PR_SET_NAME, thread_name);
+#elif defined(__APPLE__) && defined(__MACH__)
+	pthread_setname_np(thread_name);
+#else
+	err("thread_do(): pthread_setname_np is not supported on this system");
+#endif
+
+	/* Assure all threads have been created before starting serving */
+	thpool_* thpool_p = thread_p->thpool_p;
+
+	/* Register signal handler */
+	struct sigaction act;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	act.sa_handler = thread_hold;
+	if (sigaction(SIGUSR1, &act, NULL) == -1) {
+		err("thread_do(): cannot handle SIGUSR1");
+	}
+
+	/* Mark thread as alive (initialized) */
+	pthread_mutex_lock(&thpool_p->thcount_lock);
+	thpool_p->num_threads_alive += 1;
+	pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+	while(threads_keepalive){
+
+		bsem_wait(thpool_p->jobqueue.has_jobs);
+
+		if (threads_keepalive){
+
+			pthread_mutex_lock(&thpool_p->thcount_lock);
+			thpool_p->num_threads_working++;
+			pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+			/* Read job from queue and execute it */
+			void (*func_buff)(void*);
+			void*  arg_buff;
+			job* job_p = jobqueue_pull(&thpool_p->jobqueue);
+			if (job_p) {
+				func_buff = job_p->function;
+				arg_buff  = job_p->arg;
+				func_buff(arg_buff);
+				free(job_p);
+			}
+
+			pthread_mutex_lock(&thpool_p->thcount_lock);
+			thpool_p->num_threads_working--;
+			if (!thpool_p->num_threads_working) {
+				pthread_cond_signal(&thpool_p->threads_all_idle);
+			}
+			pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+		}
+	}
+	pthread_mutex_lock(&thpool_p->thcount_lock);
+	thpool_p->num_threads_alive --;
+	pthread_mutex_unlock(&thpool_p->thcount_lock);
+
+	return NULL;
+}
+
+
+/* Frees a thread */
+static void thread_destroy (thread* thread_p){
+	free(thread_p);
+}
+
+
+
+
+
+/* ============================ JOB QUEUE =========================== */
+
+
+/* Initialize queue */
+static int jobqueue_init(jobqueue* jobqueue_p){
+	jobqueue_p->len = 0;
+	jobqueue_p->front = NULL;
+	jobqueue_p->rear  = NULL;
+
+	jobqueue_p->has_jobs = (struct bsem*)malloc(sizeof(struct bsem));
+	if (jobqueue_p->has_jobs == NULL){
+		return -1;
+	}
+
+	pthread_mutex_init(&(jobqueue_p->rwmutex), NULL);
+	bsem_init(jobqueue_p->has_jobs, 0);
+
+	return 0;
+}
+
+
+/* Clear the queue */
+static void jobqueue_clear(jobqueue* jobqueue_p){
+
+	while(jobqueue_p->len){
+		free(jobqueue_pull(jobqueue_p));
+	}
+
+	jobqueue_p->front = NULL;
+	jobqueue_p->rear  = NULL;
+	bsem_reset(jobqueue_p->has_jobs);
+	jobqueue_p->len = 0;
+
+}
+
+
+/* Add (allocated) job to queue
+ */
+static void jobqueue_push(jobqueue* jobqueue_p, struct job* newjob){
+
+	pthread_mutex_lock(&jobqueue_p->rwmutex);
+	newjob->prev = NULL;
+
+	switch(jobqueue_p->len){
+
+		case 0:  /* if no jobs in queue */
+					jobqueue_p->front = newjob;
+					jobqueue_p->rear  = newjob;
+					break;
+
+		default: /* if jobs in queue */
+					jobqueue_p->rear->prev = newjob;
+					jobqueue_p->rear = newjob;
+
+	}
+	jobqueue_p->len++;
+
+	bsem_post(jobqueue_p->has_jobs);
+	pthread_mutex_unlock(&jobqueue_p->rwmutex);
+}
+
+
+/* Get first job from queue (removes it from queue)
+ * Notice: Caller MUST hold a mutex
+ */
+static struct job* jobqueue_pull(jobqueue* jobqueue_p){
+
+	pthread_mutex_lock(&jobqueue_p->rwmutex);
+	job* job_p = jobqueue_p->front;
+
+	switch(jobqueue_p->len){
+
+		case 0:  /* if no jobs in queue */
+					break;
+
+		case 1:  /* if one job in queue */
+					jobqueue_p->front = NULL;
+					jobqueue_p->rear  = NULL;
+					jobqueue_p->len = 0;
+					break;
+
+		default: /* if >1 jobs in queue */
+					jobqueue_p->front = job_p->prev;
+					jobqueue_p->len--;
+					/* more than one job in queue -> post it */
+					bsem_post(jobqueue_p->has_jobs);
+
+	}
+
+	pthread_mutex_unlock(&jobqueue_p->rwmutex);
+	return job_p;
+}
+
+
+/* Free all queue resources back to the system */
+static void jobqueue_destroy(jobqueue* jobqueue_p){
+	jobqueue_clear(jobqueue_p);
+	free(jobqueue_p->has_jobs);
+}
+
+
+
+
+
+/* ======================== SYNCHRONISATION ========================= */
+
+
+/* Init semaphore to 1 or 0 */
+static void bsem_init(bsem *bsem_p, int value) {
+	if (value < 0 || value > 1) {
+		err("bsem_init(): Binary semaphore can take only values 1 or 0");
+		exit(1);
+	}
+	pthread_mutex_init(&(bsem_p->mutex), NULL);
+	pthread_cond_init(&(bsem_p->cond), NULL);
+	bsem_p->v = value;
+}
+
+
+/* Reset semaphore to 0 */
+static void bsem_reset(bsem *bsem_p) {
+	bsem_init(bsem_p, 0);
+}
+
+
+/* Post to at least one thread */
+static void bsem_post(bsem *bsem_p) {
+	pthread_mutex_lock(&bsem_p->mutex);
+	bsem_p->v = 1;
+	pthread_cond_signal(&bsem_p->cond);
+	pthread_mutex_unlock(&bsem_p->mutex);
+}
+
+
+/* Post to all threads */
+static void bsem_post_all(bsem *bsem_p) {
+	pthread_mutex_lock(&bsem_p->mutex);
+	bsem_p->v = 1;
+	pthread_cond_broadcast(&bsem_p->cond);
+	pthread_mutex_unlock(&bsem_p->mutex);
+}
+
+
+/* Wait on semaphore until semaphore has value 0 */
+static void bsem_wait(bsem* bsem_p) {
+	pthread_mutex_lock(&bsem_p->mutex);
+	while (bsem_p->v != 1) {
+		pthread_cond_wait(&bsem_p->cond, &bsem_p->mutex);
+	}
+	bsem_p->v = 0;
+	pthread_mutex_unlock(&bsem_p->mutex);
+}

diff --git a/thpool.h b/thpool.h
new file mode 100644
index 0000000000000..af3e68d165fb0
--- /dev/null
+++ b/thpool.h
@@ -0,0 +1,187 @@
+/**********************************
+ * @author      Johan Hanssen Seferidis
+ * License:     MIT
+ *
+ **********************************/
+
+#ifndef _THPOOL_
+#define _THPOOL_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* =================================== API ======================================= */
+
+
+typedef struct thpool_* threadpool;
+
+
+/**
+ * @brief  Initialize threadpool
+ *
+ * Initializes a threadpool. This function will not return until all
+ * threads have initialized successfully.
+ *
+ * @example
+ *
+ *    ..
+ *    threadpool thpool;                     //First we declare a threadpool
+ *    thpool = thpool_init(4);               //then we initialize it to 4 threads
+ *    ..
+ *
+ * @param  num_threads   number of threads to be created in the threadpool
+ * @return threadpool    created threadpool on success,
+ *                       NULL on error
+ */
+threadpool thpool_init(int num_threads);
+
+
+/**
+ * @brief Add work to the job queue
+ *
+ * Takes an action and its argument and adds it to the threadpool's job queue.
+ * If you want to add a function with more than one argument, a way to
+ * implement this is by passing a pointer to a structure.
+ *
+ * NOTICE: You have to cast both the function and argument to not get warnings.
+ *
+ * @example
+ *
+ *    void print_num(int num){
+ *       printf("%d\n", num);
+ *    }
+ *
+ *    int main() {
+ *       ..
+ *       int a = 10;
+ *       thpool_add_work(thpool, (void*)print_num, (void*)a);
+ *       ..
+ *    }
+ *
+ * @param  threadpool    threadpool to which the work will be added
+ * @param  function_p    pointer to function to add as work
+ * @param  arg_p         pointer to an argument
+ * @return 0 on success, -1 otherwise.
+ */
+int thpool_add_work(threadpool, void (*function_p)(void*), void* arg_p);
+
+
+/**
+ * @brief Wait for all queued jobs to finish
+ *
+ * Will wait for all jobs - both queued and currently running to finish.
+ * Once the queue is empty and all work has completed, the calling thread
+ * (probably the main program) will continue.
+ *
+ * Smart polling is used in wait. The polling is initially 0 - meaning that
+ * there is virtually no polling at all. If after 1 second the threads
+ * haven't finished, the polling interval starts growing exponentially
+ * until it reaches max_secs seconds. Then it jumps down to a maximum polling
+ * interval assuming that heavy processing is being used in the threadpool.
+ *
+ * @example
+ *
+ *    ..
+ *    threadpool thpool = thpool_init(4);
+ *    ..
+ *    // Add a bunch of work
+ *    ..
+ *    thpool_wait(thpool);
+ *    puts("All added work has finished");
+ *    ..
+ *
+ * @param threadpool     the threadpool to wait for
+ * @return nothing
+ */
+void thpool_wait(threadpool);
+
+
+/**
+ * @brief Pauses all threads immediately
+ *
+ * The threads will be paused no matter if they are idle or working.
+ * The threads return to their previous states once thpool_resume
+ * is called.
+ *
+ * While the thread is being paused, new work can be added.
+ *
+ * @example
+ *
+ *    threadpool thpool = thpool_init(4);
+ *    thpool_pause(thpool);
+ *    ..
+ *    // Add a bunch of work
+ *    ..
+ *    thpool_resume(thpool); // Let the threads start their magic
+ *
+ * @param threadpool    the threadpool where the threads should be paused
+ * @return nothing
+ */
+void thpool_pause(threadpool);
+
+
+/**
+ * @brief Unpauses all threads if they are paused
+ *
+ * @example
+ *    ..
+ *    thpool_pause(thpool);
+ *    sleep(10);              // Delay execution 10 seconds
+ *    thpool_resume(thpool);
+ *    ..
+ *
+ * @param threadpool     the threadpool where the threads should be unpaused
+ * @return nothing
+ */
+void thpool_resume(threadpool);
+
+
+/**
+ * @brief Destroy the threadpool
+ *
+ * This will wait for the currently active threads to finish and then 'kill'
+ * the whole threadpool to free up memory.
+ *
+ * @example
+ *    int main() {
+ *       threadpool thpool1 = thpool_init(2);
+ *       threadpool thpool2 = thpool_init(2);
+ *       ..
+ *       thpool_destroy(thpool1);
+ *       ..
+ *       return 0;
+ *    }
+ *
+ * @param threadpool     the threadpool to destroy
+ * @return nothing
+ */
+void thpool_destroy(threadpool);
+
+
+/**
+ * @brief Show currently working threads
+ *
+ * Working threads are the threads that are performing work (not idle).
+ *
+ * @example
+ *    int main() {
+ *       threadpool thpool1 = thpool_init(2);
+ *       threadpool thpool2 = thpool_init(2);
+ *       ..
+ *       printf("Working threads: %d\n", thpool_num_threads_working(thpool1));
+ *       ..
+ *       return 0;
+ *    }
+ *
+ * @param threadpool     the threadpool of interest
+ * @return integer       number of threads working
+ */
+int thpool_num_threads_working(threadpool);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
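A minimal, standalone sketch of how the vendored pool is driven, pieced together from the thpool.h documentation above; the worker function and the task count are invented for illustration, and it assumes a build along the lines of `cc example.c thpool.c -pthread`:

    #include <stdio.h>
    #include "thpool.h"

    /* toy task: report which slot this job carried (illustrative only) */
    static void work_fn(void * arg) {
        printf("task %d done\n", (int)(size_t) arg);
    }

    int main(void) {
        threadpool pool = thpool_init(4);             /* spin up 4 workers        */
        for (size_t i = 0; i < 8; i++) {
            thpool_add_work(pool, work_fn, (void *) i);
        }
        thpool_wait(pool);                            /* block until queue drains */
        thpool_destroy(pool);
        return 0;
    }

This is exactly the lifecycle ggml_graph_compute adopts in the ggml.c hunks of this patch: init once, push per-node work items, wait at each synchronization point, destroy at the end.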
From d3bc4df97db470d4e719bab387bb546bc5dabc70 Mon Sep 17 00:00:00 2001
From: Vladimir
Date: Sat, 1 Apr 2023 19:45:33 +0200
Subject: [PATCH 03/14] fix windows build

---
 ggml.c   |  89 -----------------------------------------
 thpool.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 105 insertions(+), 102 deletions(-)

diff --git a/ggml.c b/ggml.c
index 58e51159c195f..1c3e28d37c4ec 100644
--- a/ggml.c
+++ b/ggml.c
@@ -52,81 +52,11 @@ static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {
     return atomic_fetch_add(ptr, -(dec));
 }
 
-typedef HANDLE pthread_t;
-
-typedef DWORD thread_ret_t;
-static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
-    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
-    if (handle == NULL)
-    {
-        return EAGAIN;
-    }
-
-    *out = handle;
-    return 0;
-}
-
-static int pthread_join(pthread_t thread, void* unused) {
-    return (int) WaitForSingleObject(thread, INFINITE);
-}
-
 static int sched_yield (void) {
     Sleep (0);
     return 0;
 }
 
-typedef struct pthread_mutex_tag {
-    CRITICAL_SECTION critical_section;
-} pthread_mutex_t;
-
-typedef struct pthread_mutexattr_tag {
-    int attr;
-} pthread_mutexattr_t;
-
-int pthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) {
-    InitializeCriticalSection (&mutex->critical_section);
-    return 0;
-}
-
-int pthread_mutex_destroy(pthread_mutex_t * mutex) {
-    DeleteCriticalSection(&mutex->critical_section);
-    return 0;
-}
-
-
-int pthread_mutex_lock(pthread_mutex_t * mutex) {
-    EnterCriticalSection(&mutex->critical_section);
-    return 0;
-}
-
-int pthread_mutex_unlock(pthread_mutex_t * mutex) {
-    LeaveCriticalSection(&mutex->critical_section);
-    return 0;
-}
-
-typedef struct pthread_cond_tag {
-    CONDITION_VARIABLE cond;
-} pthread_cond_t;
-
-int pthread_cond_init(pthread_cond_t * cond, void * unused) {
-    InitializeConditionVariable (&cond->cond);
-    return 0;
-}
-
-int pthread_cond_destroy(pthread_cond_t * cond) {
-    return 0;
-}
-
-int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) {
-    SleepConditionVariableCS(&cond->cond, &mutex->critical_section, INFINITE);
-    return 0;
-}
-
-int pthread_cond_broadcast(pthread_cond_t * cond) {
-    WakeAllConditionVariable(&cond->cond);
-    return 0;
-}
-
 #else
 #include <pthread.h>
 #include <stdatomic.h>
@@ -9042,14 +8972,10 @@ typedef int ggml_lock_t;
 
 #define GGML_LOCK_INITIALIZER 0
 
-typedef pthread_t ggml_thread_t;
-
 #define ggml_thread_create pthread_create
 #define ggml_thread_join   pthread_join
 
-typedef pthread_mutex_t ggml_mutex_t;
-typedef pthread_cond_t  ggml_cond_t;
-
 #define ggml_mutex_init     pthread_mutex_init
 #define ggml_mutex_destroy  pthread_mutex_destroy
 #define ggml_cond_init      pthread_cond_init
@@ -9063,19 +8989,10 @@ typedef pthread_cond_t ggml_cond_t;
 
 #endif
 
 struct ggml_compute_state_shared {
     int n_threads;
-
-    // synchronization primitives
-    int  n_ready;
-    bool has_work;
-    bool stop; // stop all threads
-    ggml_mutex_t mutex;
-    ggml_cond_t  cond;
 };
 
 struct ggml_compute_state {
-    ggml_thread_t thrd;
-
     struct ggml_compute_params params;
 
     struct ggml_tensor * node;
@@ -9097,11 +9014,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     const int n_threads = cgraph->n_threads;
     struct ggml_compute_state_shared state_shared = {
         /*.n_threads =*/ n_threads,
-        /*.n_ready   =*/ 0,
-        /*.has_work  =*/ false,
-        /*.stop      =*/ false,
-        /*.mutex     =*/ {0},
-        /*.cond      =*/ {0},
     };
     struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
 
@@ -9110,7 +9022,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         ctx->tpool = thpool_init(n_threads);
         for (int j = 0; j < n_threads - 1; j++) {
             workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
                 .params = {
                     .type = GGML_TASK_COMPUTE,
                     .ith  = j + 1,

diff --git a/thpool.c b/thpool.c
index 59e2e5556780d..76cf5fc3fc97f 100644
--- a/thpool.c
+++ b/thpool.c
@@ -15,11 +15,111 @@
 #define _POSIX_C_SOURCE 200809L
 #endif
 #endif
+/*
+this is not part of original thpool, that's me hacking it to work on windows
+*/
+#if defined _MSC_VER || defined(__MINGW32__)
+#if !defined(__MINGW32__)
+#include <Windows.h>
+#else
+// ref: https://github.com/ggerganov/whisper.cpp/issues/168
+#include <windows.h>
+#endif
+
+unsigned int sleep(unsigned int seconds) {
+    Sleep(seconds * 1000);
+    return 0;
+}
+
+typedef HANDLE pthread_t;
+
+typedef DWORD thread_ret_t;
+static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
+    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
+    if (handle == NULL)
+    {
+        return EAGAIN;
+    }
+
+    *out = handle;
+    return 0;
+}
+
+static int pthread_join(pthread_t thread, void* unused) {
+    return (int) WaitForSingleObject(thread, INFINITE);
+}
+
+static int pthread_detach(pthread_t thread) {
+    CloseHandle(thread);
+    return 0;
+}
+
+typedef struct pthread_mutex_tag {
+    CRITICAL_SECTION critical_section;
+} pthread_mutex_t;
+
+typedef struct pthread_mutexattr_tag {
+    int attr;
+} pthread_mutexattr_t;
+
+int pthread_mutex_init(pthread_mutex_t * mutex, const pthread_mutexattr_t * attr) {
+    InitializeCriticalSection (&mutex->critical_section);
+    return 0;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t * mutex) {
+    DeleteCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+
+int pthread_mutex_lock(pthread_mutex_t * mutex) {
+    EnterCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+int pthread_mutex_unlock(pthread_mutex_t * mutex) {
+    LeaveCriticalSection(&mutex->critical_section);
+    return 0;
+}
+
+typedef struct pthread_cond_tag {
+    CONDITION_VARIABLE cond;
+} pthread_cond_t;
+
+int pthread_cond_init(pthread_cond_t * cond, void * unused) {
+    InitializeConditionVariable (&cond->cond);
+    return 0;
+}
+
+int pthread_cond_destroy(pthread_cond_t * cond) {
+    return 0;
+}
+
+int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex) {
+    SleepConditionVariableCS(&cond->cond, &mutex->critical_section, INFINITE);
+    return 0;
+}
+
+int pthread_cond_broadcast(pthread_cond_t * cond) {
+    WakeAllConditionVariable(&cond->cond);
+    return 0;
+}
+
+int pthread_cond_signal(pthread_cond_t * cond) {
+    WakeConditionVariable(&cond->cond);
+    return 0;
+}
+#else
+#include <pthread.h>
+#include <unistd.h>
+#endif
+
-#include <unistd.h>
 #include <signal.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <pthread.h>
+
+
 #include <errno.h>
 #include <time.h>
 #if defined(__linux__)
@@ -247,16 +347,6 @@ void thpool_destroy(thpool_* thpool_p){
 	free(thpool_p);
 }
 
-
-/* Pause all threads in threadpool */
-void thpool_pause(thpool_* thpool_p) {
-	int n;
-	for (n=0; n < thpool_p->num_threads_alive; n++){
-		pthread_kill(thpool_p->threads[n]->pthread, SIGUSR1);
-	}
-}
-
-
 /* Resume all threads in threadpool */
 void thpool_resume(thpool_* thpool_p) {
     // resuming a single threadpool hasn't been
@@ -332,20 +422,22 @@ static void* thread_do(struct thread* thread_p){
 #elif defined(__APPLE__) && defined(__MACH__)
 	pthread_setname_np(thread_name);
 #else
-	err("thread_do(): pthread_setname_np is not supported on this system");
+	// err("thread_do(): pthread_setname_np is not supported on this system");
 #endif
 
 	/* Assure all threads have been created before starting serving */
 	thpool_* thpool_p = thread_p->thpool_p;
 
 	/* Register signal handler */
+	/*
+	///// HACK
 	struct sigaction act;
 	sigemptyset(&act.sa_mask);
 	act.sa_flags = 0;
 	act.sa_handler = thread_hold;
 	if (sigaction(SIGUSR1, &act, NULL) == -1) {
 		err("thread_do(): cannot handle SIGUSR1");
-	}
+	}*/
 
 	/* Mark thread as alive (initialized) */
 	pthread_mutex_lock(&thpool_p->thcount_lock);
 	thpool_p->num_threads_alive += 1;
 	pthread_mutex_unlock(&thpool_p->thcount_lock);

From 997c749065bad8f93bfcc92e8220f0c8acbba633 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sat, 1 Apr 2023 16:32:14 +0800
Subject: [PATCH 04/14] Add detection code for avx

---
 CMakeLists.txt | 108 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a434f07bba0c..8d68ad183bdf0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,114 @@ option(LLAMA_OPENBLAS "llama: use OpenBLAS"
 option(LLAMA_BUILD_TESTS    "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 
+INCLUDE(CheckCSourceRuns)
+
+SET(AVX_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 a;
+        a = _mm256_set1_ps(0);
+        return 0;
+    }
+")
+
+SET(AVX512_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0);
+        __m512i b = a;
+        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
+        return 0;
+    }
+")
+
+SET(AVX2_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256i a = {0};
+        a = _mm256_abs_epi16(a);
+        __m256i x;
+        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
+        return 0;
+    }
+")
+
+SET(FMA_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 acc = _mm256_setzero_ps();
+        const __m256 d = _mm256_setzero_ps();
+        const __m256 p = _mm256_setzero_ps();
+        acc = _mm256_fmadd_ps( d, p, acc );
+        return 0;
+    }
+")
+
+MACRO(CHECK_SSE type flags)
+    SET(__FLAG_I 1)
+    SET(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+    FOREACH(__FLAG ${flags})
+        IF(NOT ${type}_FOUND)
+            SET(CMAKE_REQUIRED_FLAGS ${__FLAG})
+            CHECK_C_SOURCE_RUNS("${${type}_CODE}" HAS_${type}_${__FLAG_I})
+            IF(HAS_${type}_${__FLAG_I})
+                SET(${type}_FOUND TRUE CACHE BOOL "${type} support")
+                SET(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
+            ENDIF()
+            MATH(EXPR __FLAG_I "${__FLAG_I}+1")
+        ENDIF()
+    ENDFOREACH()
+    SET(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+    IF(NOT ${type}_FOUND)
+        SET(${type}_FOUND FALSE CACHE BOOL "${type} support")
+        SET(${type}_FLAGS "" CACHE STRING "${type} flags")
+    ENDIF()
+
+    MARK_AS_ADVANCED(${type}_FOUND ${type}_FLAGS)
+
+ENDMACRO()
+
+CHECK_SSE("AVX" " ;-mavx;/arch:AVX")
+CHECK_SSE("AVX2" " ;-mavx2 -mfma;/arch:AVX2")
+CHECK_SSE("AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512")
+CHECK_SSE("FMA" " ;-mfma;")
+
+IF(${AVX_FOUND})
+    set(LLAMA_AVX ON)
+ELSE()
+    set(LLAMA_AVX OFF)
+ENDIF()
+
+IF (${FMA_FOUND})
+    set(LLAMA_FMA ON)
+ELSE()
+    set(LLAMA_FMA OFF)
+ENDIF()
+
+IF(${AVX2_FOUND})
+    set(LLAMA_AVX2 ON)
+ELSE()
+    set(LLAMA_AVX2 OFF)
+ENDIF()
+
+IF(${AVX512_FOUND})
+    set(LLAMA_AVX512 ON)
+ELSE()
+    set(LLAMA_AVX512 OFF)
+ENDIF()
+
 #
 # Compile flags
 #

From 5ad9e9531f218dda251c42dde3850ed6d01f5cbc Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Fri, 7 Apr 2023 21:04:47 +0800
Subject: [PATCH 05/14] Only check hardware when option is ON

---
 CMakeLists.txt | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8d68ad183bdf0..7a73d72c49e06 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,33 +143,32 @@ MACRO(CHECK_SSE type flags)
 
 ENDMACRO()
 
-CHECK_SSE("AVX" " ;-mavx;/arch:AVX")
-CHECK_SSE("AVX2" " ;-mavx2 -mfma;/arch:AVX2")
-CHECK_SSE("AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512")
-CHECK_SSE("FMA" " ;-mfma;")
-
-IF(${AVX_FOUND})
-    set(LLAMA_AVX ON)
-ELSE()
-    set(LLAMA_AVX OFF)
+IF(${LLAMA_AVX})
+    CHECK_SSE("AVX" " ;-mavx;/arch:AVX")
+    IF(NOT ${AVX_FOUND})
+        set(LLAMA_AVX OFF)
+    ENDIF()
 ENDIF()
 
-IF (${FMA_FOUND})
-    set(LLAMA_FMA ON)
-ELSE()
-    set(LLAMA_FMA OFF)
+IF(${LLAMA_AVX2})
+    CHECK_SSE("AVX2" " ;-mavx2 -mfma;/arch:AVX2")
+    IF(NOT ${AVX2_FOUND})
+        set(LLAMA_AVX2 OFF)
+    ENDIF()
 ENDIF()
 
-IF(${AVX2_FOUND})
-    set(LLAMA_AVX2 ON)
-ELSE()
-    set(LLAMA_AVX2 OFF)
+IF(${LLAMA_AVX512})
+    CHECK_SSE("AVX512" " ;-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma;/arch:AVX512")
+    IF(NOT ${AVX512_FOUND})
+        set(LLAMA_AVX512 OFF)
+    ENDIF()
 ENDIF()
 
-IF(${AVX512_FOUND})
-    set(LLAMA_AVX512 ON)
-ELSE()
-    set(LLAMA_AVX512 OFF)
+IF(${LLAMA_FMA})
+    CHECK_SSE("FMA" " ;-mfma;")
+    IF (NOT ${FMA_FOUND})
+        set(LLAMA_FMA OFF)
+    ENDIF()
 ENDIF()
 
 #

From c640d2a4bd98666a9f182cdca2d5fd5331f3d818 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Fri, 7 Apr 2023 22:24:14 +0800
Subject: [PATCH 06/14] Remove finalizer

---
 ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index d3d216c611bc8..0808d7ff2bbea 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9550,7 +9550,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         if (node->n_tasks > 1) {
             thpool_wait(ctx->tpool);
         }
-
+#if 0
         // FINALIZE
         if (node->n_tasks > 1) {
             // launch thread pool
@@ -9574,7 +9574,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         if (node->n_tasks > 1) {
             thpool_wait(ctx->tpool);
        }
-
+#endif
         // performance stats (node)
         {
             int64_t perf_cycles_cur  = ggml_perf_cycles() - perf_node_start_cycles;
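For orientation before the next patches reshuffle it: ggml drives every graph node through up to three phases, per the ggml_task_type enum seen in the diffs, INIT (single-threaded preparation), COMPUTE (split across node->n_tasks threads), and FINALIZE (reduction; the #if 0 above simply stops dispatching it). A toy model of that phase dispatch, with an invented node struct and op name rather than ggml's real types:

    #include <stdio.h>

    enum task_type { TASK_INIT, TASK_COMPUTE, TASK_FINALIZE };

    /* stand-in for a graph node; ggml's real ggml_tensor is far larger */
    struct node { const char * name; int n_tasks; };

    static void forward(enum task_type type, const struct node * n) {
        switch (type) {
        case TASK_INIT:     printf("%s: prepare, single thread\n", n->name);              break;
        case TASK_COMPUTE:  printf("%s: main work on %d threads\n", n->name, n->n_tasks); break;
        case TASK_FINALIZE: printf("%s: reduce partial results\n", n->name);              break;
        }
    }

    int main(void) {
        struct node n = { "mul_mat", 4 };
        forward(TASK_INIT, &n);
        forward(TASK_COMPUTE, &n);
        forward(TASK_FINALIZE, &n);   /* the phase patch 06 turns off */
        return 0;
    }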
From 43dde039b05ee8a2c60366a4092856496d3bb4e1 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Fri, 7 Apr 2023 23:51:46 +0800
Subject: [PATCH 07/14] Run second operator when possible

---
 ggml.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index 0808d7ff2bbea..945f907cd7cb8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2725,9 +2725,9 @@ struct ggml_context_container {
 //
 
 enum ggml_task_type {
-    GGML_TASK_INIT = 0,
-    GGML_TASK_COMPUTE,
-    GGML_TASK_FINALIZE,
+    GGML_TASK_INIT     = 1,
+    GGML_TASK_COMPUTE  = 2,
+    GGML_TASK_FINALIZE = 4,
 };
 
 struct ggml_compute_params {
@@ -9262,9 +9262,20 @@ struct ggml_compute_state {
 
 static void ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
+    int type = state->params.type;
     if (state->node) {
         if (state->params.ith < state->params.nth) {
-            ggml_compute_forward(&state->params, state->node);
+            if (type & GGML_TASK_INIT)
+            {
+                state->params.type = GGML_TASK_INIT;
+                ggml_compute_forward(&state->params, state->node);
+            }
+
+            if (type & GGML_TASK_COMPUTE)
+            {
+                state->params.type = GGML_TASK_COMPUTE;
+                ggml_compute_forward(&state->params, state->node);
+            }
         }
         state->node = NULL;
     }
@@ -9527,6 +9538,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         ggml_compute_forward(&params, node);
 
+        int next_task = 0;
+
         // COMPUTE
         if (node->n_tasks > 1) {
             // launch thread pool
@@ -9542,12 +9555,34 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]);
             }
         }
+        else
+        {
+            int start = i;
+            int end = i;
+            if (i + 1 < cgraph->n_nodes)
+            {
+                struct ggml_tensor * next = cgraph->nodes[i + 1];
+                if (next->src0 != node && next->src1 != node && next->n_tasks == 1)
+                {
+                    workers[next_task].params = (struct ggml_compute_params) {
+                        .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
+                        .ith   = 0,
+                        .nth   = 1,
+                        .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+                        .wdata = cgraph->work ? cgraph->work->data : NULL,
+                    };
+
+                    thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
+                    next_task++;
+                }
+            }
+        }
 
         params.type = GGML_TASK_COMPUTE;
         ggml_compute_forward(&params, node);
 
         // wait for thread pool
-        if (node->n_tasks > 1) {
+        if (node->n_tasks > 1 || next_task != 0) {
             thpool_wait(ctx->tpool);
         }
 #if 0

From 455f6f79bc733a5fb57fc8c3d0e64572b51623f5 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sat, 8 Apr 2023 00:34:05 +0800
Subject: [PATCH 08/14] Try find other single threaded operator to run

---
 ggml.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/ggml.c b/ggml.c
index 945f907cd7cb8..788023d28536e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9557,8 +9557,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         else
         {
-            int start = i;
-            int end = i;
             if (i + 1 < cgraph->n_nodes)
             {
                 struct ggml_tensor * next = cgraph->nodes[i + 1];
@@ -9568,14 +9566,37 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
                     .ith   = 0,
                     .nth   = 1,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
+                    .wsize = 0,
+                    .wdata = NULL,
                 };
-
+                workers[next_task].node = next;
                 thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
                 next_task++;
+
+                if (i + 2 < cgraph->n_nodes)
+                {
+                    struct ggml_tensor * prev = cgraph->nodes[i + 1];
+                    struct ggml_tensor * next = cgraph->nodes[i + 2];
+                    if (next->src0 != node && next->src1 != node && next->n_tasks == 1 &&
+                        next->src0 != prev && next->src1 != prev
+                       )
+                    {
+                        workers[next_task].params = (struct ggml_compute_params) {
+                            .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
+                            .ith   = 0,
+                            .nth   = 1,
+                            .wsize = 0,
+                            .wdata = NULL,
+                        };
+                        workers[next_task].node = next;
+                        thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
+                        next_task++;
+                    }
+                }
             }
         }
+
+

From 921296c0d588b933320646bf9513a5be13c749ee Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sat, 8 Apr 2023 00:47:19 +0800
Subject: [PATCH 09/14] avoid malloc/free in critical path

---
 ggml.c   | 11 ++++++-----
 thpool.c | 23 ++---------------------
 thpool.h |  8 +++++++-
 3 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/ggml.c b/ggml.c
index 788023d28536e..93f034d8ff8f7 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2731,6 +2731,7 @@ enum ggml_task_type {
 };
 
 struct ggml_compute_params {
+    job newjob;
     enum ggml_task_type type;
 
     int ith, nth;
@@ -9529,11 +9530,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         // INIT
         struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_INIT,
-            /*.ith   =*/ 0,
-            /*.nth   =*/ node->n_tasks,
-            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+            .type  = GGML_TASK_INIT,
+            .ith   = 0,
+            .nth   = node->n_tasks,
+            .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+            .wdata = cgraph->work ? cgraph->work->data : NULL,
         };
 
         ggml_compute_forward(&params, node);

diff --git a/thpool.c b/thpool.c
index 76cf5fc3fc97f..6c9a40e329139 100644
--- a/thpool.c
+++ b/thpool.c
@@ -156,14 +156,6 @@ typedef struct bsem {
 } bsem;
 
 
-/* Job */
-typedef struct job{
-	struct job*  prev;                   /* pointer to previous job   */
-	void   (*function)(void* arg);       /* function pointer          */
-	void*  arg;                          /* function's argument       */
-} job;
-
-
 /* Job queue */
 typedef struct jobqueue{
 	pthread_mutex_t rwmutex;             /* used for queue r/w access */
@@ -279,18 +271,8 @@ struct thpool_* thpool_init(int num_threads){
 
 
 /* Add work to the thread pool */
-int thpool_add_work(thpool_* thpool_p, void (*function_p)(void*), void* arg_p){
-	job* newjob;
-
-	newjob=(struct job*)malloc(sizeof(struct job));
-	if (newjob==NULL){
-		err("thpool_add_work(): Could not allocate memory for new job\n");
-		return -1;
-	}
-
-	/* add function and argument */
+int thpool_add_work(thpool_* thpool_p, void (*function_p)(void*), job * newjob){
 	newjob->function=function_p;
-	newjob->arg=arg_p;
 
 	/* add job to queue */
 	jobqueue_push(&thpool_p->jobqueue, newjob);
 
 	return 0;
 }
@@ -460,9 +442,8 @@ static void* thread_do(struct thread* thread_p){
 			job* job_p = jobqueue_pull(&thpool_p->jobqueue);
 			if (job_p) {
 				func_buff = job_p->function;
-				arg_buff  = job_p->arg;
+				arg_buff  = job_p;
 				func_buff(arg_buff);
-				free(job_p);
 			}
 
 			pthread_mutex_lock(&thpool_p->thcount_lock);

diff --git a/thpool.h b/thpool.h
index af3e68d165fb0..fafbda5d17220 100644
--- a/thpool.h
+++ b/thpool.h
@@ -16,6 +16,12 @@ extern "C" {
 
 typedef struct thpool_* threadpool;
 
+/* Job */
+typedef struct job{
+	struct job*  prev;             /* pointer to previous job   */
+	void   (*function)(void* arg); /* function pointer          */
+} job;
+
 
 /**
  * @brief  Initialize threadpool
@@ -64,7 +70,7 @@ threadpool thpool_init(int num_threads);
  * @param  arg_p         pointer to an argument
  * @return 0 on success, -1 otherwise.
  */
-int thpool_add_work(threadpool, void (*function_p)(void*), void* arg_p);
+int thpool_add_work(threadpool, void (*function_p)(void*), job *newjob);
 
 
 /**
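The trick in the patch above: the queue's job header is now embedded as the first member of the caller-owned argument block, so enqueueing allocates nothing and the job pointer the worker pulls back doubles as the argument pointer. The same pattern in isolation, with invented names (a sketch, not ggml's actual structs):

    #include <stddef.h>
    #include <stdio.h>

    /* mirrors thpool.h's job: intrusive queue link plus callback */
    typedef struct job {
        struct job * prev;
        void (*function)(void * arg);
    } job;

    /* caller-owned state embeds the job as its FIRST member, so a
       job* and a state* are the same address - no per-job malloc */
    struct state {
        job header;    /* must stay first */
        int payload;
    };
    _Static_assert(offsetof(struct state, header) == 0, "job must be first");

    static void run(void * arg) {
        struct state * s = (struct state *) arg;   /* job* == state* */
        printf("payload = %d\n", s->payload);
    }

    int main(void) {
        struct state s = { .payload = 42 };
        s.header.function = run;
        /* what thread_do now does after jobqueue_pull: pass the job
           pointer itself as the argument */
        s.header.function(&s.header);
        return 0;
    }

This is why ggml.c puts `job newjob;` first in ggml_compute_params and can hand `&workers[j]` straight to thpool_add_work.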
From 3b03df5c05b0cabbf923d7ee3bf89ceca9a17b12 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sat, 8 Apr 2023 19:55:29 +0800
Subject: [PATCH 10/14] look forward more

---
 ggml.c | 80 +++++++++++++++++++++++++++-------------------------------
 1 file changed, 37 insertions(+), 43 deletions(-)

diff --git a/ggml.c b/ggml.c
index 93f034d8ff8f7..d5a190f346858 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9249,16 +9249,10 @@ typedef int ggml_lock_t;
 
 #endif
 
-struct ggml_compute_state_shared {
-    int n_threads;
-};
-
 struct ggml_compute_state {
     struct ggml_compute_params params;
 
     struct ggml_tensor * node;
-
-    struct ggml_compute_state_shared * shared;
 };
 
 static void ggml_graph_compute_thread(void * data) {
@@ -9284,9 +9278,6 @@ static void ggml_graph_compute_thread(void * data) {
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
     const int n_threads = cgraph->n_threads;
-    struct ggml_compute_state_shared state_shared = {
-        /*.n_threads =*/ n_threads,
-    };
     struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
 
     // create thread pool
@@ -9302,7 +9293,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     .wdata = cgraph->work ? cgraph->work->data : NULL,
                 },
                 .node   = NULL,
-                .shared = &state_shared,
             };
         }
     }
@@ -9520,6 +9510,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
         struct ggml_tensor * node = cgraph->nodes[i];
 
+        if (node->n_tasks == 0)
+        {
+            // no work need to be done.
+            continue;
+        }
         // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
         //if (node->grad == NULL && node->perf_runs > 0) {
         //    continue;
@@ -9558,46 +9553,45 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         }
         else
         {
-            if (i + 1 < cgraph->n_nodes)
+            int start = i;
+            int end = i + 1;
+            while (end < cgraph->n_nodes && next_task < n_threads && (end - start) < n_threads * 2)
             {
-                struct ggml_tensor * next = cgraph->nodes[i + 1];
-                if (next->src0 != node && next->src1 != node && next->n_tasks == 1)
+                struct ggml_tensor * next = cgraph->nodes[end];
+                end++;
+
+                if (next->n_tasks != 1)
+                    continue;
+
+                // check src dependency
+                bool is_dep = false;
+                for (int k = start; k < end; k++)
                 {
-                    workers[next_task].params = (struct ggml_compute_params) {
-                        .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
-                        .ith   = 0,
-                        .nth   = 1,
-                        .wsize = 0,
-                        .wdata = NULL,
-                    };
-                    workers[next_task].node = next;
-                    thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
-                    next_task++;
-
-                    if (i + 2 < cgraph->n_nodes)
+                    struct ggml_tensor * node = cgraph->nodes[k];
+                    if (next->src0 == node || next->src1 == node)
                     {
-                        struct ggml_tensor * prev = cgraph->nodes[i + 1];
-                        struct ggml_tensor * next = cgraph->nodes[i + 2];
-                        if (next->src0 != node && next->src1 != node && next->n_tasks == 1 &&
-                            next->src0 != prev && next->src1 != prev
-                           )
-                        {
-                            workers[next_task].params = (struct ggml_compute_params) {
-                                .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
-                                .ith   = 0,
-                                .nth   = 1,
-                                .wsize = 0,
-                                .wdata = NULL,
-                            };
-                            workers[next_task].node = next;
-                            thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
-                            next_task++;
-                        }
+                        is_dep = true;
+                        break;
                     }
                 }
+                if (is_dep)
+                    continue;
 
+                workers[next_task].params = (struct ggml_compute_params) {
+                    .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
+                    .ith   = 0,
+                    .nth   = 1,
+                    .wsize = 0,
+                    .wdata = NULL,
+                };
+                workers[next_task].node = next;
+
+                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
+                next->n_tasks = 0; // indicate this node is calculated
+                next_task++;
+                //printf("Combine task [%d, %d]\n", start, end);
             }
         }
 
         params.type = GGML_TASK_COMPUTE;

From 2035a3cc29223aacf329a7dd6fe10537a9414290 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Sun, 9 Apr 2023 22:11:24 +0800
Subject: [PATCH 11/14] avoid to change ggml_task_type

---
 ggml.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/ggml.c b/ggml.c
index d5a190f346858..c33ad2ca95ef8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2725,9 +2725,9 @@ struct ggml_context_container {
 //
 
 enum ggml_task_type {
-    GGML_TASK_INIT     = 1,
-    GGML_TASK_COMPUTE  = 2,
-    GGML_TASK_FINALIZE = 4,
+    GGML_TASK_INIT = 0,
+    GGML_TASK_COMPUTE,
+    GGML_TASK_FINALIZE,
 };
 
 struct ggml_compute_params {
@@ -9260,13 +9260,14 @@ static void ggml_graph_compute_thread(void * data) {
     int type = state->params.type;
     if (state->node) {
         if (state->params.ith < state->params.nth) {
-            if (type & GGML_TASK_INIT)
+            if (type == GGML_TASK_INIT)
             {
                 state->params.type = GGML_TASK_INIT;
                 ggml_compute_forward(&state->params, state->node);
+                type = GGML_TASK_COMPUTE;
             }
 
-            if (type & GGML_TASK_COMPUTE)
+            if (type == GGML_TASK_COMPUTE)
             {
                 state->params.type = GGML_TASK_COMPUTE;
                 ggml_compute_forward(&state->params, state->node);
@@ -9579,7 +9580,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                 workers[next_task].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_COMPUTE | GGML_TASK_INIT,
+                    .type  = GGML_TASK_INIT,
                     .ith   = 0,
                     .nth   = 1,
                     .wsize = 0,

From 6f2a61eb4ff433ca413d66b39ce297bb9ba9cbc0 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Mon, 10 Apr 2023 22:24:27 +0800
Subject: [PATCH 12/14] Rework scheduling algorithm.

---
 ggml.c | 119 +++++++++++++++++++++++++++------------------------------
 1 file changed, 56 insertions(+), 63 deletions(-)

diff --git a/ggml.c b/ggml.c
index c33ad2ca95ef8..cebf43d577584 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9279,23 +9279,22 @@ static void ggml_graph_compute_thread(void * data) {
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
     const int n_threads = cgraph->n_threads;
-    struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
+    const int max_requests = n_threads * 5;
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*(max_requests));
 
     // create thread pool
-    if (n_threads > 1) {
-        ctx->tpool = thpool_init(n_threads);
-        for (int j = 0; j < n_threads - 1; j++) {
-            workers[j] = (struct ggml_compute_state) {
-                .params = {
-                    .type  = GGML_TASK_COMPUTE,
-                    .ith   = j + 1,
-                    .nth   = n_threads,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                },
-                .node   = NULL,
-            };
-        }
+    ctx->tpool = thpool_init(n_threads);
+    for (int j = 0; j < n_threads - 1; j++) {
+        workers[j] = (struct ggml_compute_state) {
+            .params = {
+                .type  = GGML_TASK_COMPUTE,
+                .ith   = j + 1,
+                .nth   = n_threads,
+                .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
+                .wdata = cgraph->work ? cgraph->work->data : NULL,
+            },
+            .node   = NULL,
+        };
+    }
 
     // initialize tasks + work buffer
@@ -9504,6 +9504,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
     const int64_t perf_start_cycles  = ggml_perf_cycles();
     const int64_t perf_start_time_us = ggml_perf_time_us();
+    const size_t wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0;
+    const void* wdata  = cgraph->work ? cgraph->work->data : NULL;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
@@ -9524,52 +9525,31 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         const int64_t perf_node_start_cycles  = ggml_perf_cycles();
         const int64_t perf_node_start_time_us = ggml_perf_time_us();
 
-        // INIT
-        struct ggml_compute_params params = {
-            .type  = GGML_TASK_INIT,
-            .ith   = 0,
-            .nth   = node->n_tasks,
-            .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-            .wdata = cgraph->work ? cgraph->work->data : NULL,
-        };
-
-        ggml_compute_forward(&params, node);
-
         int next_task = 0;
 
         // COMPUTE
-        if (node->n_tasks > 1) {
-            // launch thread pool
-            for (int j = 0; j < n_threads - 1; j++) {
-                workers[j].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_COMPUTE,
-                    .ith   = j + 1,
-                    .nth   = node->n_tasks,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                };
-                workers[j].node = node;
-                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[j]);
-            }
-        }
-        else
         {
             int start = i;
             int end = i;
-            while (end < cgraph->n_nodes && next_task < n_threads && (end - start) < n_threads * 2)
+            while (end < cgraph->n_nodes && (end - start) < n_threads * 2)
             {
                 struct ggml_tensor * next = cgraph->nodes[end];
                 end++;
 
-                if (next->n_tasks != 1)
-                    continue;
+                // already scheduled
+                if (next->n_tasks == 0)
+                    continue;
+
+                // skip if there are not enough request slots left
+                if (next_task + next->n_tasks > max_requests)
+                    continue;
 
                 // check src dependency
                 bool is_dep = false;
                 for (int k = start; k < end; k++)
                 {
-                    struct ggml_tensor * node = cgraph->nodes[k];
-                    if (next->src0 == node || next->src1 == node)
+                    struct ggml_tensor * prev = cgraph->nodes[k];
+                    if (next->src0 == prev || next->src1 == prev)
                     {
                         is_dep = true;
                         break;
                     }
                 }
                 if (is_dep)
                     continue;
 
-                workers[next_task].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_INIT,
-                    .ith   = 0,
-                    .nth   = 1,
-                    .wsize = 0,
-                    .wdata = NULL,
-                };
-                workers[next_task].node = next;
+                if (next->n_tasks > 1)
+                {
+                    // run INIT in main thread if it is multi thread operator
+                    struct ggml_compute_params params = {
+                        .type  = GGML_TASK_INIT,
+                        .ith   = 0,
+                        .nth   = next->n_tasks,
+                        .wsize = wsize,
+                        .wdata = wdata,
+                    };
+
+                    ggml_compute_forward(&params, next);
+                }
 
-                thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
-                next->n_tasks = 0; // indicate this node is calculated
-                next_task++;
-                //printf("Combine task [%d, %d]\n", start, end);
+                for (int j = 0; j < next->n_tasks; j++) {
+                    workers[next_task].params = (struct ggml_compute_params){
+                        // single thread operator runs INIT in worker thread
+                        .type  = next->n_tasks == 1 ? GGML_TASK_INIT : GGML_TASK_COMPUTE,
+                        .ith   = j,
+                        .nth   = next->n_tasks,
+
+                        // TODO: Potential race on wdata
+                        .wsize = wsize,
+                        .wdata = wdata,
+                    };
+                    workers[next_task].node = next;
+
+                    thpool_add_work(ctx->tpool, ggml_graph_compute_thread, &workers[next_task]);
+                    next_task++;
+                }
+                next->n_tasks = 0; // indicate this node is calculated
             }
         }
 
-        params.type = GGML_TASK_COMPUTE;
-        ggml_compute_forward(&params, node);
-
         // wait for thread pool
-        if (node->n_tasks > 1 || next_task != 0) {
-            thpool_wait(ctx->tpool);
-        }
+        thpool_wait(ctx->tpool);
 #if 0
         // FINALIZE
         if (node->n_tasks > 1) {
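Concretely, the reworked scheduler scans a window of upcoming nodes, batches every node whose src0/src1 do not point back into the window scanned so far, and leaves dependent nodes for a later iteration of the outer loop after thpool_wait. A self-contained miniature of that dependency scan (toy node struct and window size, not ggml's real types):

    #include <stdbool.h>
    #include <stdio.h>

    /* toy node: two input edges, like ggml's src0/src1 */
    struct tnode { const char * name; struct tnode * src0, * src1; };

    int main(void) {
        struct tnode a = { "A", NULL, NULL };
        struct tnode b = { "B", NULL, NULL };   /* independent of A     */
        struct tnode c = { "C", &a,   NULL };   /* consumes A: deferred */
        struct tnode * nodes[] = { &a, &b, &c };
        const int n_nodes = 3, window = 4;

        int start = 0, end = start;
        while (end < n_nodes && (end - start) < window) {
            struct tnode * next = nodes[end];
            end++;

            /* same scan as the patch: compare against everything in the window so far */
            bool is_dep = false;
            for (int k = start; k < end; k++) {
                if (next->src0 == nodes[k] || next->src1 == nodes[k]) {
                    is_dep = true;
                    break;
                }
            }
            printf("%s: %s\n", next->name, is_dep ? "deferred" : "batched");
        }
        return 0;   /* prints: A: batched, B: batched, C: deferred */
    }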
From 6d18c6ea3e0097594a67df43bd9bf7dd05c07df3 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Mon, 10 Apr 2023 22:37:10 +0800
Subject: [PATCH 13/14] Fix the number of forward looking nodes

---
 ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index cebf43d577584..3004e9932ace9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9531,7 +9531,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
         {
             int start = i;
             int end = i;
-            while (end < cgraph->n_nodes && (end - start) < n_threads * 2)
+            while (end < cgraph->n_nodes && (end - start) < 48)
             {
                 struct ggml_tensor * next = cgraph->nodes[end];
                 end++;

From 94ddd6204ce6e0c0f51adc50e9c3f8211f4038d2 Mon Sep 17 00:00:00 2001
From: Howard Su
Date: Mon, 10 Apr 2023 22:37:37 +0800
Subject: [PATCH 14/14] Simplify the logic of scheduling

---
 ggml.c | 37 ++++++++++---------------------------
 1 file changed, 10 insertions(+), 27 deletions(-)

diff --git a/ggml.c b/ggml.c
index 3004e9932ace9..c8a14964576ba 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9258,22 +9258,17 @@ struct ggml_compute_state {
 
 static void ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
     int type = state->params.type;
-    if (state->node) {
-        if (state->params.ith < state->params.nth) {
-            if (type == GGML_TASK_INIT)
-            {
-                state->params.type = GGML_TASK_INIT;
-                ggml_compute_forward(&state->params, state->node);
-                type = GGML_TASK_COMPUTE;
-            }
+    if (type == GGML_TASK_INIT)
+    {
+        state->params.type = GGML_TASK_INIT;
+        ggml_compute_forward(&state->params, state->node);
+        type = GGML_TASK_COMPUTE;
+    }
 
-            if (type == GGML_TASK_COMPUTE)
-            {
-                state->params.type = GGML_TASK_COMPUTE;
-                ggml_compute_forward(&state->params, state->node);
-            }
-        }
-        state->node = NULL;
+    if (type == GGML_TASK_COMPUTE)
+    {
+        state->params.type = GGML_TASK_COMPUTE;
+        ggml_compute_forward(&state->params, state->node);
     }
 }
 
@@ -9284,18 +9279,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
     // create thread pool
     ctx->tpool = thpool_init(n_threads);
-    for (int j = 0; j < n_threads - 1; j++) {
-        workers[j] = (struct ggml_compute_state) {
-            .params = {
-                .type  = GGML_TASK_COMPUTE,
-                .ith   = j + 1,
-                .nth   = n_threads,
-                .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                .wdata = cgraph->work ? cgraph->work->data : NULL,
-            },
-            .node   = NULL,
-        };
-    }
 
     // initialize tasks + work buffer
     {