@@ -1,8 +1,7 @@
 #include "llama.h"
-
 #include "ggml.h"
-
 #include "ggml-alloc.h"
+#include "threadpool.h"
 
 #ifdef GGML_USE_CUBLAS
 #  include "ggml-cuda.h"
@@ -60,6 +59,7 @@
 #include <cstring>
 #include <ctime>
 #include <fstream>
+#include <functional>
 #include <initializer_list>
 #include <map>
 #include <memory>
@@ -4640,8 +4640,8 @@ struct no_init {
 };
 
 static void llama_convert_tensor_internal(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
-    const size_t nelements, const int nthread
+    thread_pool<void> & pool, struct ggml_tensor * tensor, std::vector<no_init<float>> & output, const size_t nelements,
+    const int nthread
 ) {
     if (output.size() < nelements) {
         output.resize(nelements);
@@ -4677,6 +4677,8 @@ static void llama_convert_tensor_internal(
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
+    std::vector<std::future<void>> workers;
+    workers.reserve(nthread);
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4689,12 +4691,14 @@ static void llama_convert_tensor_internal(
                 qtype.to_float(inbuf, outbuf, nels);
             }
         };
-        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
+        auto future = pool.push(std::bind(
+            compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems
+        ));
+        workers.push_back(std::move(future));
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
-    for (auto & w : workers) { w.join(); }
-    workers.clear();
+    for (auto & w : workers) { w.wait(); }
 }
 
 #ifdef GGML_USE_K_QUANTS
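Note on threadpool.h: the header itself is not part of the hunks shown, so only its usage is visible here (thread_pool<void> pool(nthread), pool.push(fn) returning a std::future<void> that is later wait()ed on). The sketch below is an assumption of what a minimal pool compatible with those calls could look like, not the header this commit actually adds; the class layout, the worker_loop name, and the packaged_task plumbing are all illustrative.

// Hypothetical sketch only -- NOT the threadpool.h added by this commit.
// Interface is inferred from the calls in the diff above.
#include <condition_variable>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

template <typename T>   // only thread_pool<void> is exercised by this change
class thread_pool {
public:
    explicit thread_pool(int nthread) {
        for (int i = 0; i < nthread; ++i) {
            threads.emplace_back([this] { worker_loop(); });
        }
    }

    ~thread_pool() {
        {
            std::lock_guard<std::mutex> lock(mtx);
            stop = true;
        }
        cv.notify_all();
        for (auto & t : threads) { t.join(); }
    }

    // queue a task and hand back a future the caller can wait() on
    std::future<T> push(std::function<T()> fn) {
        auto task = std::make_shared<std::packaged_task<T()>>(std::move(fn));
        std::future<T> fut = task->get_future();
        {
            std::lock_guard<std::mutex> lock(mtx);
            tasks.push([task] { (*task)(); });
        }
        cv.notify_one();
        return fut;
    }

private:
    void worker_loop() {
        while (true) {
            std::function<void()> job;
            {
                std::unique_lock<std::mutex> lock(mtx);
                cv.wait(lock, [this] { return stop || !tasks.empty(); });
                if (stop && tasks.empty()) { return; }
                job = std::move(tasks.front());
                tasks.pop();
            }
            job();
        }
    }

    std::vector<std::thread>          threads;
    std::queue<std::function<void()>> tasks;
    std::mutex                        mtx;
    std::condition_variable           cv;
    bool                              stop = false;
};

Wrapping each submitted std::function in a std::packaged_task is one straightforward way to hand a future back to the caller; the commit's real implementation may well differ.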
@@ -4892,8 +4896,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
-    std::vector<std::thread> workers;
-    workers.reserve(nthread);
+    std::vector<std::future<void>> workers;
+    workers.reserve(nthread - 1);
+    thread_pool<void> pool(nthread);
     std::mutex mutex;
 
     int idx = 0;
@@ -4974,7 +4979,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
             throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
         } else {
-            llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+            llama_convert_tensor_internal(pool, tensor, f32_conv_buf, nelements, nthread);
             f32_data = (float *) f32_conv_buf.data();
         }
 
@@ -5016,10 +5021,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     }
                 };
                 for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers.emplace_back(compute);
+                    workers.push_back(pool.push(compute));
                 }
                 compute();
-                for (auto & w : workers) { w.join(); }
+                for (auto & w : workers) { w.wait(); }
                 workers.clear();
             }
 
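For reference, the push-then-wait shape that both llama_convert_tensor_internal and llama_model_quantize_internal now share can be exercised on its own. The snippet below is purely illustrative and assumes the hypothetical thread_pool sketch above is in scope; it mirrors the quantization loop, which queues nthread - 1 tasks and runs one compute() inline on the calling thread before waiting on every future.

#include <cstdio>
#include <future>
#include <vector>
// assumes the hypothetical thread_pool sketch shown earlier is available above main()

int main() {
    const int nthread = 4;
    thread_pool<void> pool(nthread);

    std::vector<std::future<void>> workers;
    workers.reserve(nthread - 1);

    auto compute = [] { std::puts("processed one chunk"); };

    for (int it = 0; it < nthread - 1; ++it) {
        workers.push_back(pool.push(compute));   // queue on the pool, keep the future
    }
    compute();                                   // calling thread does a share of the work
    for (auto & w : workers) { w.wait(); }       // block until every queued task has finished
    return 0;
}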