@@ -4894,11 +4894,26 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }

-    int nthread;
+    int nthread, nthread2 = 1;
     if (params->nthread > 0) {
         nthread = params->nthread;
     } else {
-        nthread = std::max(2u, std::thread::hardware_concurrency()) / 2;
+        nthread = std::thread::hardware_concurrency();
+    }
+    switch (quantized_type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            // roughly double overall thread count for fast quants
+            // these numbers were chosen empirically and may not be ideal in all cases
+            nthread2 = 4;
+            nthread = std::max(2, nthread) * 5 / 4;
+        default:
+            ;
     }

     thread_pool<void> pool(nthread);
@@ -4953,9 +4968,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);

-    const int nthreads2 = 4;
-    std::vector<std::vector<no_init<uint8_t>>> read_data_pool(nthreads2);
-    std::vector<std::vector<no_init<uint8_t>>> work_pool(nthreads2);
+    std::vector<std::vector<no_init<uint8_t>>> read_data_pool(nthread2);
+    std::vector<std::vector<no_init<uint8_t>>> work_pool(nthread2);

     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4972,7 +4986,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     fout.seek(meta_size, SEEK_SET);

-    std::vector<std::vector<no_init<float>>> f32_conv_buf_pool(nthreads2);
+    std::vector<std::vector<no_init<float>>> f32_conv_buf_pool(nthread2);
     std::mutex log_mutex;

    std::vector<ggml_type> quant_tensor_types;
@@ -5139,7 +5153,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };

     {
-        thread_pool<void> pool2(nthreads2);
+        thread_pool<void> pool2(nthread2);
         for (int i = 0; i < ml->n_tensors; ++i) {
             pool2.push([&big_loop, i](int tid) { big_loop(i, tid); });
         }
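For context, a minimal self-contained sketch of the pattern this change relies on: a secondary pool of `nthread2` workers where each task receives the worker's thread id and uses its own entry of a preallocated scratch-buffer pool (which appears to be how `read_data_pool` / `work_pool` / `f32_conv_buf_pool` are consumed inside `big_loop`, though the indexing is not shown in this hunk). The `simple_pool` below is a stand-in for the `thread_pool` used in this branch, not its actual implementation; all names and sizes are illustrative only.

```cpp
// Sketch: fixed-size worker pool; each worker passes its own id to the task,
// so tasks can safely reuse per-thread scratch buffers without locking.
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

struct simple_pool {
    simple_pool(int n) {
        for (int tid = 0; tid < n; ++tid) {
            workers.emplace_back([this, tid] {
                for (;;) {
                    std::function<void(int)> task;
                    {
                        std::unique_lock<std::mutex> lock(m);
                        cv.wait(lock, [this] { return done || !tasks.empty(); });
                        if (done && tasks.empty()) return;
                        task = std::move(tasks.front());
                        tasks.pop();
                    }
                    task(tid); // worker hands its id to the task
                }
            });
        }
    }
    void push(std::function<void(int)> task) {
        { std::lock_guard<std::mutex> lock(m); tasks.push(std::move(task)); }
        cv.notify_one();
    }
    ~simple_pool() {
        // drain remaining tasks, then join the workers
        { std::lock_guard<std::mutex> lock(m); done = true; }
        cv.notify_all();
        for (auto & w : workers) w.join();
    }
    std::vector<std::thread> workers;
    std::queue<std::function<void(int)>> tasks;
    std::mutex m;
    std::condition_variable cv;
    bool done = false;
};

int main() {
    const int nthread2 = 4;                                 // pool size, as for the fast quant types in the patch
    std::vector<std::vector<uint8_t>> work_pool(nthread2);  // one scratch buffer per worker, indexed by tid

    simple_pool pool2(nthread2);
    for (int i = 0; i < 16; ++i) {
        pool2.push([&work_pool, i](int tid) {
            auto & buf = work_pool[tid];   // safe: only worker `tid` ever touches this buffer
            buf.assign(1024, uint8_t(i));  // stand-in for reading/quantizing tensor i
            std::printf("tensor %2d handled by worker %d\n", i, tid);
        });
    }
    return 0; // pool2 is destroyed before work_pool, so workers finish first
}
```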