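Faster quantize: make histogram collection optional (off by default) and reuse the f32 conversion buffer across tensors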

Kawrakow · Kawrakow · commit da030ed0635e · 2023-09-12T11:29:45.000+02:00
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
@@ -154,12 +154,23 @@ int main(int argc, char ** argv) {
     if (argc > arg_idx) {
         try {
             params.nthread = std::stoi(argv[arg_idx]);
+            ++arg_idx;
         }
         catch (const std::exception & e) {
             fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
             return 1;
         }
     }
+    if (argc > arg_idx) {
+        try {
+            params.collect_histo = std::stoi(argv[arg_idx]);
+            ++arg_idx;
+        }
+        catch (const std::exception & e) {
+            fprintf(stderr, "%s: invalid collect_histo '%s' (%s)\n", __func__, argv[arg_idx], e.what());
+            return 1;
+        }
+    }
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
diff --git a/ggml.c b/ggml.c
@@ -19366,13 +19366,15 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
 
         quantize_row_q4_0_reference(src + b, y, k);
 
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < QK4_0; j += 2) {
-                const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
+        if (hist) {
+            for (int i = 0; i < nb; i++) {
+                for (int j = 0; j < QK4_0; j += 2) {
+                    const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
+                    const uint8_t vi1 = y[i].qs[j/2] >> 4;
+
+                    hist[vi0]++;
+                    hist[vi1]++;
+                }
             }
         }
     }
@@ -19389,13 +19391,15 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 
         quantize_row_q4_1_reference(src + b, y, k);
 
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < QK4_1; j += 2) {
-                const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[j/2] >> 4;
+        if (hist) {
+            for (int i = 0; i < nb; i++) {
+                for (int j = 0; j < QK4_1; j += 2) {
+                    const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
+                    const uint8_t vi1 = y[i].qs[j/2] >> 4;
 
-                hist[vi0]++;
-                hist[vi1]++;
+                    hist[vi0]++;
+                    hist[vi1]++;
+                }
             }
         }
     }
@@ -19412,20 +19416,22 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
 
         quantize_row_q5_0_reference(src + b, y, k);
 
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
+        if (hist) {
+            for (int i = 0; i < nb; i++) {
+                uint32_t qh;
+                memcpy(&qh, &y[i].qh, sizeof(qh));
 
-            for (int j = 0; j < QK5_0; j += 2) {
-                const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-                const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+                for (int j = 0; j < QK5_0; j += 2) {
+                    const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+                    const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
 
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[j/2] >>   4) | vh1) / 2;
+                    // cast to 16 bins
+                    const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
+                    const uint8_t vi1 = ((y[i].qs[j/2] >>   4) | vh1) / 2;
 
-                hist[vi0]++;
-                hist[vi1]++;
+                    hist[vi0]++;
+                    hist[vi1]++;
+                }
             }
         }
     }
@@ -19442,20 +19448,22 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
 
         quantize_row_q5_1_reference(src + b, y, k);
 
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
+        if (hist) {
+            for (int i = 0; i < nb; i++) {
+                uint32_t qh;
+                memcpy(&qh, &y[i].qh, sizeof(qh));
 
-            for (int j = 0; j < QK5_1; j += 2) {
-                const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-                const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+                for (int j = 0; j < QK5_1; j += 2) {
+                    const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
+                    const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
 
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[j/2] >>   4) | vh1) / 2;
+                    // cast to 16 bins
+                    const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
+                    const uint8_t vi1 = ((y[i].qs[j/2] >>   4) | vh1) / 2;
 
-                hist[vi0]++;
-                hist[vi1]++;
+                    hist[vi0]++;
+                    hist[vi1]++;
+                }
             }
         }
     }
@@ -19472,11 +19480,13 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
 
         quantize_row_q8_0_reference(src + b, y, k);
 
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < QK8_0; ++j) {
-                const int8_t vi = y[i].qs[j];
+        if (hist) {
+            for (int i = 0; i < nb; i++) {
+                for (int j = 0; j < QK8_0; ++j) {
+                    const int8_t vi = y[i].qs[j];
 
-                hist[vi/16 + 8]++;
+                    hist[vi/16 + 8]++;
+                }
             }
         }
     }
diff --git a/llama.cpp b/llama.cpp
@@ -4726,6 +4726,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
     int nthread = params->nthread;
+    bool collect_histo = params->collect_histo;
 
     if (nthread <= 0) {
         nthread = std::thread::hardware_concurrency();
@@ -4808,6 +4809,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
+    std::vector<float> f32_conv_buf;
+
     for (int i = 0; i < ml->n_tensors; ++i) {
         struct ggml_tensor * tensor = ml->get_tensor_meta(i);
 
@@ -4947,7 +4950,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
-            std::vector<float> f32_conv_buf;
 
             if (tensor->type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor->data;
@@ -4963,7 +4965,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
             work.resize(nelements * 4); // upper bound on size
             new_data = work.data();
-            std::vector<int64_t> hist_cur(1 << 4, 0);
+            std::vector<int64_t> hist_cur;
+            if (collect_histo) {
+                hist_cur.resize(1 << 4, 0);
+            }
 
             static const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4990,7 +4995,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                         }
                         lock.unlock();
                         size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) {
+                        if (local_hist.empty() && !hist_cur.empty()) {
                             local_hist.resize(hist_cur.size(), 0);
                         }
                         local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
@@ -5379,6 +5384,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
         /*.only_copy                   =*/ false,
+        /*.collect_histo               =*/ false,
     };
 
     return result;
diff --git a/llama.h b/llama.h
@@ -165,6 +165,7 @@ extern "C" {
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
         bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool collect_histo;          // collect quant histogram when quantizing?
     } llama_model_quantize_params;
 
     // grammar types