@@ -4894,11 +4894,26 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }

-    int nthread;
+    int nthread, nthread2 = 1;
     if (params->nthread > 0) {
         nthread = params->nthread;
     } else {
-        nthread = std::max(2u, std::thread::hardware_concurrency()) / 2;
+        nthread = std::thread::hardware_concurrency();
+    }
+    switch (quantized_type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            // roughly double overall thread count for fast quants
+            // these numbers were chosen empirically and may not be ideal in all cases
+            nthread2 = 4;
+            nthread = std::max(2, nthread) * 5 / 4;
+        default:
+            ;
     }

     thread_pool<void> pool(nthread);
@@ -4953,9 +4968,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);

-    const int nthreads2 = 4;
-    std::vector<std::vector<no_init<uint8_t>>> read_data_pool(nthreads2);
-    std::vector<std::vector<no_init<uint8_t>>> work_pool(nthreads2);
+    std::vector<std::vector<no_init<uint8_t>>> read_data_pool(nthread2);
+    std::vector<std::vector<no_init<uint8_t>>> work_pool(nthread2);

     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4972,7 +4986,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     fout.seek(meta_size, SEEK_SET);

-    std::vector<std::vector<no_init<float>>> f32_conv_buf_pool(nthreads2);
+    std::vector<std::vector<no_init<float>>> f32_conv_buf_pool(nthread2);
     std::mutex log_mutex;

    std::vector<ggml_type> quant_tensor_types;
@@ -5139,7 +5153,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };

     {
-        thread_pool<void> pool2(nthreads2);
+        thread_pool<void> pool2(nthread2);
         for (int i = 0; i < ml->n_tensors; ++i) {
             pool2.push([&big_loop, i](int tid) { big_loop(i, tid); });
         }
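For context, a minimal self-contained sketch of the pattern this change relies on: a secondary pool of `nthread2` workers where each task receives the worker's thread id and uses its own entry of a preallocated scratch-buffer pool (which appears to be how `read_data_pool` / `work_pool` / `f32_conv_buf_pool` are consumed inside `big_loop`, though the indexing is not shown in this hunk). The `simple_pool` below is a stand-in for the `thread_pool` used in this branch, not its actual implementation; all names and sizes are illustrative only.

```cpp
// Sketch: fixed-size worker pool; each worker passes its own id to the task,
// so tasks can safely reuse per-thread scratch buffers without locking.
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

struct simple_pool {
    simple_pool(int n) {
        for (int tid = 0; tid < n; ++tid) {
            workers.emplace_back([this, tid] {
                for (;;) {
                    std::function<void(int)> task;
                    {
                        std::unique_lock<std::mutex> lock(m);
                        cv.wait(lock, [this] { return done || !tasks.empty(); });
                        if (done && tasks.empty()) return;
                        task = std::move(tasks.front());
                        tasks.pop();
                    }
                    task(tid); // worker hands its id to the task
                }
            });
        }
    }
    void push(std::function<void(int)> task) {
        { std::lock_guard<std::mutex> lock(m); tasks.push(std::move(task)); }
        cv.notify_one();
    }
    ~simple_pool() {
        // drain remaining tasks, then join the workers
        { std::lock_guard<std::mutex> lock(m); done = true; }
        cv.notify_all();
        for (auto & w : workers) w.join();
    }
    std::vector<std::thread> workers;
    std::queue<std::function<void(int)>> tasks;
    std::mutex m;
    std::condition_variable cv;
    bool done = false;
};

int main() {
    const int nthread2 = 4;                                 // pool size, as for the fast quant types in the patch
    std::vector<std::vector<uint8_t>> work_pool(nthread2);  // one scratch buffer per worker, indexed by tid

    simple_pool pool2(nthread2);
    for (int i = 0; i < 16; ++i) {
        pool2.push([&work_pool, i](int tid) {
            auto & buf = work_pool[tid];   // safe: only worker `tid` ever touches this buffer
            buf.assign(1024, uint8_t(i));  // stand-in for reading/quantizing tensor i
            std::printf("tensor %2d handled by worker %d\n", i, tid);
        });
    }
    return 0; // pool2 is destroyed before work_pool, so workers finish first
}
```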