@@ -1,8 +1,7 @@
 #include "llama.h"
-
 #include "ggml.h"
-
 #include "ggml-alloc.h"
+#include "threadpool.h"
 
 #ifdef GGML_USE_CUBLAS
 #  include "ggml-cuda.h"
@@ -60,6 +59,7 @@
 #include <cstring>
 #include <ctime>
 #include <fstream>
+#include <functional>
 #include <initializer_list>
 #include <map>
 #include <memory>
@@ -4640,8 +4640,8 @@ struct no_init {
 };
 
 static void llama_convert_tensor_internal(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
-    const size_t nelements, const int nthread
+    thread_pool<void> & pool, struct ggml_tensor * tensor, std::vector<no_init<float>> & output, const size_t nelements,
+    const int nthread
 ) {
     if (output.size() < nelements) {
         output.resize(nelements);
@@ -4677,6 +4677,8 @@ static void llama_convert_tensor_internal(
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
+    std::vector<std::future<void>> workers;
+    workers.reserve(nthread);
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4689,12 +4691,14 @@ static void llama_convert_tensor_internal(
                 qtype.to_float(inbuf, outbuf, nels);
             }
         };
-        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
+        auto future = pool.push(std::bind(
+            compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems
+        ));
+        workers.push_back(std::move(future));
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
-    for (auto & w : workers) { w.join(); }
-    workers.clear();
+    for (auto & w : workers) { w.wait(); }
 }
 
 #ifdef GGML_USE_K_QUANTS
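Note on threadpool.h: the header itself is not part of the hunks shown, so only its usage is visible here (thread_pool<void> pool(nthread), pool.push(fn) returning a std::future<void> that is later wait()ed on). The sketch below is an assumption of what a minimal pool compatible with those calls could look like, not the header this commit actually adds; the class layout, the worker_loop name, and the packaged_task plumbing are all illustrative.

// Hypothetical sketch only -- NOT the threadpool.h added by this commit.
// Interface is inferred from the calls in the diff above.
#include <condition_variable>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

template <typename T>   // only thread_pool<void> is exercised by this change
class thread_pool {
public:
    explicit thread_pool(int nthread) {
        for (int i = 0; i < nthread; ++i) {
            threads.emplace_back([this] { worker_loop(); });
        }
    }

    ~thread_pool() {
        {
            std::lock_guard<std::mutex> lock(mtx);
            stop = true;
        }
        cv.notify_all();
        for (auto & t : threads) { t.join(); }
    }

    // queue a task and hand back a future the caller can wait() on
    std::future<T> push(std::function<T()> fn) {
        auto task = std::make_shared<std::packaged_task<T()>>(std::move(fn));
        std::future<T> fut = task->get_future();
        {
            std::lock_guard<std::mutex> lock(mtx);
            tasks.push([task] { (*task)(); });
        }
        cv.notify_one();
        return fut;
    }

private:
    void worker_loop() {
        while (true) {
            std::function<void()> job;
            {
                std::unique_lock<std::mutex> lock(mtx);
                cv.wait(lock, [this] { return stop || !tasks.empty(); });
                if (stop && tasks.empty()) { return; }
                job = std::move(tasks.front());
                tasks.pop();
            }
            job();
        }
    }

    std::vector<std::thread>          threads;
    std::queue<std::function<void()>> tasks;
    std::mutex                        mtx;
    std::condition_variable           cv;
    bool                              stop = false;
};

Wrapping each submitted std::function in a std::packaged_task is one straightforward way to hand a future back to the caller; the commit's real implementation may well differ.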
@@ -4892,8 +4896,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
-    std::vector<std::thread> workers;
-    workers.reserve(nthread);
+    std::vector<std::future<void>> workers;
+    workers.reserve(nthread - 1);
+    thread_pool<void> pool(nthread);
     std::mutex mutex;
 
     int idx = 0;
@@ -4974,7 +4979,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
             throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
         } else {
-            llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+            llama_convert_tensor_internal(pool, tensor, f32_conv_buf, nelements, nthread);
             f32_data = (float *) f32_conv_buf.data();
         }
 
@@ -5016,10 +5021,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     }
                 };
                 for (int it = 0; it < nthread_use - 1; ++it) {
-                    workers.emplace_back(compute);
+                    workers.push_back(pool.push(compute));
                 }
                 compute();
-                for (auto & w : workers) { w.join(); }
+                for (auto & w : workers) { w.wait(); }
                 workers.clear();
             }
 
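For reference, the push-then-wait shape that both llama_convert_tensor_internal and llama_model_quantize_internal now share can be exercised on its own. The snippet below is purely illustrative and assumes the hypothetical thread_pool sketch above is in scope; it mirrors the quantization loop, which queues nthread - 1 tasks and runs one compute() inline on the calling thread before waiting on every future.

#include <cstdio>
#include <future>
#include <vector>
// assumes the hypothetical thread_pool sketch shown earlier is available above main()

int main() {
    const int nthread = 4;
    thread_pool<void> pool(nthread);

    std::vector<std::future<void>> workers;
    workers.reserve(nthread - 1);

    auto compute = [] { std::puts("processed one chunk"); };

    for (int it = 0; it < nthread - 1; ++it) {
        workers.push_back(pool.push(compute));   // queue on the pool, keep the future
    }
    compute();                                   // calling thread does a share of the work
    for (auto & w : workers) { w.wait(); }       // block until every queued task has finished
    return 0;
}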