Commit 96c8042

process one tensor at a time with k-quants

k-quants are less I/O-bottlenecked because they are more CPU-intensive, so they do not seem to benefit from the extra tensor-level threads.
1 parent 7a4de4c
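
As a side note on what this change does: the outer level (nthread2) controls how many tensors are quantized at once, while the inner level (nthread) splits one tensor's math across threads. Below is a minimal, self-contained C++ sketch of that scheme, for illustration only. The quantize_tensor helper and the raw std::thread plumbing are invented stand-ins for the fork's thread_pool; only the nthread/nthread2 arithmetic mirrors the diff below.

// Illustrative sketch of the two-level threading scheme; not llama.cpp code.
#include <algorithm>
#include <cstdio>
#include <thread>
#include <vector>

// Inner level: one tensor's quantization split across `nthread` threads.
// (Hypothetical helper; the placeholder printf stands in for the real math.)
static void quantize_tensor(int tensor_id, int nthread) {
    std::vector<std::thread> workers;
    for (int t = 0; t < nthread; ++t) {
        workers.emplace_back([=] {
            std::printf("tensor %d: chunk %d/%d\n", tensor_id, t, nthread);
        });
    }
    for (auto & w : workers) { w.join(); }
}

int main() {
    const bool is_k_quant = true;  // e.g. Q4_K; false for Q4_0/Q5_0/Q8_0/F16/F32
    int nthread  = (int) std::thread::hardware_concurrency();
    int nthread2 = 1;              // k-quants: one tensor at a time
    if (!is_k_quant) {
        // fast quants are I/O bound: overlap 4 tensors, bump the math threads
        nthread2 = 4;
        nthread  = std::max(2, nthread) * 5 / 4;  // same arithmetic as the diff
    }
    nthread = std::max(1, nthread);  // hardware_concurrency() may report 0

    const int n_tensors = 8;
    for (int i = 0; i < n_tensors; i += nthread2) {
        // Outer level: up to `nthread2` tensors in flight at once.
        std::vector<std::thread> outer;
        for (int j = i; j < std::min(i + nthread2, n_tensors); ++j) {
            outer.emplace_back(quantize_tensor, j, nthread);
        }
        for (auto & o : outer) { o.join(); }
    }
    return 0;
}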

File tree

1 file changed (+21, -7 lines)
llama.cpp

Lines changed: 21 additions & 7 deletions
@@ -4894,11 +4894,26 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
 
-    int nthread;
+    int nthread, nthread2 = 1;
     if (params->nthread > 0) {
         nthread = params->nthread;
     } else {
-        nthread = std::max(2u, std::thread::hardware_concurrency()) / 2;
+        nthread = std::thread::hardware_concurrency();
+    }
+    switch (quantized_type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            // roughly double overall thread count for fast quants
+            // these numbers were chosen empirically and may not be ideal in all cases
+            nthread2 = 4;
+            nthread = std::max(2, nthread) * 5 / 4;
+        default:
+            ;
     }
 
     thread_pool<void> pool(nthread);
@@ -4953,9 +4968,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     size_t total_size_new = 0;
     std::vector<int64_t> hist_all(1 << 4, 0);
 
-    const int nthreads2 = 4;
-    std::vector<std::vector<no_init<uint8_t>>> read_data_pool(nthreads2);
-    std::vector<std::vector<no_init<uint8_t>>> work_pool(nthreads2);
+    std::vector<std::vector<no_init<uint8_t>>> read_data_pool(nthread2);
+    std::vector<std::vector<no_init<uint8_t>>> work_pool(nthread2);
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4972,7 +4986,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     fout.seek(meta_size, SEEK_SET);
 
-    std::vector<std::vector<no_init<float>>> f32_conv_buf_pool(nthreads2);
+    std::vector<std::vector<no_init<float>>> f32_conv_buf_pool(nthread2);
     std::mutex log_mutex;
 
     std::vector<ggml_type> quant_tensor_types;
@@ -5139,7 +5153,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     {
-        thread_pool<void> pool2(nthreads2);
+        thread_pool<void> pool2(nthread2);
         for (int i = 0; i < ml->n_tensors; ++i) {
             pool2.push([&big_loop, i](int tid) { big_loop(i, tid); });
         }
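
The hunks above use a thread_pool<void> helper whose implementation is not part of this diff. A hypothetical minimal pool that matches the pool2.push([...](int tid) { ... }) call pattern could look like the following; this is an assumed sketch, not the fork's actual class.

// Hypothetical minimal stand-in for the thread_pool<void> helper used above.
// Usage matching the diff: thread_pool_sketch pool2(nthread2);
//                          pool2.push([&](int tid) { big_loop(i, tid); });
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class thread_pool_sketch {
public:
    explicit thread_pool_sketch(int nthread) {
        for (int tid = 0; tid < nthread; ++tid) {
            workers.emplace_back([this, tid] {
                for (;;) {
                    std::function<void(int)> task;
                    {
                        std::unique_lock<std::mutex> lock(m);
                        cv.wait(lock, [this] { return stop || !tasks.empty(); });
                        if (stop && tasks.empty()) return;
                        task = std::move(tasks.front());
                        tasks.pop();
                    }
                    task(tid);  // each worker passes its own id to the task
                }
            });
        }
    }

    void push(std::function<void(int)> task) {
        {
            std::lock_guard<std::mutex> lock(m);
            tasks.push(std::move(task));
        }
        cv.notify_one();
    }

    ~thread_pool_sketch() {
        // drain remaining tasks, then join all workers
        {
            std::lock_guard<std::mutex> lock(m);
            stop = true;
        }
        cv.notify_all();
        for (auto & w : workers) { w.join(); }
    }

private:
    std::vector<std::thread> workers;
    std::queue<std::function<void(int)>> tasks;
    std::mutex m;
    std::condition_variable cv;
    bool stop = false;
};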
