Skip to content

Commit 40ff9fa

Browse files
cebtenzzre
authored and ggerganov committed
llama : quantize up to 31% faster on Linux and Windows with mmap (ggml-org#3206)
* llama : enable mmap in quantize on Linux -> 31% faster * also enable mmap on Windows --------- Co-authored-by: Georgi Gerganov <[email protected]>
1 parent f5eb5f6 commit 40ff9fa

File tree

1 file changed

+17
-4
lines changed

1 file changed

+17
-4
lines changed

llama.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
60276027
nthread = std::thread::hardware_concurrency();
60286028
}
60296029

6030-
llama_model_loader ml(fname_inp, /*use_mmap*/ false);
6030+
// mmap consistently increases speed on Linux, and also increases speed on Windows with
6031+
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
6032+
#if defined(__linux__) || defined(_WIN32)
6033+
constexpr bool use_mmap = true;
6034+
#else
6035+
constexpr bool use_mmap = false;
6036+
#endif
6037+
6038+
llama_model_loader ml(fname_inp, use_mmap);
6039+
if (ml.use_mmap) {
6040+
ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
6041+
}
60316042

60326043
llama_model model;
60336044
llm_load_arch(ml, model);
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
61056116

61066117
const std::string name = ggml_get_name(tensor);
61076118

6108-
if (read_data.size() < ggml_nbytes(tensor)) {
6109-
read_data.resize(ggml_nbytes(tensor));
6119+
if (!ml.use_mmap) {
6120+
if (read_data.size() < ggml_nbytes(tensor)) {
6121+
read_data.resize(ggml_nbytes(tensor));
6122+
}
6123+
tensor->data = read_data.data();
61106124
}
6111-
tensor->data = read_data.data();
61126125
ml.load_data_for(tensor);
61136126

61146127
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",

0 commit comments

Comments
 (0)