@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }
 
     llama_model model;
     llm_load_arch(ml, model);
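
Aside: prefetch is passed as 0 here because quantization streams through each tensor exactly once, so eagerly faulting the whole file in would waste I/O. As a rough illustration of what a no-prefetch, read-only mapping amounts to, here is a minimal POSIX-only sketch (the `mapped_file` type is hypothetical; the real llama_mmap additionally handles Windows, NUMA hints, and error reporting):

    // Hypothetical sketch of a lazy, read-only file mapping, in the spirit of
    // llama_mmap(&ml.file, /* prefetch */ 0, ...). POSIX-only, illustrative.
    #include <cstddef>
    #include <stdexcept>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    struct mapped_file {
        void * addr = nullptr;
        size_t size = 0;

        explicit mapped_file(const char * path) {
            int fd = open(path, O_RDONLY);
            if (fd < 0) { throw std::runtime_error("open failed"); }
            struct stat st{};
            fstat(fd, &st);
            size = (size_t) st.st_size;
            addr = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
            close(fd); // the mapping keeps the pages reachable after close
            if (addr == MAP_FAILED) { throw std::runtime_error("mmap failed"); }
            // prefetch == 0: no madvise(MADV_WILLNEED); pages are faulted in
            // lazily, one tensor at a time, as the quantize loop touches them.
        }
        ~mapped_file() { if (addr && addr != MAP_FAILED) { munmap(addr, size); } }
    };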
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const std::string name = ggml_get_name(tensor);
 
-        if (read_data.size() < ggml_nbytes(tensor)) {
-            read_data.resize(ggml_nbytes(tensor));
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-        tensor->data = read_data.data();
 
         ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
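
Note on this hunk: the read_data scratch buffer now backs tensor->data only on the non-mmap path; with mmap, the loader can point tensor->data directly into the mapping and skip the copy. A simplified free-function sketch of the two paths behind ml.load_data_for (the real code is a member of llama_model_loader; parameter names here are illustrative and the actual member names may differ):

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h" // ggml_tensor, ggml_nbytes

    // Sketch: load one tensor's bytes either from a mapping or from the file.
    static void load_tensor_data(
            struct ggml_tensor * cur,
            bool                 use_mmap,
            void *               mapping_addr, // base address of the mmap'd model file
            FILE *               fp,           // the model file, for the copy path
            size_t               offs) {       // tensor's byte offset within the file
        if (use_mmap) {
            // zero-copy path: aim the tensor into the mapping; the kernel
            // pages the bytes in on first touch, no scratch buffer needed
            cur->data = (uint8_t *) mapping_addr + offs;
        } else {
            // copy path: the quantize loop has already pointed cur->data at
            // read_data, resized to at least ggml_nbytes(cur)
            fseek(fp, (long) offs, SEEK_SET);
            fread(cur->data, 1, ggml_nbytes(cur), fp);
        }
    }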