 #include <algorithm>
 #include <initializer_list>
 #include <thread>
-#include <atomic>
 #include <mutex>
 #include <sstream>
 #include <numeric>
@@ -92,6 +91,53 @@ static const size_t MB = 1024*1024;
 
 typedef void (*offload_func_t)(struct ggml_tensor * tensor);
 
+#ifdef GGML_USE_CUBLAS
+#define llama_host_malloc(n)    ggml_cuda_host_malloc(n)
+#define llama_host_free(data)   ggml_cuda_host_free(data)
+#elif GGML_USE_METAL
+#define llama_host_malloc(n)    ggml_metal_host_malloc(n)
+#define llama_host_free(data)   ggml_metal_host_free(data)
+#else
+#define llama_host_malloc(n)    malloc(n)
+#define llama_host_free(data)   free(data)
+#endif
+
+struct llama_buffer {
+    void * data = NULL;
+    size_t size = 0;
+
+    // fallback to malloc / free
+    // useful in cases where CUDA can try to allocate PINNED memory
+    bool fallback = false;
+
+    void resize(size_t n) {
+        llama_host_free(data);
+
+        data = llama_host_malloc(n);
+        if (!data) {
+            fallback = true;
+            data = malloc(n);
+        } else {
+            fallback = false;
+        }
+
+        GGML_ASSERT(data);
+        size = n;
+    }
+
+    ~llama_buffer() {
+        if (data) {
+            if (fallback) { // NOLINT
+                free(data);
+            } else {
+                llama_host_free(data);
+            }
+        }
+
+        data = NULL;
+    }
+};
+
 void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
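For context, this is how the new llama_buffer is meant to be used; an illustrative sketch only (the 16*MB size is an arbitrary example), not part of the commit:

    llama_buffer buf;
    buf.resize(16*MB);   // tries pinned host memory first (CUDA/Metal), falls back to plain malloc
    // buf.data / buf.size now behave like a raw malloc'd block
    buf.resize(32*MB);   // resize() frees the old block before allocating; contents are not preserved
    // no explicit free: the destructor releases the memory with whichever allocator actually provided it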
@@ -254,7 +300,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-    gguf_ctx_buffer buf;
+    llama_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -305,7 +351,7 @@ struct llama_model {
     struct ggml_context * ctx = NULL;
 
     // the model memory buffer
-    gguf_ctx_buffer buf;
+    llama_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<gguf_mmap> mapping;
@@ -394,15 +440,15 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-    gguf_ctx_buffer buf_compute;
+    llama_buffer buf_compute;
 
 #ifdef LLAMA_USE_ALLOCATOR
-    gguf_ctx_buffer buf_alloc;
+    llama_buffer buf_alloc;
     ggml_allocr * alloc = NULL;
 #endif
 
 #ifdef LLAMA_USE_SCRATCH
-    gguf_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -416,15 +462,15 @@ struct llama_context {
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
 
-    void use_buf(struct ggml_context * ctx, int i) {
+    static void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
         size_t last_size = 0;
 
         if (i == -1) {
             last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
         } else {
             auto & buf = buf_scratch[i];
-            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
+            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.data, });
         }
 
         if (buf_last >= 0) {
@@ -438,7 +484,7 @@ struct llama_context {
 #endif
     }
 
-    size_t get_buf_max_mem(int i) const {
+    static size_t get_buf_max_mem(int i) {
 #if defined(LLAMA_USE_SCRATCH)
         return buf_max_size[i];
 #else
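Background for the buf.addr → buf.data rename in use_buf(): the function simply points ggml's scratch allocator at one of the pre-allocated buffers, or back at the context's own pool for i == -1. A condensed sketch of that mechanism (illustrative; ctx0 stands in for the graph-building context):

    // direct intermediate allocations into scratch buffer 0
    ggml_set_scratch(ctx0, { /*offs =*/ 0, /*size =*/ buf_scratch[0].size, /*data =*/ buf_scratch[0].data, });
    // ... tensors created here are placed in scr0 instead of the main context buffer ...

    // i == -1: switch back to the context's own memory
    ggml_set_scratch(ctx0, { 0, 0, nullptr, });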
@@ -1024,7 +1070,7 @@ static bool kv_cache_init(
 
     struct ggml_init_params params;
     params.mem_size   = cache.buf.size;
-    params.mem_buffer = cache.buf.addr;
+    params.mem_buffer = cache.buf.data;
     params.no_alloc   = false;
 
     cache.ctx = ggml_init(params);
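This is the pattern repeated throughout the rest of the diff: size a llama_buffer up front and let ggml place every tensor inside it. A minimal sketch, with 2*MB as an arbitrary placeholder for the computed size:

    llama_buffer buf;
    buf.resize(2*MB);                      // caller-owned backing memory

    struct ggml_init_params params;
    params.mem_size   = buf.size;
    params.mem_buffer = buf.data;          // ggml carves its tensors out of this block
    params.no_alloc   = false;             // tensor data lives in the block as well

    struct ggml_context * ctx = ggml_init(params);
    GGML_ASSERT(ctx);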
@@ -1275,13 +1321,13 @@ static void llama_model_load_internal(
     {
         model.buf.resize(ctx_size);
         if (use_mlock) {
-            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.init(model.buf.data);
             model.mlock_buf.grow_to(model.buf.size);
         }
 
         struct ggml_init_params params = {
             /*.mem_size   =*/ model.buf.size,
-            /*.mem_buffer =*/ model.buf.addr,
+            /*.mem_buffer =*/ model.buf.data,
             /*.no_alloc   =*/ ml->use_mmap,
         };
 
@@ -1565,7 +1611,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
-        /*.mem_buffer =*/ buf_compute.addr,
+        /*.mem_buffer =*/ buf_compute.data,
         /*.no_alloc   =*/ false,
     };
 
@@ -3012,11 +3058,11 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 // quantization
 //
 
-static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) {
-    if (output.size < nelements * sizeof(float)) {
-        output.resize(nelements * sizeof(float));
+static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+    if (output.size() < nelements) {
+        output.resize(nelements);
     }
-    float * f32_output = (float *) output.addr;
+    float * f32_output = (float *) output.data();
 
     ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
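For orientation, the part of llama_convert_tensor_internal below this hunk (unchanged and not shown) looks up the per-type dequantization routine and expands the quantized blocks into f32_output. A rough single-threaded sketch of that step for quantized types; the ggml_internal_get_type_traits name and its to_float callback are assumptions about the ggml revision in use, and the splitting of work across nthread is omitted:

    // to_float unpacks the quantized data into plain float32 values
    ggml_type_traits_t qtype = ggml_internal_get_type_traits(tensor.type);
    qtype.to_float(tensor.data, f32_output, nelements);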
@@ -3134,10 +3180,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     size_t idx = 0;
+
+    std::vector<uint8_t> read_data;
+    std::vector<uint8_t> work;
+
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gguf_buffer read_data;
         read_data.resize(tensor.size);
-        tensor.data = read_data.addr;
+        tensor.data = read_data.data();
         model_loader->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
@@ -3156,7 +3205,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
-        gguf_buffer work;
 
         if (!quantize) {
             new_type = tensor.type;
@@ -3214,35 +3262,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 #endif
 
+            const size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+
             float * f32_data;
-            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
-            gguf_buffer f32_conv_buf;
+            std::vector<float> f32_conv_buf;
 
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
             } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
                 llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
-                f32_data = (float *) f32_conv_buf.addr;
+                f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
             work.resize(nelements * 4); // upper bound on size
-            new_data = work.addr;
+            new_data = work.data();
             std::vector<int64_t> hist_cur(1 << 4, 0);
 
-            int chunk_size = 32 * 512;
+            const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
             const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
             if (nthread_use < 2) {
                 new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size]() {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {
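Note that chunk_size can drop out of the capture list because it is now a const int initialized with a constant, which a lambda may read without capturing it. For readers skimming the hunk, the worker lambda whose captures change here is a simple chunk-stealing loop; a condensed sketch of the pattern (paraphrased, not a verbatim copy of the function):

    auto compute = [&]() {
        std::vector<int64_t> local_hist(1 << 4, 0);
        size_t local_size = 0;
        while (true) {
            // claim the next chunk under the shared mutex
            size_t first;
            {
                std::lock_guard<std::mutex> lock(mutex);
                first = counter;
                counter += chunk_size;
            }
            if (first >= nelements) {
                // done: fold the thread-local results back into the shared totals
                std::lock_guard<std::mutex> lock(mutex);
                new_size += local_size;
                for (size_t j = 0; j < local_hist.size(); ++j) {
                    hist_cur[j] += local_hist[j];
                }
                break;
            }
            const size_t n = std::min(nelements - first, (size_t) chunk_size);
            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, n, local_hist.data());
        }
    };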
@@ -3315,8 +3364,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-
-
 //
 // interface implementation
 //
@@ -3438,7 +3485,7 @@ struct llama_context * llama_new_context_with_model(
             ggml_allocr_free(ctx->alloc);
 
             ctx->buf_alloc.resize(alloc_size);
-            ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+            ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
         }
 #else
         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
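The ggml_allocr_new call changed here is the second half of ggml-alloc's measure-then-allocate idiom. A hedged sketch of the whole flow; the measuring pass and the graph gf are paraphrased from code above the hunk that this diff does not show:

    // 1) measure: run a dummy allocation over the graph to learn the worst-case size
    ggml_allocr * measure_alloc = ggml_allocr_new_measure(tensor_alignment);
    const size_t alloc_size = ggml_allocr_alloc_graph(measure_alloc, gf) + tensor_alignment;
    ggml_allocr_free(measure_alloc);

    // 2) allocate: back the real allocator with a llama_buffer of exactly that size
    ctx->buf_alloc.resize(alloc_size);
    ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);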
@@ -3479,11 +3526,11 @@ struct llama_context * llama_new_context_with_model(
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].data, ctx->buf_scratch[0].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].data, ctx->buf_scratch[1].size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
@@ -3565,7 +3612,6 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
-
     // create a temporary ggml context to store the lora tensors
     // todo: calculate size from biggest possible tensor
     std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
@@ -3583,11 +3629,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         model_tensors.insert(kv);
     }
 
-
     // load base model
     std::unique_ptr<llama_model_loader> model_loader;
     ggml_context * base_ctx = NULL;
-    gguf_buffer base_buf;
+    std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
@@ -3598,8 +3643,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         base_buf.resize(ctx_size);
 
         ggml_init_params base_params;
-        base_params.mem_size   = base_buf.size;
-        base_params.mem_buffer = base_buf.addr;
+        base_params.mem_size   = base_buf.size();
+        base_params.mem_buffer = base_buf.data();
         base_params.no_alloc   = model_loader->use_mmap;
 
        base_ctx = ggml_init(base_params);