@@ -2401,7 +2401,7 @@ struct llama_model_loader {
         return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
     }

-    void init_mapping(bool prefetch = true) {
+    void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
         /*
         // prefetch only CPU tensors
         if (use_mmap) {
@@ -2421,6 +2421,18 @@ struct llama_model_loader {
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
         }
+
+        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+            struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            size_data += ggml_nbytes(cur);
+        }
+
+        if (use_mmap && mapping) {
+            if (lmlock) {
+                lmlock->init(mapping->addr);
+            }
+            mmap_used_first = mapping->size;
+        }
     }

     // for backwards compatibility, does not support ggml-backend
@@ -2439,29 +2451,15 @@ struct llama_model_loader {

     size_t size_done = 0;
     size_t size_data = 0;
-    size_t mmap_first = -1;
-    size_t mmap_last  = 0;
+    size_t mmap_used_first = -1;
+    size_t mmap_used_last  = 0;

     // Returns false if cancelled by progress_callback
     bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
-        // TODO: move to a better place
-        if (size_data == 0) {
-            for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
-                struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
-                size_data += ggml_nbytes(cur);
-            }
-
-            if (use_mmap && buf_mmap) {
-                // FIXME
-                //if (lmlock) {
-                //    lmlock->init(mapping->addr);
-                //}
-            }
-        }
+        GGML_ASSERT(size_data != 0 && "call init_mapping() first");

         std::vector<no_init<uint8_t>> read_buf;

-
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
             struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
             if (!cur) {
@@ -2477,15 +2475,14 @@ struct llama_model_loader {

             const size_t offs = file_offset(ggml_get_name(cur));

-            // FIXME
             if (use_mmap && mapping) {
                 if (buf_mmap && cur->data == nullptr) {
                     ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
                     if (lmlock) {
                         lmlock->grow_to(offs + ggml_nbytes(cur));
                     }
-                    mmap_first = std::min(mmap_first, offs);
-                    mmap_last  = std::max(mmap_last,  offs + ggml_nbytes(cur));
+                    mmap_used_first = std::min(mmap_used_first, offs);
+                    mmap_used_last  = std::max(mmap_used_last,  offs + ggml_nbytes(cur));
                 } else {
                     ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
                 }
@@ -2504,20 +2501,23 @@ struct llama_model_loader {
             size_done += ggml_nbytes(cur);
         }

-        if (progress_callback && size_done >= size_data) {
-            // Even though the model is done loading, we still honor
-            // cancellation since we need to free allocations.
-            return progress_callback(1.0f, progress_callback_user_data);
+        // check if this is the last call and do final cleanup
+        if (size_done >= size_data) {
+            // unmap offloaded tensors and metadata
+            if (use_mmap && mapping) {
+                mapping->unmap_fragment(0, mmap_used_first);
+                if (mmap_used_last != 0) {
+                    mapping->unmap_fragment(mmap_used_last, mapping->size);
+                }
+            }
+            if (progress_callback) {
+                // Even though the model is done loading, we still honor
+                // cancellation since we need to free allocations.
+                return progress_callback(1.0f, progress_callback_user_data);
+            }
         }
-        return true;
-    }

-    void unmap_fragments() {
-        // unmap offloaded tensors and metadata
-        if (use_mmap && mapping) {
-            mapping->unmap_fragment(0, mmap_first);
-            mapping->unmap_fragment(mmap_last, mapping->size);
-        }
+        return true;
     }
 };

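(Aside, not part of the patch.) The hunks above fold the mmap cleanup and the final progress report into a single "last call" branch of load_all_data(): on the call that finishes the final byte, the unused mapping fragments are released in the same branch that reports 100% progress, and the callback can still cancel at that point so the caller gets a chance to free its allocations. A standalone sketch of that control flow, using hypothetical names (toy_loader, load_batch, progress_cb) rather than the real loader API:

// Standalone sketch of the "final cleanup on the last call" pattern used by
// load_all_data() above; all names here (toy_loader, load_batch) are hypothetical.
#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

using progress_cb = std::function<bool(float)>; // return false to cancel

struct toy_loader {
    size_t size_done = 0;
    size_t size_data = 0;

    // loads one batch of tensors; returns false if cancelled by the callback
    bool load_batch(const std::vector<size_t> & tensor_sizes, const progress_cb & cb) {
        for (size_t nbytes : tensor_sizes) {
            if (cb && !cb((float) size_done / size_data)) {
                return false;
            }
            // ... read or map the tensor data here ...
            size_done += nbytes;
        }
        // check if this is the last call and do final cleanup
        if (size_done >= size_data) {
            // ... unmap the unused fragments of the file mapping here ...
            if (cb) {
                // even though loading is done, cancellation is still honored,
                // because the caller still needs a chance to free its allocations
                return cb(1.0f);
            }
        }
        return true;
    }
};

int main() {
    toy_loader ml;
    ml.size_data = 300;
    const bool ok = ml.load_batch({100, 100, 100}, [](float p) {
        std::printf("progress: %3.0f%%\n", p * 100.0f);
        return true; // keep loading
    });
    std::printf("load %s\n", ok ? "completed" : "cancelled");
}

In the real loader the per-tensor work also allocates or copies tensor data into backend buffers; the sketch only keeps the bookkeeping that drives the callback and the final cleanup.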
@@ -3700,16 +3700,7 @@ static bool llm_load_tensors(

     ml.done_getting_tensors();

-    ml.init_mapping();
-
-    // TODO: move to ggml
-    //auto ggml_n_tensors = [](struct ggml_context * ctx) {
-    //    int n = 0;
-    //    for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
-    //        ++n;
-    //    }
-    //    return n;
-    //};
+    ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);

     // create backend buffers

@@ -3720,9 +3711,9 @@ static bool llm_load_tensors(
         ggml_context * ctx = it.second;
         ggml_backend_buffer_t buf = nullptr;

-        // TODO: do not use whole model mapping for the buffer, only the region containing the tensors
-        // this is important for metal: if the entire model could be mapped, then we could use metal for all layers
-        if (ml.use_mmap && buft == ggml_backend_cpu_buffer_type()) {
+        // TODO: do not use the whole model mapping for the buffer, only the region containing the tensors
+        // this is important for metal: if the entire model could be mapped to a metal buffer, then we could use metal for all layers
+        if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
             buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
         }
 #ifdef GGML_USE_METAL
@@ -3780,7 +3771,6 @@ static bool llm_load_tensors(
             return false;
         }
     }
-    ml.unmap_fragments();

     model.mapping = std::move(ml.mapping);

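(Aside, not part of the patch.) With the explicit ml.unmap_fragments() call removed, releasing the unused parts of the mapping now happens inside load_all_data() itself, driven by the mmap_used_first/mmap_used_last range recorded while tensors are placed. A self-contained POSIX sketch of the underlying idea follows; it is not the llama_mmap::unmap_fragment implementation, the mapping is anonymous, and the offsets are invented for illustration:

// Self-contained POSIX sketch of the idea behind unmap_fragment(): after loading,
// only the byte range actually used by tensors is kept mapped, and the unused
// head/tail pages are returned to the OS. Offsets here are made up.
#include <cstdint>
#include <cstdio>
#include <sys/mman.h>
#include <unistd.h>

int main() {
    const size_t page     = (size_t) sysconf(_SC_PAGESIZE);
    const size_t map_size = 16 * page;

    // stand-in for the mmap'd model file
    uint8_t * addr = (uint8_t *) mmap(nullptr, map_size, PROT_READ | PROT_WRITE,
                                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (addr == MAP_FAILED) { std::perror("mmap"); return 1; }

    // pretend the tensors placed in the mapping occupy [used_first, used_last)
    const size_t used_first = 4 * page + 123;  // plays the role of mmap_used_first
    const size_t used_last  = 12 * page + 45;  // plays the role of mmap_used_last

    // round outwards to page boundaries so the used range itself stays mapped
    const size_t head_end   = used_first / page * page;              // end of unused head
    const size_t tail_start = (used_last + page - 1) / page * page;  // start of unused tail

    if (head_end > 0) {
        munmap(addr, head_end);                            // drop [0, head_end)
    }
    if (tail_start < map_size) {
        munmap(addr + tail_start, map_size - tail_start);  // drop [tail_start, map_size)
    }

    std::printf("kept [%zu, %zu) of %zu bytes mapped\n", head_end, tail_start, map_size);

    munmap(addr + head_end, tail_start - head_end);        // release the rest at shutdown
    return 0;
}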