Skip to content

Commit b684502

Browse files
committed
fix unmap after loading
1 parent 2f3e30d commit b684502

File tree

1 file changed: +37 additions, -47 deletions

llama.cpp

Lines changed: 37 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -2401,7 +2401,7 @@ struct llama_model_loader {
24012401
return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
24022402
}
24032403

2404-
void init_mapping(bool prefetch = true) {
2404+
void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
24052405
/*
24062406
// prefetch only CPU tensors
24072407
if (use_mmap) {
@@ -2421,6 +2421,18 @@ struct llama_model_loader {
24212421
if (use_mmap) {
24222422
mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
24232423
}
2424+
2425+
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2426+
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
2427+
size_data += ggml_nbytes(cur);
2428+
}
2429+
2430+
if (use_mmap && mapping) {
2431+
if (lmlock) {
2432+
lmlock->init(mapping->addr);
2433+
}
2434+
mmap_used_first = mapping->size;
2435+
}
24242436
}
24252437

24262438
// for backwards compatibility, does not support ggml-backend
@@ -2439,29 +2451,15 @@ struct llama_model_loader {
24392451

24402452
size_t size_done = 0;
24412453
size_t size_data = 0;
2442-
size_t mmap_first = -1;
2443-
size_t mmap_last = 0;
2454+
size_t mmap_used_first = -1;
2455+
size_t mmap_used_last = 0;
24442456

24452457
// Returns false if cancelled by progress_callback
24462458
bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
2447-
// TODO: move to a better place
2448-
if (size_data == 0) {
2449-
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2450-
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
2451-
size_data += ggml_nbytes(cur);
2452-
}
2453-
2454-
if (use_mmap && buf_mmap) {
2455-
// FIXME
2456-
//if (lmlock) {
2457-
// lmlock->init(mapping->addr);
2458-
//}
2459-
}
2460-
}
2459+
GGML_ASSERT(size_data != 0 && "call init_mapping() first");
24612460

24622461
std::vector<no_init<uint8_t>> read_buf;
24632462

2464-
24652463
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
24662464
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
24672465
if (!cur) {
@@ -2477,15 +2475,14 @@ struct llama_model_loader {
24772475

24782476
const size_t offs = file_offset(ggml_get_name(cur));
24792477

2480-
// FIXME
24812478
if (use_mmap && mapping) {
24822479
if (buf_mmap && cur->data == nullptr) {
24832480
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
24842481
if (lmlock) {
24852482
lmlock->grow_to(offs + ggml_nbytes(cur));
24862483
}
2487-
mmap_first = std::min(mmap_first, offs);
2488-
mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
2484+
mmap_used_first = std::min(mmap_used_first, offs);
2485+
mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
24892486
} else {
24902487
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
24912488
}
@@ -2504,20 +2501,23 @@ struct llama_model_loader {
25042501
size_done += ggml_nbytes(cur);
25052502
}
25062503

2507-
if (progress_callback && size_done >= size_data) {
2508-
// Even though the model is done loading, we still honor
2509-
// cancellation since we need to free allocations.
2510-
return progress_callback(1.0f, progress_callback_user_data);
2504+
// check if this is the last call and do final cleanup
2505+
if (size_done >= size_data) {
2506+
// unmap offloaded tensors and metadata
2507+
if (use_mmap && mapping) {
2508+
mapping->unmap_fragment(0, mmap_used_first);
2509+
if (mmap_used_last != 0) {
2510+
mapping->unmap_fragment(mmap_used_last, mapping->size);
2511+
}
2512+
}
2513+
if (progress_callback) {
2514+
// Even though the model is done loading, we still honor
2515+
// cancellation since we need to free allocations.
2516+
return progress_callback(1.0f, progress_callback_user_data);
2517+
}
25112518
}
2512-
return true;
2513-
}
25142519

2515-
void unmap_fragments() {
2516-
// unmap offloaded tensors and metadata
2517-
if (use_mmap && mapping) {
2518-
mapping->unmap_fragment(0, mmap_first);
2519-
mapping->unmap_fragment(mmap_last, mapping->size);
2520-
}
2520+
return true;
25212521
}
25222522
};
25232523

@@ -3700,16 +3700,7 @@ static bool llm_load_tensors(
37003700

37013701
ml.done_getting_tensors();
37023702

3703-
ml.init_mapping();
3704-
3705-
// TODO: move to ggml
3706-
//auto ggml_n_tensors = [](struct ggml_context * ctx) {
3707-
// int n = 0;
3708-
// for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3709-
// ++n;
3710-
// }
3711-
// return n;
3712-
//};
3703+
ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
37133704

37143705
// create backend buffers
37153706

@@ -3720,9 +3711,9 @@ static bool llm_load_tensors(
37203711
ggml_context * ctx = it.second;
37213712
ggml_backend_buffer_t buf = nullptr;
37223713

3723-
// TODO: do not use whole model mapping for the buffer, only the region containing the tensors
3724-
// this is important for metal: if the entire model could be mapped, then we could use metal for all layers
3725-
if (ml.use_mmap && buft == ggml_backend_cpu_buffer_type()) {
3714+
// TODO: do not use the whole model mapping for the buffer, only the region containing the tensors
3715+
// this is important for metal: if the entire model could be mapped to a metal buffer, then we could use metal for all layers
3716+
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
37263717
buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
37273718
}
37283719
#ifdef GGML_USE_METAL
@@ -3780,7 +3771,6 @@ static bool llm_load_tensors(
37803771
return false;
37813772
}
37823773
}
3783-
ml.unmap_fragments();
37843774

37853775
model.mapping = std::move(ml.mapping);
37863776

0 commit comments

Comments (0)