
Commit c3678ca: unmap offloaded part of the model
1 parent d3e7242

1 file changed: llama.cpp (+78, -8 lines)

@@ -815,6 +815,21 @@ struct llama_mmap {

     llama_mmap(const llama_mmap &) = delete;

+    static void align_offset(size_t & offset, size_t & len, size_t page_size) {
+        // align offset to the next page
+        size_t offset_in_page = offset & (page_size - 1);
+        size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
+        offset += offset_to_page;
+
+        if (offset_to_page >= len) {
+            len = 0;
+        } else {
+            len -= offset_to_page;
+            // align len to the previous page
+            len -= len & (page_size - 1);
+        }
+    }
+
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
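
Note: a minimal standalone sketch of what align_offset does (not part of the commit; a 4096-byte page size and made-up offsets are assumed). The start of the range is rounded up to the next page boundary and the length is trimmed down to whole pages, so only pages that lie entirely inside the original range remain.

#include <cassert>
#include <cstddef>

// copy of the helper above, for illustration only
static void align_offset(size_t & offset, size_t & len, size_t page_size) {
    size_t offset_in_page = offset & (page_size - 1);
    size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
    offset += offset_to_page;
    if (offset_to_page >= len) {
        len = 0;
    } else {
        len -= offset_to_page;
        len -= len & (page_size - 1);
    }
}

int main() {
    const size_t page = 4096;

    size_t off = 100, len = 10000;       // covers [100, 10100)
    align_offset(off, len, page);
    assert(off == 4096 && len == 4096);  // only [4096, 8192) is fully covered

    off = 100; len = 50;                 // too small to cover any whole page
    align_offset(off, len, page);
    assert(len == 0);

    return 0;
}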

@@ -849,6 +864,24 @@ struct llama_mmap {
         }
     }

+    void unmap(size_t offset, size_t len) {
+        int page_size = sysconf(_SC_PAGESIZE);
+        align_offset(offset, len, page_size);
+        if (len < (size_t)page_size) {
+            return;
+        }
+
+        void * next_page_start = (uint8_t *) addr + offset;
+        // unmap and discard the pages
+        if (munmap(next_page_start, len)) {
+            fprintf(stderr, "warning: munmap failed: %s\n", strerror(errno));
+        }
+        if (posix_madvise(next_page_start, len, POSIX_MADV_DONTNEED)) {
+            fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_DONTNEED) failed: %s\n",
+                    strerror(errno));
+        }
+    }
+
     ~llama_mmap() {
         munmap(addr, size);
     }
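
Note: munmap() may legally be applied to a page-aligned subrange of an existing mapping; only those pages are released and the rest of the mapping stays valid. A standalone sketch (not part of the commit; an anonymous mapping stands in for the read-only model file mapping):

#include <cstdio>
#include <cstring>
#include <sys/mman.h>
#include <unistd.h>

int main() {
    const size_t page = sysconf(_SC_PAGESIZE);
    const size_t size = 4 * page;

    char * base = (char *) mmap(NULL, size, PROT_READ | PROT_WRITE,
                                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (base == MAP_FAILED) { perror("mmap"); return 1; }
    memset(base, 1, size);

    // punch a hole: drop the two middle pages, keep the first and last one mapped
    if (munmap(base + page, 2 * page) != 0) { perror("munmap"); return 1; }

    printf("first byte: %d, last byte: %d\n", base[0], base[size - 1]);

    munmap(base, page);                // release the head page
    munmap(base + 3 * page, page);     // release the tail page
    return 0;
}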
@@ -898,6 +931,20 @@ struct llama_mmap {
         }
     }

+    void unmap(size_t offset, size_t len) {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        DWORD page_size = si.dwAllocationGranularity;
+        align_offset(offset, len, page_size);
+
+        if (len < (size_t)page_size) {
+            return;
+        }
+
+        void * next_page_start = (uint8_t *) addr + offset;
+        VirtualAlloc(next_page_start, len, MEM_RESET, PAGE_NOACCESS);
+    }
+
     ~llama_mmap() {
         if (!UnmapViewOfFile(addr)) {
             fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
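
Note: on Windows the view itself is left mapped; VirtualAlloc with MEM_RESET only tells the memory manager that the contents of those pages are no longer of interest, so they can be dropped rather than written to the pagefile (a valid protection value must still be passed even though it is ignored for MEM_RESET, and the return value is not checked here). A standalone sketch of the same pattern on private committed memory (not part of the commit):

#include <windows.h>
#include <cstdio>
#include <cstring>

int main() {
    SYSTEM_INFO si;
    GetSystemInfo(&si);
    const SIZE_T len = 4 * (SIZE_T) si.dwAllocationGranularity;

    char * p = (char *) VirtualAlloc(NULL, len, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
    if (!p) { return 1; }
    memset(p, 1, len);

    // hint that the contents are disposable; the pages stay committed and usable
    if (!VirtualAlloc(p, len, MEM_RESET, PAGE_NOACCESS)) {
        fprintf(stderr, "VirtualAlloc(MEM_RESET) failed: %lu\n", GetLastError());
    }

    VirtualFree(p, 0, MEM_RELEASE);
    return 0;
}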
@@ -914,6 +961,13 @@ struct llama_mmap {

         throw std::runtime_error(std::string("mmap not supported"));
     }
+
+    void unmap(size_t offset, size_t len) {
+        (void) offset;
+        (void) len;
+
+        throw std::runtime_error(std::string("mmap not supported"));
+    }
 #endif
 };

@@ -2243,7 +2297,9 @@ struct llama_model_loader {
         return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
     }

-    void init_mapping(struct ggml_context * ctx) {
+    void init_mapping() {
+        /*
+        // prefetch only CPU tensors
         if (use_mmap) {
             size_t size_pref = 0; // prefetch

@@ -2256,6 +2312,9 @@ struct llama_model_loader {
             }
             mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
         }
+        */
+        // prefetch the whole file - all the data is needed anyway
+        mapping.reset(new llama_mmap(&file, -1, ggml_is_numa()));
     }

     // for backwards compatibility only
@@ -2292,19 +2351,25 @@ struct llama_model_loader {

         std::vector<no_init<uint8_t>> read_buf;

-        size_t done_size = 0;
+        size_t size_done = 0;
+
+        size_t mmap_first = -1;
+        size_t mmap_last = 0;
+
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
             struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
             GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
             const size_t offs = file_offset(ggml_get_name(cur));

             if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
-                if (use_mmap) {
+                if (use_mmap && mapping) {
                     if (buf_mmap) {
                         ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
                         if (lmlock) {
                             lmlock->grow_to(offs + ggml_nbytes(cur));
                         }
+                        mmap_first = std::min(mmap_first, offs);
+                        mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
                     } else {
                         ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
                     }
@@ -2323,7 +2388,7 @@ struct llama_model_loader {
                 // HACK: mark tensor as allocated
                 cur->data = (void *)(uintptr_t)1;
                 void * data;
-                if (use_mmap) {
+                if (use_mmap && mapping) {
                     data = (uint8_t *) mapping->addr + offs;
                 } else {
                     read_buf.resize(ggml_nbytes(cur));
@@ -2343,14 +2408,19 @@ struct llama_model_loader {
 #endif
             }

-            done_size += ggml_nbytes(cur);
+            size_done += ggml_nbytes(cur);

             if (progress_callback) {
-                progress_callback((float) done_size / size_data, progress_callback_user_data);
+                progress_callback((float) size_done / size_data, progress_callback_user_data);
             }
         }

-        // TODO: unmap GPU tensors
+        // unmap GPU tensors
+        if (use_mmap && mapping) {
+            // unmap offloaded tensors and metadata
+            mapping->unmap(0, mmap_first);
+            mapping->unmap(mmap_last, mapping->size - mmap_last);
+        }
     }
 };
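
Note: a standalone recap of the core idea with made-up offsets (not part of the commit). The loader tracks the lowest start offset and the highest end offset of the tensors it actually placed into the mmap-backed buffer, then releases everything outside that window: the GGUF metadata in front of it and the offloaded tensor data behind it.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    // (offset, size) of the tensors that stay mapped on the CPU; values are made up
    std::vector<std::pair<size_t, size_t>> cpu_tensors = {
        {1u << 20, 4u << 20},   // 1 MiB .. 5 MiB
        {6u << 20, 2u << 20},   // 6 MiB .. 8 MiB
    };
    const size_t file_size = 32u << 20;

    size_t mmap_first = (size_t) -1;
    size_t mmap_last  = 0;
    for (const auto & t : cpu_tensors) {
        mmap_first = std::min(mmap_first, t.first);
        mmap_last  = std::max(mmap_last, t.first + t.second);
    }

    // the commit then calls:
    //   mapping->unmap(0, mmap_first);                         // metadata before the first CPU tensor
    //   mapping->unmap(mmap_last, mapping->size - mmap_last);  // offloaded data after the last one
    printf("keep [%zu, %zu), release [0, %zu) and [%zu, %zu)\n",
           mmap_first, mmap_last, mmap_first, mmap_last, file_size);
    return 0;
}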

@@ -3507,7 +3577,7 @@ static void llm_load_tensors(

     ml.done_getting_tensors();

-    ml.init_mapping(ctx);
+    ml.init_mapping();

     // allocate tensors
     size_t vram_weights = 0;
