@@ -815,6 +815,21 @@ struct llama_mmap {
815
815
816
816
llama_mmap (const llama_mmap &) = delete ;
817
817
818
// Shrink the range [offset, offset + len) to the largest sub-range that
// starts and ends on a page boundary.
// - offset is rounded UP to the next page boundary
// - len is then rounded DOWN to a whole number of pages
// On return len may be 0 if the input range did not span a full page.
// NOTE: the mask arithmetic requires page_size to be a power of two
// (true for sysconf(_SC_PAGESIZE) / dwAllocationGranularity).
static void align_offset(size_t & offset, size_t & len, size_t page_size) {
    const size_t page_mask = page_size - 1;

    // distance from offset to the next page boundary (0 if already aligned)
    const size_t advance = (page_size - (offset & page_mask)) & page_mask;
    offset += advance;

    if (len <= advance) {
        // the range ended before the first page boundary
        len = 0;
    } else {
        // drop the advanced prefix, then truncate to whole pages
        len = (len - advance) & ~page_mask;
    }
}
832
+
818
833
#ifdef _POSIX_MAPPED_FILES
819
834
static constexpr bool SUPPORTED = true ;
820
835
@@ -849,6 +864,24 @@ struct llama_mmap {
849
864
}
850
865
}
851
866
867
+ void unmap (size_t offset, size_t len) {
868
+ int page_size = sysconf (_SC_PAGESIZE);
869
+ align_offset (offset, len, page_size);
870
+ if (len < (size_t )page_size) {
871
+ return ;
872
+ }
873
+
874
+ void * next_page_start = (uint8_t *) addr + offset;
875
+ // unmap and discard the pages
876
+ if (munmap (next_page_start, len)) {
877
+ fprintf (stderr, " warning: munmap failed: %s\n " , strerror (errno));
878
+ }
879
+ if (posix_madvise (next_page_start, len, POSIX_MADV_DONTNEED)) {
880
+ fprintf (stderr, " warning: posix_madvise(.., POSIX_MADV_DONTNEED) failed: %s\n " ,
881
+ strerror (errno));
882
+ }
883
+ }
884
+
852
885
~llama_mmap () {
853
886
munmap (addr, size);
854
887
}
@@ -898,6 +931,20 @@ struct llama_mmap {
898
931
}
899
932
}
900
933
934
+ void unmap (size_t offset, size_t len) {
935
+ SYSTEM_INFO si;
936
+ GetSystemInfo (&si);
937
+ DWORD page_size = si.dwAllocationGranularity ;
938
+ align_offset (offset, len, page_size);
939
+
940
+ if (len < (size_t )page_size) {
941
+ return ;
942
+ }
943
+
944
+ void * next_page_start = (uint8_t *) addr + offset;
945
+ VirtualAlloc (next_page_start, len, MEM_RESET, PAGE_NOACCESS);
946
+ }
947
+
901
948
~llama_mmap () {
902
949
if (!UnmapViewOfFile (addr)) {
903
950
fprintf (stderr, " warning: UnmapViewOfFile failed: %s\n " ,
@@ -914,6 +961,13 @@ struct llama_mmap {
914
961
915
962
throw std::runtime_error (std::string (" mmap not supported" ));
916
963
}
964
+
965
// Stub for platforms without mmap support: always fails, matching the
// behavior of the constructor above.
void unmap(size_t offset, size_t len) {
    // silence unused-parameter warnings on this stub path
    (void) offset;
    (void) len;

    throw std::runtime_error(std::string("mmap not supported"));
}
917
971
#endif
918
972
};
919
973
@@ -2243,7 +2297,9 @@ struct llama_model_loader {
2243
2297
return gguf_get_data_offset (ctx_gguf) + gguf_get_tensor_offset (ctx_gguf, idx);
2244
2298
}
2245
2299
2246
- void init_mapping (struct ggml_context * ctx) {
2300
+ void init_mapping () {
2301
+ /*
2302
+ // prefetch only CPU tensors
2247
2303
if (use_mmap) {
2248
2304
size_t size_pref = 0; // prefetch
2249
2305
@@ -2256,6 +2312,9 @@ struct llama_model_loader {
2256
2312
}
2257
2313
mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
2258
2314
}
2315
+ */
2316
+ // prefetch the whole file - all the data is needed anyway
2317
+ mapping.reset (new llama_mmap (&file, -1 , ggml_is_numa ()));
2259
2318
}
2260
2319
2261
2320
// for backwards compatibility only
@@ -2292,19 +2351,25 @@ struct llama_model_loader {
2292
2351
2293
2352
std::vector<no_init<uint8_t >> read_buf;
2294
2353
2295
- size_t done_size = 0 ;
2354
+ size_t size_done = 0 ;
2355
+
2356
+ size_t mmap_first = -1 ;
2357
+ size_t mmap_last = 0 ;
2358
+
2296
2359
for (int i = 0 ; i < gguf_get_n_tensors (ctx_gguf); i++) {
2297
2360
struct ggml_tensor * cur = ggml_get_tensor (ctx, gguf_get_tensor_name (ctx_gguf, i));
2298
2361
GGML_ASSERT (cur); // unused tensors should have been caught by load_data already
2299
2362
const size_t offs = file_offset (ggml_get_name (cur));
2300
2363
2301
2364
if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
2302
- if (use_mmap) {
2365
+ if (use_mmap && mapping ) {
2303
2366
if (buf_mmap) {
2304
2367
ggml_backend_tensor_alloc (buf_mmap, cur, (uint8_t *) mapping->addr + offs);
2305
2368
if (lmlock) {
2306
2369
lmlock->grow_to (offs + ggml_nbytes (cur));
2307
2370
}
2371
+ mmap_first = std::min (mmap_first, offs);
2372
+ mmap_last = std::max (mmap_last, offs + ggml_nbytes (cur));
2308
2373
} else {
2309
2374
ggml_backend_tensor_set (cur, (uint8_t *) mapping->addr + offs, 0 , ggml_nbytes (cur));
2310
2375
}
@@ -2323,7 +2388,7 @@ struct llama_model_loader {
2323
2388
// HACK: mark tensor as allocated
2324
2389
cur->data = (void *)(uintptr_t )1 ;
2325
2390
void * data;
2326
- if (use_mmap) {
2391
+ if (use_mmap && mapping ) {
2327
2392
data = (uint8_t *) mapping->addr + offs;
2328
2393
} else {
2329
2394
read_buf.resize (ggml_nbytes (cur));
@@ -2343,14 +2408,19 @@ struct llama_model_loader {
2343
2408
#endif
2344
2409
}
2345
2410
2346
- done_size += ggml_nbytes (cur);
2411
+ size_done += ggml_nbytes (cur);
2347
2412
2348
2413
if (progress_callback) {
2349
- progress_callback ((float ) done_size / size_data, progress_callback_user_data);
2414
+ progress_callback ((float ) size_done / size_data, progress_callback_user_data);
2350
2415
}
2351
2416
}
2352
2417
2353
- // TODO: unmap GPU tensors
2418
+ // unmap GPU tensors
2419
+ if (use_mmap && mapping) {
2420
+ // unmap offloaded tensors and metadata
2421
+ mapping->unmap (0 , mmap_first);
2422
+ mapping->unmap (mmap_last, mapping->size - mmap_last);
2423
+ }
2354
2424
}
2355
2425
};
2356
2426
@@ -3507,7 +3577,7 @@ static void llm_load_tensors(
3507
3577
3508
3578
ml.done_getting_tensors ();
3509
3579
3510
- ml.init_mapping (ctx );
3580
+ ml.init_mapping ();
3511
3581
3512
3582
// allocate tensors
3513
3583
size_t vram_weights = 0 ;
0 commit comments