@@ -12,6 +12,10 @@
#include <cassert>
#include <cstring>

+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16

@@ -452,36 +456,37 @@ static bool llama_model_load(
    auto & ctx = model.ctx;

    size_t ctx_size = 0;
-
    {
        const auto & hparams = model.hparams;

        const int n_embd  = hparams.n_embd;
        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
+        if (n_parts > 1) {
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings

-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
+            ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm

-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm

-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm

-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
+        }

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
+        // this is no longer stored in this context
+        // ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
+        // ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v

        ctx_size += (5 + 10*n_layer)*256; // object overhead

@@ -533,6 +538,9 @@ static bool llama_model_load(

        model.layers.resize(n_layer);

+        if (n_parts == 1)
+            ggml_nomem(1); // hack to stop ggml from allocating memory for these tensors
+
        model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);

        model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -576,6 +584,9 @@ static bool llama_model_load(
        }
    }

+    if (n_parts == 1)
+        ggml_nomem(0);
+
    const size_t file_offset = fin.tellg();

    fin.close();
@@ -600,6 +611,17 @@ static bool llama_model_load(
        fin = std::ifstream(fname_part, std::ios::binary);
        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());

+        // mmap support
+        int fd = open(fname.c_str(), O_RDONLY);
+        size_t len = lseek(fd, 0, SEEK_END);
+        char * mm = (char *)mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+        if (mm == MAP_FAILED) {
+            perror("mmap");
+            mm = NULL;
+        }
+        close(fd);
+        //
+
        fin.seekg(0, fin.end);
        const size_t file_size = fin.tellg();

@@ -736,13 +758,23 @@ static bool llama_model_load(
                }

                if (part_id == 0) {
-                    fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                    if (mm == NULL) {
+                        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                    }
+                    else {
+                        fprintf(stderr, "tensor mmaped: %s\n", name.c_str());
+                        off_t offset = fin.tellg();
+                        tensor->data = mm + offset;
+                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
+                    }
                } else {
+                    fprintf(stderr, "tensor skipped: %s\n", name.c_str());
                    fin.seekg(ggml_nbytes(tensor), std::ios::cur);
                }

-                total_size += ggml_nbytes(tensor);
+                // total_size += ggml_nbytes(tensor);
            } else {
+                fprintf(stderr, "tensor not mmaped: %s\n", name.c_str());
                if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
                    fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                            __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
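
Note: the block below is not part of the commit. It is a minimal standalone sketch of the read-only mmap pattern the patch uses when n_parts == 1: map the whole model file once, then treat a "loaded" tensor as a pointer into the mapping at the offset where its data begins. The file name "model.bin" and the zero offset are placeholder values; the patch itself takes the offset from fin.tellg() for each tensor.

// sketch: read-only mmap of a model file, tensor data as a pointer into the mapping
#include <cstdio>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main() {
    int fd = open("model.bin", O_RDONLY);          // placeholder file name
    if (fd < 0) { perror("open"); return 1; }

    // file length via lseek, as in the patch (fstat would also work)
    size_t len = lseek(fd, 0, SEEK_END);

    char * mm = (char *) mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
    if (mm == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

    // the descriptor can be closed right away; the mapping stays valid
    close(fd);

    // a "loaded" tensor is just a pointer into the mapping at its file offset
    // (offset 0 here only for illustration)
    const char * tensor_data = mm + 0;
    printf("first byte: 0x%02x\n", (unsigned) (unsigned char) tensor_data[0]);

    munmap(mm, len);
    return 0;
}

Because the mapping is PROT_READ/MAP_SHARED, no data is read or copied up front: pages are faulted in on demand as tensors are actually touched, and the page cache is shared between processes that map the same file.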