Commit fc68512 ("mmap hack")
1 parent: 7e53955

3 files changed: +58 -20 lines

ggml.c (6 additions, 2 deletions)

@@ -2737,6 +2737,10 @@ bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
 #endif // GGML_MLOCK_SUPPORT
 
 ////////////////////////////////////////////////////////////////////////////////
+int g_nomem = 0;
+void ggml_nomem(int nomem) {
+    g_nomem = nomem;
+}
 
 struct ggml_tensor * ggml_new_tensor_impl(
         struct ggml_context * ctx,
@@ -2753,7 +2757,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t size_needed = 0;
 
-    if (data == NULL) {
+    if (data == NULL && !g_nomem) {
         size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
         for (int i = 1; i < n_dims; i++) {
             size_needed *= ne[i];
@@ -2837,7 +2841,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
        /*.perf_runs    =*/ 0,
        /*.perf_cycles  =*/ 0,
        /*.perf_time_us =*/ 0,
-       /*.data         =*/ data == NULL ? (void *)(result + 1) : data,
+       /*.data         =*/ (data == NULL && !g_nomem) ? (void *)(result + 1) : data,
        /*.pad          =*/ { 0 },
    };
 
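
For context, the new flag acts as a process-global switch: while it is set, ggml_new_tensor_impl still fills in the tensor metadata but leaves data equal to NULL, so the caller can point the tensor at memory it manages itself (here, an mmap'd model file). Below is a minimal sketch of that calling pattern; the helper make_mapped_tensor and its arguments are ours for illustration, not part of the commit:

    #include "ggml.h"

    // Sketch only: create a tensor whose data buffer ggml does not allocate,
    // then point ->data at caller-owned memory (e.g. a region of an mmap'd file).
    static struct ggml_tensor * make_mapped_tensor(struct ggml_context * ctx,
                                                   void * external_data,
                                                   int ne0, int ne1) {
        ggml_nomem(1);  // tensors created from here on get data == NULL
        struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne0, ne1);
        ggml_nomem(0);  // restore normal allocation out of the context buffer

        t->data = external_data;  // backing store owned by the caller
        return t;
    }

Note that the context still has to hold the tensor objects themselves, which is why the llama.cpp change below keeps the (5 + 10*n_layer)*256 object-overhead term while skipping the per-weight buffer sizes when n_parts == 1.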

ggml.h (2 additions, 0 deletions)

@@ -346,6 +346,8 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 bool ggml_mlock_supported(void);
 bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
 
+void ggml_nomem(int nomem);
+
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type type,

llama.cpp (50 additions, 18 deletions)

@@ -12,6 +12,10 @@
 #include <cassert>
 #include <cstring>
 
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
@@ -452,36 +456,37 @@ static bool llama_model_load(
     auto & ctx = model.ctx;
 
     size_t ctx_size = 0;
-
     {
         const auto & hparams = model.hparams;
 
         const int n_embd  = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;
 
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
+        if (n_parts > 1) {
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
 
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
+            ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
 
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
+            ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
 
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
+            ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
 
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
+            ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
 
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
+            ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
+        }
 
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
+        // this is no longer stored in this context
+        //ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
+        //ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
 
@@ -533,6 +538,9 @@ static bool llama_model_load(
 
         model.layers.resize(n_layer);
 
+        if (n_parts == 1)
+            ggml_nomem(1); // hack to stop ggml from allocating memory for these tensors
+
         model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
 
         model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -576,6 +584,9 @@ static bool llama_model_load(
         }
     }
 
+    if (n_parts == 1)
+        ggml_nomem(0);
+
     const size_t file_offset = fin.tellg();
 
     fin.close();
@@ -600,6 +611,17 @@ static bool llama_model_load(
         fin = std::ifstream(fname_part, std::ios::binary);
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
 
+        // mmap support
+        int fd = open(fname.c_str(), O_RDONLY);
+        size_t len = lseek(fd, 0, SEEK_END);
+        char* mm = (char*)mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+        if (mm == MAP_FAILED) {
+            perror("mmap");
+            mm = NULL;
+        }
+        close(fd);
+        //
+
         fin.seekg(0, fin.end);
         const size_t file_size = fin.tellg();
 
@@ -736,13 +758,23 @@ static bool llama_model_load(
                 }
 
                 if (part_id == 0) {
-                    fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                    if (mm == NULL) {
+                        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+                    }
+                    else {
+                        fprintf(stderr, "tensor mmaped: %s\n", name.c_str());
+                        off_t offset = fin.tellg();;
+                        tensor->data = mm + offset;
+                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
+                    }
                 } else {
+                    fprintf(stderr, "tensor skipped: %s\n", name.c_str());
                     fin.seekg(ggml_nbytes(tensor), std::ios::cur);
                 }
 
-                total_size += ggml_nbytes(tensor);
+                //total_size += ggml_nbytes(tensor);
             } else {
+                fprintf(stderr, "tensor not mmaped: %s\n", name.c_str());
                 if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
                     fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                             __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
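
The mmap setup added above follows the usual POSIX pattern: open the file, get its length with lseek, map it read-only and shared, then close the descriptor (the mapping stays valid after close). Here is a standalone sketch of that pattern with the error handling spelled out; the path is a placeholder, and the munmap call, which the commit does not need because the mapping lives for the rest of the process, is shown only for completeness:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void) {
        const char * path = "ggml-model-f16.bin";  // placeholder file name

        int fd = open(path, O_RDONLY);
        if (fd == -1) { perror("open"); return 1; }

        // file size via lseek, as in the hunk above
        off_t len = lseek(fd, 0, SEEK_END);
        if (len == (off_t) -1) { perror("lseek"); close(fd); return 1; }

        // read-only shared mapping; pages are faulted in lazily by the kernel
        char * mm = (char *) mmap(NULL, (size_t) len, PROT_READ, MAP_SHARED, fd, 0);
        if (mm == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

        // the descriptor can be closed right away; the mapping keeps its own
        // reference to the file
        close(fd);

        // ... tensor->data = mm + offset, as in the part_id == 0 branch ...

        munmap(mm, (size_t) len);
        return 0;
    }

Closing the file descriptor immediately after mmap, as the commit does, is standard practice. Note also that the mapping is created from fname while the offsets come from fin reading fname_part, so they appear to line up only when the model is a single file, consistent with the n_parts == 1 guard used for ggml_nomem earlier in the diff.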
