
Commit b47b8a9

llama : optimize memory buffers (#2325)
1 parent b5fe67f commit b47b8a9

3 files changed: 66 additions, 73 deletions

examples/common.cpp (12 additions, 12 deletions)

@@ -578,18 +578,18 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s

This hunk only realigns whitespace in the lparams assignments of llama_context_params_from_gpt_params; the assigned values are unchanged:

 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx        = params.n_ctx;
     lparams.n_batch      = params.n_batch;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu     = params.main_gpu;
     lparams.tensor_split = params.tensor_split;
     lparams.low_vram     = params.low_vram;
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
     lparams.use_mmap     = params.use_mmap;
     lparams.use_mlock    = params.use_mlock;
     lparams.logits_all   = params.perplexity;
     lparams.embedding    = params.embedding;
     lparams.rope_freq_base  = params.rope_freq_base;
     lparams.rope_freq_scale = params.rope_freq_scale;
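For orientation, a minimal usage sketch (mine, not part of the commit) showing where the converted lparams typically go; the loader and context calls are assumptions about the llama_load_model_from_file / llama_new_context_with_model / llama_free_model API of this period:

    #include "common.h"
    #include "llama.h"

    // Sketch only: convert CLI params, load a model, and create a context.
    int run(const gpt_params & params) {
        llama_context_params lparams = llama_context_params_from_gpt_params(params);

        llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
        if (model == NULL) {
            return 1;
        }

        llama_context * ctx = llama_new_context_with_model(model, lparams);
        if (ctx == NULL) {
            llama_free_model(model);
            return 1;
        }

        // ... tokenize and generate ...

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }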

examples/main/main.cpp (4 additions, 7 deletions)

@@ -139,17 +139,14 @@ int main(int argc, char ** argv) {
             params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }
 
-    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
+    // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
     // uncomment the "used_mem" line in llama.cpp to see the results
     if (params.mem_test) {
         {
-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
-            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
-        }
+            fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
 
-        {
-            const std::vector<llama_token> tmp = { 0, };
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
+            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
+            llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
         }
 
         llama_print_timings(ctx);
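The rewritten test evaluates a single batch of n_batch BOS tokens at position n_ctx, so one llama_eval call exercises the worst-case compute and scratch usage instead of the previous two-step probe. A minimal sketch of the same idea as a standalone helper (hypothetical name, assuming an already-created context):

    #include <vector>
    #include <cstdio>

    #include "llama.h"

    // Hypothetical helper mirroring the updated mem_test path: evaluate one
    // worst-case batch (n_batch BOS tokens at position n_ctx) so the compute
    // and scratch buffers reach their peak usage in a single call.
    static void probe_peak_memory(llama_context * ctx, int n_batch, int n_ctx, int n_threads) {
        fprintf(stderr, "testing memory usage for n_batch = %d, n_ctx = %d\n", n_batch, n_ctx);

        const std::vector<llama_token> tmp(n_batch, llama_token_bos());
        llama_eval(ctx, tmp.data(), tmp.size(), n_ctx, n_threads);
    }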

llama.cpp (50 additions, 54 deletions)

@@ -98,57 +98,42 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 }
 
 //
-// memory sizes
+// memory sizes (calculated for n_batch == 512)
 //
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        /* empirical scaling, still a guess */
-        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
-        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
-        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
-        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
-        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
+        { MODEL_3B,   ((size_t) n_ctx / 16ull +  92ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 100ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 120ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx /  9ull + 160ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  6ull + 256ull) * MB }, // guess
     };
     return k_sizes;
 }
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   256ull * MB },
-        { MODEL_7B,   512ull * MB },
-        { MODEL_13B,  512ull * MB },
-        { MODEL_30B,  512ull * MB },
-        { MODEL_65B, 1024ull * MB },
+        { MODEL_3B,  128ull * MB },
+        { MODEL_7B,  160ull * MB },
+        { MODEL_13B, 192ull * MB },
+        { MODEL_30B, 256ull * MB },
+        { MODEL_65B, 384ull * MB }, // guess
     };
     return k_sizes;
 }
 
-// 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+// used to store the compute graph tensors + non-scratch data
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   682ull * MB },
-        { MODEL_7B,  1026ull * MB },
-        { MODEL_13B, 1608ull * MB },
-        { MODEL_30B, 3124ull * MB },
-        { MODEL_65B, 5120ull * MB },
-    };
-    return k_sizes;
-}
-
-// this is mostly needed for temporary mul_mat buffers to dequantize the data
-// not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
-        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
-        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
-        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
-        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
+        { MODEL_3B,   8ull * MB },
+        { MODEL_7B,  10ull * MB },
+        { MODEL_13B, 12ull * MB },
+        { MODEL_30B, 16ull * MB },
+        { MODEL_65B, 24ull * MB }, // guess
     };
     return k_sizes;
 }
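A rough worked example of what the retuned table means in practice (my arithmetic, not part of the commit), using the MODEL_7B entry of MEM_REQ_SCRATCH0 at n_ctx = 2048; MB is the 1024*1024 multiplier used by llama.cpp:

    #include <cstdio>
    #include <cstddef>

    int main() {
        const size_t MB    = 1024*1024;   // same multiplier as llama.cpp
        const size_t n_ctx = 2048;

        // MODEL_7B scratch0 size, before and after this commit
        const size_t scratch0_old = (n_ctx / 16 + 256) * MB;  // 384 MB
        const size_t scratch0_new = (n_ctx / 16 + 100) * MB;  // 228 MB

        printf("7B scratch0 @ n_ctx = 2048: %zu MB -> %zu MB\n",
               scratch0_old / MB, scratch0_new / MB);
        return 0;
    }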
@@ -199,6 +184,15 @@ struct llama_hparams {
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
+
+    size_t kv_size() const {
+        size_t result = 2ull;
+        result *= (size_t) n_embd;
+        result *= (size_t) n_ctx;
+        result *= (size_t) n_layer;
+        result *= sizeof(ggml_fp16_t);
+        return result;
+    }
 };
 
 struct llama_layer {
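The new kv_size() computes the exact fp16 K/V cache size (2 * n_embd * n_ctx * n_layer * sizeof(ggml_fp16_t)) instead of reading it from the removed MEM_REQ_KV_SELF table. A quick sanity check (my numbers; the 7B hyperparameters n_embd = 4096 and n_layer = 32 are assumptions, not stated in the diff):

    #include <cstdio>
    #include <cstddef>
    #include <cstdint>

    // standalone copy of the kv_size() formula, for illustration only
    static size_t kv_size(size_t n_embd, size_t n_ctx, size_t n_layer) {
        return 2ull * n_embd * n_ctx * n_layer * sizeof(uint16_t); // K and V, fp16
    }

    int main() {
        printf("7B, n_ctx =  512: %7.1f MB\n", kv_size(4096,  512, 32) / 1024.0 / 1024.0); //  256.0 MB
        printf("7B, n_ctx = 2048: %7.1f MB\n", kv_size(4096, 2048, 32) / 1024.0 / 1024.0); // 1024.0 MB (old table: 1026 MB)
        return 0;
    }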
@@ -1069,7 +1063,7 @@ static void llama_model_load_internal(
     {
         model.buf.resize(ctx_size);
         if (use_mlock) {
-            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.init   (model.buf.addr);
             model.mlock_buf.grow_to(model.buf.size);
         }
 
@@ -1186,11 +1180,11 @@ static void llama_model_load_internal(
             mmapped_size - vram_weights + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
+            MEM_REQ_EVAL().at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
+            scale*hparams.kv_size();
 
         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1231,15 +1225,15 @@ static void llama_model_load_internal(
                 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
             } else {
                 fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+                vram_kv_cache += hparams.kv_size() / 2;
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
             if (low_vram) {
                 fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
             } else {
                 fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+                vram_kv_cache += hparams.kv_size() / 2;
             }
         }
 #elif defined(GGML_USE_CLBLAST)
@@ -1739,10 +1733,12 @@ static bool llama_eval_internal(
     }
 
 #if 0
-    printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
            ggml_used_mem(ctx0)/1024.0/1024.0,
            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-           lctx.get_buf_max_mem(1)/1024.0/1024.0);
+           lctx.get_buf_max_mem(1)/1024.0/1024.0,
+           lctx.work_buffer.size()/1024.0/1024.0,
+           n_past, N);
 #endif
 
     ggml_free(ctx0);
@@ -2448,8 +2444,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
-        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
+        case LLAMA_FTYPE_MOSTLY_F16:  quantized_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break;
 
 #ifdef GGML_USE_K_QUANTS
         // K-quants
@@ -2533,16 +2529,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            bool convert_incompatible_tensor = false;
-            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
-                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
-                int nx = tensor.ne.at(0);
-                int ny = tensor.ne.at(1);
-                if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
             if (tensor.name == "output.weight") {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
@@ -2568,6 +2554,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+            bool convert_incompatible_tensor = false;
+            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    convert_incompatible_tensor = true;
+                }
+            }
             if (convert_incompatible_tensor) {
                 if (tensor.name == "output.weight") {
                     new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
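The divisibility check now inspects new_type, i.e. the type after the per-tensor overrides above, rather than the base quantized_type. A small illustration of what it guards against (my sketch; the 3200-wide tensor is an assumed OpenLLaMA-3B-like example, not taken from the commit):

    #include <cstdio>

    int main() {
        const int QK_K = 256;                // k-quant super-block size

        const int nx = 3200, ny = 32000;     // assumed tensor shape; 3200 % 256 == 128
        if (nx % QK_K != 0 || ny % QK_K != 0) {
            // the quantizer sets convert_incompatible_tensor here and falls back
            // to a non-k-quant type instead of failing
            printf("tensor %d x %d is not divisible by %d, required for k-quants\n", nx, ny, QK_K);
        }
        return 0;
    }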
@@ -2594,7 +2590,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             f32_data = (float *) f32_conv_buf.addr;
         }
 
-        printf("quantizing .. ");
+        printf("quantizing to %s .. ", ggml_type_name(new_type));
         fflush(stdout);
 
         work.resize(nelements * 4); // upper bound on size
@@ -2775,7 +2771,7 @@ struct llama_context * llama_new_context_with_model(
             ctx->embedding.resize(hparams.n_embd);
         }
 
-        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
+        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
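Putting the retuned buffers together, a rough per-context total for a 7B model at n_ctx = 512 (my arithmetic, weights excluded; n_embd = 4096 and n_layer = 32 are assumed):

    #include <cstdio>
    #include <cstddef>

    int main() {
        const size_t MB    = 1024*1024;
        const size_t n_ctx = 512;

        // MODEL_7B buffer sizes after this commit
        const size_t buf_compute = 10 * MB;                       // MEM_REQ_EVAL()
        const size_t scratch0    = (n_ctx / 16 + 100) * MB;       // MEM_REQ_SCRATCH0 -> 132 MB
        const size_t scratch1    = 160 * MB;                      // MEM_REQ_SCRATCH1
        const size_t kv_self     = 2ull * 4096 * n_ctx * 32 * 2;  // hparams.kv_size() -> 256 MB

        printf("7B @ n_ctx = 512: ~%zu MB of per-context buffers\n",
               (buf_compute + scratch0 + scratch1 + kv_self) / MB); // ~558 MB
        return 0;
    }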
