@@ -98,57 +98,42 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 }
 
 //
-// memory sizes
+// memory sizes (calculated for n_batch == 512)
 //
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
 {
     static std::map<e_model, size_t> k_sizes = {
-        /* empirical scaling, still a guess */
-        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
-        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
-        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
-        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
-        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
+        { MODEL_3B,   ((size_t) n_ctx / 16ull +  92ull) * MB },
+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 100ull) * MB },
+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 120ull) * MB },
+        { MODEL_30B,  ((size_t) n_ctx /  9ull + 160ull) * MB },
+        { MODEL_65B,  ((size_t) n_ctx /  6ull + 256ull) * MB }, // guess
     };
     return k_sizes;
 }
 
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   256ull * MB },
-        { MODEL_7B,   512ull * MB },
-        { MODEL_13B,  512ull * MB },
-        { MODEL_30B,  512ull * MB },
-        { MODEL_65B, 1024ull * MB },
+        { MODEL_3B,  128ull * MB },
+        { MODEL_7B,  160ull * MB },
+        { MODEL_13B, 192ull * MB },
+        { MODEL_30B, 256ull * MB },
+        { MODEL_65B, 384ull * MB }, // guess
     };
     return k_sizes;
 }
 
-// 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
+// used to store the compute graph tensors + non-scratch data
+static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
     static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,   682ull * MB },
-        { MODEL_7B,  1026ull * MB },
-        { MODEL_13B, 1608ull * MB },
-        { MODEL_30B, 3124ull * MB },
-        { MODEL_65B, 5120ull * MB },
-    };
-    return k_sizes;
-}
-
-// this is mostly needed for temporary mul_mat buffers to dequantize the data
-// not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL(int n_ctx)
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB },
-        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
-        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
-        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
-        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
+        { MODEL_3B,   8ull * MB },
+        { MODEL_7B,  10ull * MB },
+        { MODEL_13B, 12ull * MB },
+        { MODEL_30B, 16ull * MB },
+        { MODEL_65B, 24ull * MB }, // guess
     };
     return k_sizes;
 }
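For a sense of scale (this arithmetic is mine, not part of the patch): with an assumed n_ctx of 2048, the new 7B entries work out to roughly 228 MB of scratch0, 160 MB of scratch1 and 10 MB for the eval buffer, down from 384 MB, 512 MB and 776 MB under the old tables. A minimal stand-alone sketch of that calculation:

    // sketch only: evaluates the new MODEL_7B table entries at an assumed n_ctx
    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t MB    = 1024*1024; // same unit the tables multiply by
        const size_t n_ctx = 2048;      // assumed context size, for illustration only
        const size_t scratch0 = (n_ctx / 16 + 100) * MB; // new MEM_REQ_SCRATCH0 entry for MODEL_7B
        const size_t scratch1 = 160 * MB;                // new MEM_REQ_SCRATCH1 entry
        const size_t eval     =  10 * MB;                // new MEM_REQ_EVAL entry
        printf("7B host buffers: %zu MB\n", (scratch0 + scratch1 + eval) / MB); // prints 398
        return 0;
    }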
@@ -199,6 +184,15 @@ struct llama_hparams {
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
+
+    size_t kv_size() const {
+        size_t result = 2ull;
+        result *= (size_t) n_embd;
+        result *= (size_t) n_ctx;
+        result *= (size_t) n_layer;
+        result *= sizeof(ggml_fp16_t);
+        return result;
+    }
 };
 
 struct llama_layer {
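The new kv_size() helper computes the fp16 K/V cache size exactly as 2 * n_embd * n_ctx * n_layer * sizeof(ggml_fp16_t), replacing the hard-coded MEM_REQ_KV_SELF table removed above. As a rough cross-check (the hyperparameters below are assumed 7B-class values, not taken from the patch): with n_embd = 4096, n_layer = 32 and n_ctx = 2048,

    2 * 4096 * 2048 * 32 * 2 bytes = 1,073,741,824 bytes ≈ 1024 MB,

which lines up with the 1026 MB that the removed table listed for MODEL_7B.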
@@ -1069,7 +1063,7 @@ static void llama_model_load_internal(
     {
         model.buf.resize(ctx_size);
         if (use_mlock) {
-            model.mlock_buf.init(model.buf.addr);
+            model.mlock_buf.init   (model.buf.addr);
             model.mlock_buf.grow_to(model.buf.size);
         }
 
@@ -1186,11 +1180,11 @@ static void llama_model_load_internal(
             mmapped_size - vram_weights + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
+            MEM_REQ_EVAL().at(model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
+            scale*hparams.kv_size();
 
         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1231,15 +1225,15 @@ static void llama_model_load_internal(
                 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
             } else {
                 fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
-                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+                vram_kv_cache += hparams.kv_size() / 2;
             }
         }
         if (n_gpu_layers > (int) hparams.n_layer + 2) {
             if (low_vram) {
                 fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
             } else {
                 fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
-                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+                vram_kv_cache += hparams.kv_size() / 2;
             }
         }
 #elif defined(GGML_USE_CLBLAST)
@@ -1739,10 +1733,12 @@ static bool llama_eval_internal(
     }
 
 #if 0
-    printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
+    printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
            ggml_used_mem(ctx0)/1024.0/1024.0,
            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-           lctx.get_buf_max_mem(1)/1024.0/1024.0);
+           lctx.get_buf_max_mem(1)/1024.0/1024.0,
+           lctx.work_buffer.size()/1024.0/1024.0,
+           n_past, N);
 #endif
 
     ggml_free(ctx0);
@@ -2448,8 +2444,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
-        case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
+        case LLAMA_FTYPE_MOSTLY_F16:  quantized_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break;
 
 #ifdef GGML_USE_K_QUANTS
         // K-quants
@@ -2533,16 +2529,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            bool convert_incompatible_tensor = false;
-            if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
-                quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
-                int nx = tensor.ne.at(0);
-                int ny = tensor.ne.at(1);
-                if (nx % QK_K != 0 || ny % QK_K != 0) {
-                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
             if (tensor.name == "output.weight") {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
@@ -2568,6 +2554,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+            bool convert_incompatible_tensor = false;
+            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+                int nx = tensor.ne.at(0);
+                int ny = tensor.ne.at(1);
+                if (nx % QK_K != 0 || ny % QK_K != 0) {
+                    fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+                    convert_incompatible_tensor = true;
+                }
+            }
             if (convert_incompatible_tensor) {
                 if (tensor.name == "output.weight") {
                     new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
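Relocating the divisibility check below the per-tensor overrides means it now tests new_type, the type actually selected for the tensor (including the output.weight and attention.wv adjustments above), instead of the file-level quantized_type. A minimal sketch of the check on its own, with hypothetical dimensions and assuming the default k-quants super-block size QK_K = 256:

    #include <cstdio>

    int main() {
        const int QK_K = 256;       // assumed default super-block size
        int nx = 4096, ny = 32001;  // hypothetical output.weight with an extended vocab
        bool convert_incompatible_tensor = (nx % QK_K != 0) || (ny % QK_K != 0);
        printf("convert_incompatible_tensor = %d\n", convert_incompatible_tensor); // prints 1 -> fall back to F16
        return 0;
    }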
@@ -2594,7 +2590,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             f32_data = (float *) f32_conv_buf.addr;
         }
 
-        printf("quantizing .. ");
+        printf("quantizing to %s .. ", ggml_type_name(new_type));
         fflush(stdout);
 
         work.resize(nelements * 4); // upper bound on size
@@ -2775,7 +2771,7 @@ struct llama_context * llama_new_context_with_model(
            ctx->embedding.resize(hparams.n_embd);
        }
 
-       ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
+       ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
 
        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));