@@ -886,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }
 
+struct llama_model_quantize_params llama_model_quantize_default_params() {
+    struct llama_model_quantize_params result = {
+        /*.nthread                     =*/ 0,
+        /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.allow_requantize            =*/ false,
+        /*.quantize_output_tensor      =*/ true,
+    };
+
+    return result;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
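A minimal sketch of how a caller might pick up these defaults and override individual fields. The struct fields and the default values come from the initializer above; the chosen override values here are purely illustrative:

```cpp
// Illustrative caller: start from the library defaults, then override
// only the fields of interest.
llama_model_quantize_params params = llama_model_quantize_default_params();
params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_0; // target quantization type
params.nthread = 8;                       // worker threads (default 0 = library-chosen)
// allow_requantize and quantize_output_tensor keep their defaults
// (false / true, per the initializer above).
```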
@@ -2231,9 +2242,70 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+    if (output.size < nelements * sizeof(float)) {
+        output.resize(nelements * sizeof(float));
+    }
+    float * f32_output = (float *) output.addr;
+
+    quantize_fns_t qtype;
+    if (ggml_is_quantized(tensor.type)) {
+        qtype = ggml_internal_get_quantize_fn(tensor.type);
+        if (qtype.dequantize_row_q == NULL) {
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+        }
+    } else if (tensor.type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    }
+
+    if (nthread < 2) {
+        if (tensor.type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor.type)) {
+            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        } else {
+            LLAMA_ASSERT(false); // unreachable
+        }
+        return;
+    }
+
+    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+    auto block_size_bytes = ggml_type_size(tensor.type);
+
+    LLAMA_ASSERT(nelements % block_size == 0);
+    auto nblocks = nelements / block_size;
+    auto blocks_per_thread = nblocks / nthread;
+    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    std::vector<std::thread> workers;
+    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+            if (typ == GGML_TYPE_F16) {
+                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+            } else {
+                qtype.dequantize_row_q(inbuf, outbuf, nels);
+            }
+        };
+        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        in_buff_offs += thr_block_bytes;
+        out_buff_offs += thr_elems;
+    }
+    for (auto & worker : workers) {
+        worker.join();
+    }
+
+}
+
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
-    switch (ftype) {
+    llama_ftype ftype = params->ftype;
+    int nthread = params->nthread;
+
+    switch (params->ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
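The threading in the new `llama_convert_tensor_internal` splits the tensor into whole quantization blocks, gives every thread an equal share, and hands any remainder to the last thread. A standalone sketch of just that partitioning arithmetic, independent of ggml and only for illustration (tensor sizes and thread count here are made up):

```cpp
#include <cassert>
#include <cstdio>

// Illustrative only: mirrors the block-partitioning arithmetic used above.
// nelements and block_size would come from the tensor; nthread from params.
static void print_partition(size_t nelements, size_t block_size, int nthread) {
    assert(nelements % block_size == 0);
    size_t nblocks           = nelements / block_size;
    size_t blocks_per_thread = nblocks / nthread;
    size_t spare_blocks      = nblocks - blocks_per_thread * nthread; // remainder goes to the last thread

    for (int tnum = 0; tnum < nthread; tnum++) {
        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0);
        printf("thread %d: %zu blocks (%zu elements)\n", tnum, thr_blocks, thr_blocks * block_size);
    }
}

int main() {
    // e.g. a hypothetical 4096 x 11008 Q4_0 tensor (32 elements per block), 8 threads
    print_partition(4096ull * 11008ull, 32, 8);
    return 0;
}
```

Because offsets advance in whole blocks, each worker reads and writes a disjoint region, so no synchronization beyond the final `join()` is needed.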
@@ -2259,7 +2331,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);
 
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
@@ -2301,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= (tensor.ne.size() == 2);
 
         // uncomment this to keep the output layer in FP16
-        // if (tensor.name == "output.weight") {
-        //     quantize = false;
-        // }
+        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+            quantize = false;
+        }
+        quantize = quantize && quantized_type != tensor.type;
 
         enum ggml_type new_type;
         void * new_data;
@@ -2346,17 +2419,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
+
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
-            } else if (tensor.type == GGML_TYPE_F16) {
-                f32_conv_buf.resize(nelements * sizeof(float));
-                f32_data = (float *) f32_conv_buf.addr;
-                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
-                for (size_t i = 0; i < nelements; i++) {
-                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-                }
+            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
-                throw std::runtime_error(format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)));
+                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.addr;
             }
 
             printf("quantizing .. ");
@@ -2566,10 +2636,9 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype,
-        int nthread) {
+        const llama_model_quantize_params * params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
+        llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
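Taken together, a call through the updated C API looks roughly like the sketch below. Only the two functions, the params struct, and the return convention (0 on success) come from this diff; the file paths and error handling are hypothetical:

```cpp
#include "llama.h"
#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.nthread                = 4;
    params.ftype                  = LLAMA_FTYPE_MOSTLY_Q5_1;
    params.allow_requantize       = false; // refuse to requantize an already-quantized model
    params.quantize_output_tensor = true;  // also quantize output.weight

    // Paths are placeholders for illustration.
    if (llama_model_quantize("models/7B/ggml-model-f16.bin",
                             "models/7B/ggml-model-q5_1.bin", &params) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```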