@@ -15342,20 +15342,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
-            if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
-            else {
-                if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ3_XXS;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
-            }
-            ++qs.i_attention_wv;
-        }
-        else if (qs.model.hparams.n_expert >= 4) {
+        if (qs.model.hparams.n_expert >= 4) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
-            new_type = GGML_TYPE_Q8_0;
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ) new_type = GGML_TYPE_Q6_K;
+            else new_type = GGML_TYPE_Q8_0;
         }
         else if (qs.model.hparams.n_gqa() >= 7) {
             // The Llama 70B models have 8 heads sharing the same attn_v weights (-> GQA 8). As a result, the attn_v.weight tensor is
@@ -15365,20 +15358,28 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
                 new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) {
-            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K ;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ) {
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS ;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
-            new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS ;
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
@@ -15394,9 +15395,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (qs.model.hparams.n_expert >= 4) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
-                ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q6_K;
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
+                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ) new_type = GGML_TYPE_Q6_K;
             else new_type = GGML_TYPE_Q8_0;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
@@ -15427,16 +15428,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q4_K;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || 
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
     } else if (name.find("attn_q.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { 
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
@@ -15461,7 +15463,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
-                    (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) {
+                 (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
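
A minimal standalone sketch of the selection pattern these hunks implement may help: the attn_v.weight branch checks the expert count first, then falls through per-ftype rules that bump the tensor type whenever the model uses GQA or MoE. Everything below (`sketch_hparams`, `attn_v_type`, the string-based ftype) is a hypothetical stand-in for illustration, not the llama.cpp API; the real logic lives in `llama_tensor_get_type` and switches on `llama_ftype` enum values.

```cpp
// Hedged sketch of the post-patch attn_v.weight selection, under the
// assumptions stated above (names and string-based ftypes are hypothetical).
#include <cstdio>
#include <string>

enum sketch_type { Q3_K, Q4_K, Q6_K, Q8_0, IQ3_XXS, IQ3_S, IQ4_XS };

struct sketch_hparams {
    int n_expert;   // MoE expert count (8 for Mixtral-style models)
    int n_head;     // attention heads
    int n_head_kv;  // key/value heads; n_head / n_head_kv is the GQA factor
    int n_gqa() const { return n_head_kv > 0 ? n_head / n_head_kv : 0; }
};

sketch_type attn_v_type(const sketch_hparams & hp, const std::string & ftype) {
    // GQA and MoE models pack more information into attn_v, so it gets a
    // bigger type than the nominal ftype would suggest.
    const bool grouped = hp.n_gqa() >= 2 || hp.n_expert >= 2;
    if (hp.n_expert >= 4) {
        // 1/2-bit ftypes get Q6_K; everything else Q8_0 (~128MB for 8 experts).
        const bool low_bit = ftype.rfind("IQ1", 0) == 0 ||
                             ftype.rfind("IQ2", 0) == 0 ||
                             ftype.rfind("Q2_K", 0) == 0;
        return low_bit ? Q6_K : Q8_0;
    }
    if (ftype == "IQ1_S"   || ftype == "IQ1_M")  return grouped ? IQ4_XS : IQ3_XXS;
    if (ftype == "IQ2_XXS" || ftype == "IQ2_XS") return grouped ? Q4_K : IQ3_XXS;
    if (ftype == "IQ2_S"   || ftype == "IQ2_M")  return grouped ? Q4_K : IQ3_S;
    if (ftype == "Q2_K"    || ftype == "Q2_K_S") return grouped ? Q4_K : Q3_K;
    if (ftype == "IQ3_XXS")                      return grouped ? Q4_K : IQ3_S;
    return Q4_K; // higher-bit branches elided
}

int main() {
    const sketch_hparams llama70b = {1, 64, 8}; // GQA factor 8, no experts
    std::printf("IQ2_S attn_v on a GQA-8 model -> %d (expect Q4_K = %d)\n",
                (int)attn_v_type(llama70b, "IQ2_S"), (int)Q4_K);
    return 0;
}
```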