@@ -286,6 +286,7 @@ enum llm_kv {
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -364,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,                "general.source.url"                    },
     { LLM_KV_GENERAL_SOURCE_HF_REPO,            "general.source.huggingface.repository" },
 
-    { LLM_KV_VOCAB_SIZE,                    "%s.vocab_size"                 },
-    { LLM_KV_CONTEXT_LENGTH,                "%s.context_length"             },
-    { LLM_KV_EMBEDDING_LENGTH,              "%s.embedding_length"           },
-    { LLM_KV_BLOCK_COUNT,                   "%s.block_count"                },
-    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,     "%s.leading_dense_block_count"  },
-    { LLM_KV_FEED_FORWARD_LENGTH,           "%s.feed_forward_length"        },
-    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,    "%s.expert_feed_forward_length" },
-    { LLM_KV_USE_PARALLEL_RESIDUAL,         "%s.use_parallel_residual"      },
-    { LLM_KV_TENSOR_DATA_LAYOUT,            "%s.tensor_data_layout"         },
-    { LLM_KV_EXPERT_COUNT,                  "%s.expert_count"               },
-    { LLM_KV_EXPERT_USED_COUNT,             "%s.expert_used_count"          },
-    { LLM_KV_EXPERT_SHARED_COUNT,           "%s.expert_shared_count"        },
-    { LLM_KV_EXPERT_WEIGHTS_SCALE,          "%s.expert_weights_scale"       },
-    { LLM_KV_POOLING_TYPE ,                 "%s.pooling_type"               },
-    { LLM_KV_LOGIT_SCALE,                   "%s.logit_scale"                },
+    { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                         },
+    { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                     },
+    { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length"                   },
+    { LLM_KV_BLOCK_COUNT,                       "%s.block_count"                        },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count"          },
+    { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length"                },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length"         },
+    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length"  },
+    { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual"              },
+    { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout"                 },
+    { LLM_KV_EXPERT_COUNT,                      "%s.expert_count"                       },
+    { LLM_KV_EXPERT_USED_COUNT,                 "%s.expert_used_count"                  },
+    { LLM_KV_EXPERT_SHARED_COUNT,               "%s.expert_shared_count"                },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"               },
+    { LLM_KV_POOLING_TYPE ,                     "%s.pooling_type"                       },
+    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                        },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,              "%s.attention.head_count"               },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,           "%s.attention.head_count_kv"            },
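Note on the new metadata key: `LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH` maps to `"%s.expert_shared_feed_forward_length"`, where `%s` is substituted with the architecture name (`"qwen2moe"` for `LLM_ARCH_QWEN2MOE`). A minimal standalone sketch (not part of the patch) of probing a converted file for the key via ggml's gguf API; `qwen2moe.gguf` is a placeholder path, not a file shipped with this change:

```cpp
// Probe a GGUF file for the new key using ggml's gguf API.
#include "ggml.h"

#include <cstdio>

int main() {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file("qwen2moe.gguf", params); // placeholder path
    if (!ctx) {
        fprintf(stderr, "failed to load gguf\n");
        return 1;
    }

    // key name = architecture prefix + ".expert_shared_feed_forward_length"
    const int kid = gguf_find_key(ctx, "qwen2moe.expert_shared_feed_forward_length");
    if (kid >= 0) {
        printf("n_ff_shexp = %u\n", gguf_get_val_u32(ctx, kid));
    } else {
        printf("key absent; the loader falls back to n_ff\n"); // pre-patch conversions
    }

    gguf_free(ctx);
    return 0;
}
```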
@@ -1970,6 +1972,7 @@ struct llama_hparams {
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float    expert_weights_scale = 0.0;
 
@@ -2018,6 +2021,7 @@ struct llama_hparams {
         if (this->n_lora_q != other.n_lora_q) return true;
         if (this->n_lora_kv != other.n_lora_kv) return true;
         if (this->n_ff_exp != other.n_ff_exp) return true;
+        if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;
 
         if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -4455,6 +4459,9 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_QWEN2MOE:
             {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
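Note: the trailing `false` passed to `ml.get_key(...)` marks both keys as optional, so GGUF files converted before these keys existed still load cleanly; in that case `hparams.n_ff_exp` and `hparams.n_ff_shexp` keep their zero defaults and the fallback sizes in `llm_load_tensors` below take over.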
@@ -5240,6 +5247,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
+
+    if (model.arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp   = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+    }
 }
 
 // Returns false if cancelled by progress_callback
@@ -6026,16 +6038,17 @@ static bool llm_load_tensors(
                     GGML_ASSERT(hparams.n_expert_used > 0);
 
                     // MoE branch
-                    auto n_ff_exp = n_ff / hparams.n_expert_used;
+                    auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
                     layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
                     layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
                     layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
 
                     // Shared expert branch
+                    auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
                     layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                    layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff  });
-                    layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {  n_ff, n_embd});
-                    layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff  });
+                    layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {    n_embd, n_ff_shexp});
+                    layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp,     n_embd});
+                    layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {    n_embd, n_ff_shexp});
                 }
             } break;
         case LLM_ARCH_PHI2:
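The two ternaries above are what keep old conversions working: when the optional keys are absent, both hparams fields stay 0 and the pre-existing heuristics (`n_ff / n_expert_used` for the routed experts, `n_ff` for the shared expert) are used. A self-contained sketch of that fallback arithmetic, with assumed values loosely modeled on a Qwen1.5-MoE-A2.7B-style config (the numbers are illustrative, not read from any file):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // assumed hyperparameters, for illustration only
    uint32_t n_ff          = 5632;  // dense FFN width from "%s.feed_forward_length"
    uint32_t n_expert_used = 4;     // experts routed per token
    uint32_t n_ff_exp      = 0;     // 0 => key absent in an older GGUF
    uint32_t n_ff_shexp    = 0;     // 0 => key absent in an older GGUF

    // same fallback logic as the patch
    uint32_t ff_exp   = n_ff_exp   ? n_ff_exp   : n_ff / n_expert_used;
    uint32_t ff_shexp = n_ff_shexp ? n_ff_shexp : n_ff;

    printf("routed expert FFN width: %u\n", ff_exp);   // 1408
    printf("shared expert FFN width: %u\n", ff_shexp); // 5632
    return 0;
}
```

Models whose shared-expert width is not simply `n_ff` (the case the new keys exist for) would previously have failed the shape check in `ml.create_tensor` when loading the `ffn_*_shexp` tensors.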