@@ -578,8 +578,11 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
         { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
         { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
         { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
         { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
         { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_DOWN_EXP,  "blk.%d.ffn_down.%d" },
+        { LLM_TENSOR_FFN_UP_EXP,    "blk.%d.ffn_up.%d" },
     },
 },
 {
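Each %d in these templates is filled when a concrete tensor is resolved: the layer index first, and for the new *_EXP entries an expert index second, so per-expert tensors land at names like blk.0.ffn_down.3. A minimal sketch of that expansion, using plain snprintf purely for illustration (the real lookup goes through the tn() helper):

    // illustration only: how the "%d" templates above expand into GGUF tensor names
    #include <cstdio>

    int main() {
        char name[64];
        // per-layer tensor: the single %d takes the layer index
        std::snprintf(name, sizeof(name), "blk.%d.ffn_gate_inp", 0);
        std::puts(name); // blk.0.ffn_gate_inp
        // per-expert tensor: layer index first, expert index second
        std::snprintf(name, sizeof(name), "blk.%d.ffn_down.%d", 0, 3);
        std::puts(name); // blk.0.ffn_down.3
        return 0;
    }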
@@ -1425,16 +1428,20 @@ struct llama_layer {
     struct ggml_tensor * ffn_down; // w2
     struct ggml_tensor * ffn_up;   // w3
 
+    // ff bias
+    struct ggml_tensor * ffn_down_b; // b2
+    struct ggml_tensor * ffn_up_b;   // b3
+    struct ggml_tensor * ffn_act;
+
     // ff MoE
     struct ggml_tensor * ffn_gate_inp;
     struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
     struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
     struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
 
-    // ff bias
-    struct ggml_tensor * ffn_down_b; // b2
-    struct ggml_tensor * ffn_up_b;   // b3
-    struct ggml_tensor * ffn_act;
+    // ff MoE bias
+    struct ggml_tensor * ffn_down_b_exp[LLAMA_MAX_EXPERTS];
+    struct ggml_tensor * ffn_up_b_exp [LLAMA_MAX_EXPERTS];
 };
 
 struct llama_kv_cell {
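The per-expert members are fixed-size arrays bounded by LLAMA_MAX_EXPERTS; only the first hparams.n_expert slots get populated by the loader. A hypothetical helper (not part of the patch, and assuming unused slots stay nullptr, which holds when the layer structs are value-initialized) that recovers the populated count:

    // hypothetical: count the expert FFN slots actually loaded for a layer,
    // assuming the loader leaves unused entries as nullptr
    static int count_loaded_experts(const llama_layer & layer) {
        int n = 0;
        for (int x = 0; x < LLAMA_MAX_EXPERTS; ++x) {
            if (layer.ffn_down_exp[x] != nullptr) {
                ++n;
            }
        }
        return n;
    }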
@@ -3696,11 +3703,29 @@ static bool llm_load_tensors(
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                     layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
 
-                    layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+                    layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
 
-                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff});
+                    if (layer.ffn_gate_inp == nullptr) {
+                        GGML_ASSERT(hparams.n_expert      == 0);
+                        GGML_ASSERT(hparams.n_expert_used == 0);
+
+                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff});
+                    } else {
+                        GGML_ASSERT(hparams.n_expert      > 0);
+                        GGML_ASSERT(hparams.n_expert_used > 0);
+
+                        for (uint32_t x = 0; x < hparams.n_expert; ++x) {
+                            layer.ffn_down_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), {n_ff, n_embd});
+                            layer.ffn_down_b_exp[x] = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN_EXP, "bias", i, x),   {n_embd});
+
+                            layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
+                            layer.ffn_up_b_exp[x] = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP_EXP, "bias", i, x),   {n_ff});
+                        }
+                    }
                 }
             } break;
         case LLM_ARCH_PLAMO:
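This hunk relies on create_tensor's trailing false argument marking the tensor as optional, so a model file without ffn_gate_inp yields nullptr and takes the dense path. The asserts then pin down the invariant; restated as a self-contained predicate (the n_expert_used <= n_expert condition is an extra sanity check, not in the patch):

    #include <cstdint>

    // dense iff the gate tensor is absent; MoE iff both expert counts are positive
    static bool ffn_config_valid(bool has_gate_inp, uint32_t n_expert, uint32_t n_expert_used) {
        if (!has_gate_inp) {
            return n_expert == 0 && n_expert_used == 0; // dense FFN path
        }
        return n_expert > 0 && n_expert_used > 0 && n_expert_used <= n_expert; // MoE path
    }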
@@ -5704,14 +5729,70 @@ struct llm_build_context {
         }
 
         // FF
-        {
+        if (model.layers[il].ffn_gate_inp == nullptr) {
             ffn_output = llm_build_ffn(ctx0, attn_norm_output,
                     model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
                     NULL,                      NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                     NULL,
                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
             cb(ffn_output, "ffn_out", il);
+        } else {
+            // MoE branch
+            ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
+            cb(logits, "ffn_moe_logits", il);
+
+            ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
+            cb(probs, "ffn_moe_probs", il);
+
+            // select experts
+            ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
+            cb(selected_experts->src[0], "ffn_moe_argsort", il);
+
+            ggml_tensor * weights = ggml_get_rows(ctx0,
+                    ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
+            cb(weights, "ffn_moe_weights", il);
+
+            weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
+
+            ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
+            cb(weights_sum, "ffn_moe_weights_sum", il);
+
+            weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
+            cb(weights, "ffn_moe_weights_norm", il);
+
+            // compute expert outputs
+            ggml_tensor * moe_out = nullptr;
+
+            for (int i = 0; i < n_expert_used; ++i) {
+                ggml_tensor * cur_expert;
+
+                ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
+#pragma message "TODO: implement ggml_add_id"
+                //cur_up = ggml_add_id(ctx0, cur_up, model.layers[il].ffn_up_b_exp, n_expert, selected_experts, i);
+                cb(cur_up, "ffn_moe_up", il);
+
+                cur_up = ggml_gelu(ctx0, cur_up);
+                cb(cur_up, "ffn_moe_gelu", il);
+
+                cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_up); // [n_tokens, n_embd]
+#pragma message "TODO: implement ggml_add_id"
+                //cur_expert = ggml_add_id(ctx0, cur_expert, model.layers[il].ffn_down_b_exp, n_expert, selected_experts, i);
+                cb(cur_expert, "ffn_moe_down", il);
+
+                cur_expert = ggml_mul(ctx0, cur_expert,
+                        ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
+                cb(cur_expert, "ffn_moe_weighted", il);
+
+                if (i == 0) {
+                    moe_out = cur_expert;
+                } else {
+                    moe_out = ggml_add(ctx0, moe_out, cur_expert);
+                    cb(moe_out, "ffn_moe_out", il);
+                }
+            }
+
+            ffn_output = moe_out;
         }
 
         cur = ggml_add(ctx0, cur, ffn_output);
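The routing graph above computes, per token: a softmax over the gate logits, the top n_expert_used probabilities, a renormalization so the kept weights sum to 1, and finally a weighted sum of the selected experts' outputs. A standalone sketch of that arithmetic on plain floats for one token (values made up; ggml_top_k is an argsort plus a view of the first k entries, mirrored here with partial_sort):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        const int n_expert = 4, n_expert_used = 2;
        std::vector<float> logits = {0.1f, 2.0f, -1.0f, 1.5f}; // gate output for one token

        // softmax over the gate logits (ggml_soft_max)
        std::vector<float> probs(n_expert);
        const float mx = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - mx); sum += probs[e]; }
        for (float & p : probs) { p /= sum; }

        // top-k expert indices by probability (ggml_top_k)
        std::vector<int> idx(n_expert);
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                [&](int a, int b) { return probs[a] > probs[b]; });

        // renormalize the kept weights (ggml_sum_rows + ggml_div)
        float wsum = 0.0f;
        for (int k = 0; k < n_expert_used; ++k) { wsum += probs[idx[k]]; }
        for (int k = 0; k < n_expert_used; ++k) {
            std::printf("expert %d weight %.3f\n", idx[k], probs[idx[k]] / wsum);
        }
        // the token's FFN output is then sum_k weight_k * expert_{idx[k]}(x)
        return 0;
    }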