@@ -6496,7 +6496,7 @@ struct llm_build_context {
                         LLM_NORM_RMS, cb, il);
                 cb(cur, "ffn_norm", il);
 
-                cur = build_moe(cur, n_tokens, il);
+                cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, il);
             }
 
             cur = ggml_add(ctx0, cur, ffn_inp);
@@ -6528,7 +6528,8 @@ struct llm_build_context {
         return gf;
     }
 
-    ggml_tensor * build_moe(ggml_tensor * cur, int32_t n_tokens, int il) {
+    // REVIEW: will be replaced by https://github.com/ggerganov/llama.cpp/pull/6505
+    ggml_tensor * build_moe_ffn(ggml_tensor * cur, int32_t n_tokens, llm_ffn_op_type type_op, int il) {
         ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
         cb(logits, "ffn_moe_logits", il);
 
@@ -6560,13 +6561,25 @@ struct llm_build_context {
             ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
             cb(cur_up, "ffn_moe_up", il);
 
-            ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
-            cb(cur_gate, "ffn_moe_gate", il);
+            ggml_tensor * gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
+            cb(gate, "ffn_moe_gate", il);
 
-            cur_gate = ggml_silu(ctx0, cur_gate);
-            cb(cur_gate, "ffn_moe_silu", il);
+            switch (type_op) {
+                case LLM_FFN_SILU:
+                    {
+                        gate = ggml_silu(ctx0, gate);
+                        cb(gate, "ffn_moe_silu", il);
+                    } break;
+                case LLM_FFN_GELU:
+                    {
+                        gate = ggml_gelu(ctx0, gate);
+                        cb(gate, "ffn_moe_gelu", il);
+                    } break;
+                default:
+                    GGML_ASSERT(false);
+            }
 
-            cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
+            cur_expert = ggml_mul(ctx0, cur_up, gate);
             cb(cur_expert, "ffn_moe_gate_par", il);
 
             cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
@@ -7034,7 +7047,7 @@ struct llm_build_context {
                     LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            cur = build_moe(cur, n_tokens, il);
+            cur = build_moe_ffn(cur, n_tokens, LLM_FFN_GELU, il);
 
             // Grok
             // if layer_out_norm is present then apply it before adding the input
@@ -7170,7 +7183,7 @@ struct llm_build_context {
                     LLM_NORM, cb, il);
             cb(cur, "attn_out_norm", il);
 
-            cur = build_moe(cur, n_tokens, il);
+            cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);