Commit f1256dc

llama: rename build_moe to build_moe_ffn and fix Grok to use GELU instead of SiLU.
Do not spend too much time on this function, as it will be replaced in #6505.
1 parent 8e6758f commit f1256dc

File tree: 1 file changed, +22 -9 lines changed

llama.cpp

Lines changed: 22 additions & 9 deletions
@@ -6496,7 +6496,7 @@ struct llm_build_context {
                                  LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            cur = build_moe(cur, n_tokens, il);
+            cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, il);
         }
 
         cur = ggml_add(ctx0, cur, ffn_inp);
@@ -6528,7 +6528,8 @@ struct llm_build_context {
         return gf;
     }
 
-    ggml_tensor * build_moe(ggml_tensor * cur, int32_t n_tokens, int il) {
+    // REVIEW: will be replaced by https://github.com/ggerganov/llama.cpp/pull/6505
+    ggml_tensor * build_moe_ffn(ggml_tensor * cur, int32_t n_tokens, llm_ffn_op_type type_op, int il) {
         ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
         cb(logits, "ffn_moe_logits", il);
 
@@ -6560,13 +6561,25 @@ struct llm_build_context {
             ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
             cb(cur_up, "ffn_moe_up", il);
 
-            ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
-            cb(cur_gate, "ffn_moe_gate", il);
+            ggml_tensor * gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
+            cb(gate, "ffn_moe_gate", il);
 
-            cur_gate = ggml_silu(ctx0, cur_gate);
-            cb(cur_gate, "ffn_moe_silu", il);
+            switch (type_op) {
+                case LLM_FFN_SILU:
+                    {
+                        gate = ggml_silu(ctx0, gate);
+                        cb(gate, "ffn_moe_silu", il);
+                    } break;
+                case LLM_FFN_GELU:
+                    {
+                        gate = ggml_gelu(ctx0, gate);
+                        cb(gate, "ffn_moe_gelu", il);
+                    } break;
+                default:
+                    GGML_ASSERT(false);
+            }
 
-            cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
+            cur_expert = ggml_mul(ctx0, cur_up, gate);
             cb(cur_expert, "ffn_moe_gate_par", il);
 
             cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
@@ -7034,7 +7047,7 @@ struct llm_build_context {
                                  LLM_NORM_RMS, cb, il);
             cb(cur, "ffn_norm", il);
 
-            cur = build_moe(cur, n_tokens, il);
+            cur = build_moe_ffn(cur, n_tokens, LLM_FFN_GELU, il);
 
             // Grok
             // if layer_out_norm is present then apply it before adding the input
@@ -7170,7 +7183,7 @@ struct llm_build_context {
                              LLM_NORM, cb, il);
             cb(cur, "attn_out_norm", il);
 
-            cur = build_moe(cur, n_tokens, il);
+            cur = build_moe_ffn(cur, n_tokens, LLM_FFN_SILU, il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "ffn_out", il);
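The substance of the change is the switch on the new llm_ffn_op_type parameter: the expert gate activation is no longer hard-coded to SiLU, so the Grok graph can request GELU while the other MoE graphs keep SiLU. Below is a minimal, self-contained C++ sketch of that dispatch pattern, using scalar math and a hypothetical apply_moe_gate_act helper in place of the real ggml tensor calls (ggml_silu / ggml_gelu); it illustrates the pattern only and is not the llama.cpp implementation.

// Standalone sketch of the activation dispatch introduced by this commit.
// The enum mirrors the two cases handled in build_moe_ffn(); the scalar
// formulas stand in for ggml_silu()/ggml_gelu(), which operate on tensors.
#include <cassert>
#include <cmath>
#include <cstdio>

enum llm_ffn_op_type { LLM_FFN_SILU, LLM_FFN_GELU };

// Hypothetical helper: applies the selected gate activation to one value.
static float apply_moe_gate_act(float x, llm_ffn_op_type type_op) {
    switch (type_op) {
        case LLM_FFN_SILU:
            // SiLU(x) = x * sigmoid(x)
            return x / (1.0f + std::exp(-x));
        case LLM_FFN_GELU:
            // GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
            return 0.5f * x * (1.0f + std::erf(x / std::sqrt(2.0f)));
        default:
            assert(false && "unsupported FFN op type");
            return 0.0f;
    }
}

int main() {
    // Mixtral-style MoE layers keep LLM_FFN_SILU; Grok now passes LLM_FFN_GELU.
    std::printf("silu(1.0) = %f\n", apply_moe_gate_act(1.0f, LLM_FFN_SILU));
    std::printf("gelu(1.0) = %f\n", apply_moe_gate_act(1.0f, LLM_FFN_GELU));
    return 0;
}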
