@@ -2208,7 +2208,7 @@ struct llama_hparams {
     uint32_t n_swa = 0; // sliding window attention (SWA)
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
-    uint32_t n_expert = 0;
+    uint32_t n_expert = 0; // num expert
     uint32_t n_expert_used = 0;
     uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;
@@ -2220,7 +2220,7 @@ struct llama_hparams {
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
-    uint32_t n_ff_exp = 0;
+    uint32_t n_ff_exp = 0; // moe ffn intermediate size
     uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;
@@ -6571,7 +6571,7 @@ static bool llm_load_tensors(
     model.cache_ctx = new sparse_llm_cache_context();

     if (model.moe_cache_params->moe_cache_enabled) {
-        model.expert_buft = ggml_backend_cuda_expert_split_buffer_type(model.cache_ctx);
+        model.expert_buft = ggml_backend_cuda_moe_cache_buffer_type(model.cache_ctx);
     } else {
         switch (model.moe_cache_params->moe_buffer_type) {
             case kEntireHost: { GGML_ABORT("Unimplemented"); }
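
For orientation, the moe_cache_params fields used throughout this diff (moe_cache_enabled, moe_buffer_type, and the counts filled in during llama_model_load further down) suggest a parameter block roughly like the sketch below. This is only an inference from the usages shown in this patch; the actual definition, the field types, and any enum values beyond the three named here are assumptions.

    // Sketch assembled from the usages in this diff, not the real definition.
    enum moe_buffer_type_t { kEntireHost, kLLAMACuda, kLLAMASplit };

    struct moe_cache_params_sketch {
        bool              moe_cache_enabled    = false;      // route expert tensors through the MoE cache buffer type
        moe_buffer_type_t moe_buffer_type      = kLLAMACuda; // placement of expert weights when the cache is disabled
        uint32_t          num_layer            = 0;          // MoE layers only: n_layer - n_layer_dense_lead
        uint32_t          num_expert           = 0;          // routed experts per MoE layer (n_expert)
        uint32_t          num_expert_per_token = 0;          // active experts per token (n_expert_used)
    };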
@@ -7926,6 +7926,16 @@ static bool llm_load_tensors(
                 for (int i = 0; i < n_layer; ++i) {
                     ggml_context * ctx_layer = ctx_for_layer(i);
                     ggml_context * ctx_split = ctx_for_layer_split(i);
+                    ggml_context * ctx_experts = nullptr;
+                    if (model.moe_cache_params->moe_cache_enabled) {
+                        ctx_experts = ctx_map.at(model.expert_buft);
+                    } else if (model.moe_cache_params->moe_buffer_type == kLLAMACuda) {
+                        ctx_experts = ctx_layer;
+                    } else if (model.moe_cache_params->moe_buffer_type == kLLAMASplit) {
+                        ctx_experts = ctx_split;
+                    } else {
+                        ctx_experts = ctx_map.at(model.expert_buft);
+                    }

                     auto & layer = model.layers[i];

@@ -7960,9 +7970,9 @@ static bool llm_load_tensors(
                     GGML_ASSERT(n_expert_used > 0);

                     // MoE branch
-                    layer.ffn_gate_exps = ml.create_tensor(ctx_split,   tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
-                    layer.ffn_down_exps = ml.create_tensor(ctx_split,   tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
-                    layer.ffn_up_exps   = ml.create_tensor(ctx_split,   tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_experts, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
+                    layer.ffn_down_exps = ml.create_tensor(ctx_experts, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
+                    layer.ffn_up_exps   = ml.create_tensor(ctx_experts, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});

                     // Shared expert branch
                     layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared});
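
Each of the three routed-expert tensors above is 3-D, with the experts stacked along the last dimension (ne = {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down). With the standard ggml layout, one expert's weights are an nb[2]-strided slice of the tensor, which is presumably the kind of per-expert view the cache-enabled buffer type hands back through extra->get_expert_param() later in this diff. A minimal sketch for a contiguous tensor:

    // Pointer to the start of a single expert's weights inside a stacked 3-D expert tensor.
    // Assumes the tensor lives contiguously in an ordinary buffer; the MoE cache buffer type
    // may place experts elsewhere and expose them through its tensor 'extra' instead.
    static void * expert_slice(const struct ggml_tensor * t, int expert_id) {
        GGML_ASSERT(expert_id >= 0 && expert_id < (int) t->ne[2]);
        return (char *) t->data + (size_t) expert_id * t->nb[2];
    }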
@@ -8424,6 +8434,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
     }

+    model.moe_cache_params->num_layer            = model.hparams.n_layer - model.hparams.n_layer_dense_lead;
+    model.moe_cache_params->num_expert           = model.hparams.n_expert;
+    model.moe_cache_params->num_expert_per_token = model.hparams.n_expert_used;
+
     llm_load_print_meta(ml, model);

     if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
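
As a concrete example of the values being recorded here, a DeepSeek-V2-Lite-style configuration (hypothetical numbers, not taken from this diff) would resolve to:

    // hypothetical hparams: n_layer = 27, n_layer_dense_lead = 1, n_expert = 64, n_expert_used = 6
    // num_layer            = 27 - 1 = 26   // only the MoE layers are tracked by the cache
    // num_expert           = 64
    // num_expert_per_token = 6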
@@ -8792,7 +8806,8 @@ struct ggml_tensor * llm_build_custom_hook(
     result->src[0] = a;

     ggml_set_op_params(result, &lctx.model.cache_ctx, sizeof(lctx.model.cache_ctx)); // 8bytes, 2 x 32bit int
-    ggml_set_op_params_i32(result, 2, il);
+    auto moe_layer_id = llama_model_layer_to_internal_moe_layer(&lctx.model, il);
+    ggml_set_op_params_i32(result, 2, moe_layer_id);
     ggml_set_op_params_i32(result, 3, hook_op);
     ggml_set_op_params_i32(result, 4, n_outputs);

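
On the consuming side, a backend that executes this hook node would unpack the same op_params layout: the cache_ctx pointer fills the first eight bytes (two 32-bit slots, as the comment notes), so the integer parameters start at index 2. A sketch of the read-back, assuming the internal ggml_get_op_params_i32 helper is available (an equivalent memcpy from node->op_params works as well):

    // Sketch: recover the parameters packed by llm_build_custom_hook from the graph node.
    sparse_llm_cache_context * cache_ctx = nullptr;
    memcpy(&cache_ctx, node->op_params, sizeof(cache_ctx));        // slots 0-1: 8-byte context pointer
    const int32_t moe_layer_id = ggml_get_op_params_i32(node, 2);  // MoE-internal layer id
    const int32_t hook_op      = ggml_get_op_params_i32(node, 3);  // hook type
    const int32_t n_outputs    = ggml_get_op_params_i32(node, 4);  // number of outputs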
@@ -8860,8 +8875,9 @@ static struct ggml_tensor * llm_build_moe_ffn(
     }
     ggml_tensor * experts = ggml_mul_mat_id_by_id(ctx, gate_exps, up_exps, down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]

+    auto moe_layer_id = llama_model_layer_to_internal_moe_layer(&lctx.model, il);
     ggml_set_op_params(experts, &lctx.model.cache_ctx, sizeof(lctx.model.cache_ctx)); // 8bytes, 2 x 32bit int
-    ggml_set_op_params_i32(experts, 2, il);
+    ggml_set_op_params_i32(experts, 2, moe_layer_id);
     // ggml_set_op_params_i32(experts, 3, hook_op);

     experts = ggml_mul(ctx, experts, weights);
@@ -13623,6 +13639,10 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

+            if ((uint32_t) il >= hparams.n_layer_dense_lead) {
+                // this is a moe layer
+                inpL = llm_build_custom_hook(ctx0, lctx, inpL, il, n_outputs, SPARSE_CACHE_HOOK_TYPE_LAYER_LOGITS);
+            }
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm, NULL,
@@ -13802,6 +13822,8 @@ struct llm_build_context {
             inpL = cur;
         }

+        inpL = llm_build_custom_hook(ctx0, lctx, inpL, n_layer, n_outputs, SPARSE_CACHE_HOOK_TYPE_LAYER_LOGITS);
+
         cur = inpL;

         cur = llm_build_norm(ctx0, cur, hparams,
@@ -20409,7 +20431,7 @@ void register_expert_params(llama_model* model) {
     auto capture_func = [cache_ctx, model](ggml_tensor * moe_param, std::string name, int layer_id, int param_idx) {
         auto num_experts = moe_param->ne[2];
         auto moe_layer_id = llama_model_layer_to_internal_moe_layer(model, layer_id);
-        ggml_tensor_extra_split_expert * extra = (ggml_tensor_extra_split_expert *) moe_param->extra;
+        ggml_tensor_extra_moe_cache * extra = (ggml_tensor_extra_moe_cache *) moe_param->extra;
         for (int expert_id = 0; expert_id < num_experts; expert_id++) {
             cache_ctx->register_expert_param(moe_layer_id, expert_id, name, moe_param, extra->get_expert_param(expert_id));
         }
@@ -20427,7 +20449,7 @@ void register_expert_params(llama_model* model) {
         capture_func(layer.ffn_down_exps, "down", layer_id, 2);
     }
 }
-int32_t llama_model_layer_to_internal_moe_layer(llama_model *model, int32_t layer) {
+int32_t llama_model_layer_to_internal_moe_layer(const llama_model *model, int32_t layer) {
     switch (model->arch) {
         case llm_arch::LLM_ARCH_QWEN2MOE:
             return layer;
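
The case visible here returns the graph layer index unchanged, which matches Qwen2-MoE where every layer is an MoE layer. For an architecture with dense lead layers (the DeepSeek-V2-style graph this diff hooks into), the mapping would presumably shift past the dense prefix so that the first MoE layer maps to MoE-internal layer 0. A sketch of such a case, which is an assumption and not part of this hunk:

    // Sketch only: a hypothetical case for an arch with n_layer_dense_lead dense layers
    // before the first MoE layer.
    case llm_arch::LLM_ARCH_DEEPSEEK2:
        return layer - (int32_t) model->hparams.n_layer_dense_lead;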