
Commit e8d247b

refine naming && support deepseek v2 models
1 parent e198ad4 commit e8d247b

File tree

7 files changed: +99 -91 lines changed


ggml/include/custom-hook-common.hpp

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ class ggml_tensor_extra_split_experts : public ggml_tensor_extra_interface {
         return this->per_expert_data[expert_id];
     }
 };
-class ggml_tensor_extra_split_expert : public ggml_tensor_extra_interface {
+class ggml_tensor_extra_moe_cache : public ggml_tensor_extra_interface {
 public:
     int moe_layer_id;
     int param_idx;

ggml/include/custom-hook.hpp

Lines changed: 0 additions & 2 deletions
@@ -19,8 +19,6 @@ struct sparse_llm_cache_context {
         this->profiler = std::make_shared<TimeProfiler>();
     }

-    std::unordered_map<void*, std::vector<void*>> expert_host_param_map; // from extra to host data
-
     void init_components();
     void register_expert_param(int moe_layer_id, int expert_id, std::string param_name, struct ggml_tensor* tensor_meta, void* data);
     void init_gpu_states();

ggml/include/ggml-cuda.h

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int

 // split tensor buffer that splits matrices by rows across multiple devices
 struct sparse_llm_cache_context;
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_expert_split_buffer_type(sparse_llm_cache_context * cache_ctx);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_moe_cache_buffer_type(sparse_llm_cache_context * cache_ctx);
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_expert_host_buffer_type(sparse_llm_cache_context * cache_ctx);
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_expert_cuda_buffer_type(sparse_llm_cache_context * cache_ctx);
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

ggml/src/custom-hook.cpp

Lines changed: 2 additions & 2 deletions
@@ -117,12 +117,12 @@ void sparse_llm_cache_context::init_gpu_states() {
     states->prefetch_mngr->launch_thread();
 }

-void * ggml_tensor_extra_split_expert::get_expert_param(int expert_id) {
+void * ggml_tensor_extra_moe_cache::get_expert_param(int expert_id) {
     // return this->get_expert_param_gpu(expert_id);
     return this->per_expert_host_data[expert_id];
 }

-void *ggml_tensor_extra_split_expert::get_expert_param_gpu(int expert_id) {
+void *ggml_tensor_extra_moe_cache::get_expert_param_gpu(int expert_id) {
     // return this->get_expert_param(expert_id);
     return cache_ctx->states->model_loader->get_source(this->moe_layer_id, expert_id)->reference_to_model_param->get_tensor(this->param_idx).data_ptr();
 }
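For orientation, here is a minimal sketch of the renamed extra-data class as it can be pieced together from the two hunks above. Only the class name, moe_layer_id, param_idx, per_expert_host_data, cache_ctx and the two getters appear in this commit; the container type, the exact cache_ctx declaration, and the comments are assumptions.

class ggml_tensor_extra_moe_cache : public ggml_tensor_extra_interface {
public:
    int moe_layer_id;                           // MoE layer this expert parameter belongs to
    int param_idx;                              // which expert parameter of the layer (the diff registers "down" as index 2)
    std::vector<void *> per_expert_host_data;   // assumed container; indexed by expert_id in get_expert_param()
    sparse_llm_cache_context * cache_ctx;       // assumed declaration; used by get_expert_param_gpu()

    void * get_expert_param(int expert_id);     // host-side pointer to one expert's weights
    void * get_expert_param_gpu(int expert_id); // resolves the same weights through the cache's model loader
};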

ggml/src/ggml-cuda.cu

Lines changed: 62 additions & 74 deletions
Large diffs are not rendered by default.

include/llama.h

Lines changed: 1 addition & 1 deletion
@@ -1189,7 +1189,7 @@ extern "C" {

     struct llama_model;
     void register_expert_params(struct llama_model* model);
-    int32_t llama_model_layer_to_internal_moe_layer(struct llama_model *model, int32_t layer);
+    int32_t llama_model_layer_to_internal_moe_layer(const struct llama_model *model, int32_t layer);
     // struct sparse_llm_cache_llama_context {
     // };

src/llama.cpp

Lines changed: 32 additions & 10 deletions
@@ -2208,7 +2208,7 @@ struct llama_hparams {
     uint32_t n_swa = 0; // sliding window attention (SWA)
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
-    uint32_t n_expert = 0;
+    uint32_t n_expert = 0; // num expert
     uint32_t n_expert_used = 0;
     uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;
@@ -2220,7 +2220,7 @@ struct llama_hparams {
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
-    uint32_t n_ff_exp = 0;
+    uint32_t n_ff_exp = 0; // moe ffn intermediate size
     uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;
@@ -6571,7 +6571,7 @@ static bool llm_load_tensors(
     model.cache_ctx = new sparse_llm_cache_context();

     if (model.moe_cache_params->moe_cache_enabled) {
-        model.expert_buft = ggml_backend_cuda_expert_split_buffer_type(model.cache_ctx);
+        model.expert_buft = ggml_backend_cuda_moe_cache_buffer_type(model.cache_ctx);
     } else {
         switch (model.moe_cache_params->moe_buffer_type) {
             case kEntireHost: { GGML_ABORT("Unimplemented"); }
@@ -7926,6 +7926,16 @@ static bool llm_load_tensors(
         for (int i = 0; i < n_layer; ++i) {
             ggml_context * ctx_layer = ctx_for_layer(i);
             ggml_context * ctx_split = ctx_for_layer_split(i);
+            ggml_context * ctx_experts = nullptr;
+            if (model.moe_cache_params->moe_cache_enabled) {
+                ctx_experts = ctx_map.at(model.expert_buft);
+            } else if (model.moe_cache_params->moe_buffer_type == kLLAMACuda) {
+                ctx_experts = ctx_layer;
+            } else if (model.moe_cache_params->moe_buffer_type == kLLAMASplit) {
+                ctx_experts = ctx_split;
+            } else {
+                ctx_experts = ctx_map.at(model.expert_buft);
+            }

             auto & layer = model.layers[i];

@@ -7960,9 +7970,9 @@ static bool llm_load_tensors(
             GGML_ASSERT(n_expert_used > 0);

             // MoE branch
-            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
-            layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
-            layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
+            layer.ffn_gate_exps = ml.create_tensor(ctx_experts, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
+            layer.ffn_down_exps = ml.create_tensor(ctx_experts, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
+            layer.ffn_up_exps   = ml.create_tensor(ctx_experts, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});

             // Shared expert branch
             layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared});
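In short, the two hunks above give the routed expert tensors a dedicated ggml context: the MoE-cache buffer when moe_cache_enabled is set, otherwise the per-layer or split context selected by moe_buffer_type, with the cache buffer as the fallback, and tensor creation for the gate/down/up expert weights is switched from ctx_split to this ctx_experts.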
@@ -8424,6 +8434,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
     }

+    model.moe_cache_params->num_layer = model.hparams.n_layer - model.hparams.n_layer_dense_lead;
+    model.moe_cache_params->num_expert = model.hparams.n_expert;
+    model.moe_cache_params->num_expert_per_token = model.hparams.n_expert_used;
+
     llm_load_print_meta(ml, model);

     if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
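As a worked example of this initialization (illustrative only; the figures are DeepSeek-V2-Lite's commonly reported configuration, not values taken from this diff): with n_layer = 27, n_layer_dense_lead = 1, n_expert = 64 and n_expert_used = 6, the cache is sized for num_layer = 26 MoE layers of num_expert = 64 experts each, with num_expert_per_token = 6 active per token; the leading dense layer is excluded because it has no experts to cache.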
@@ -8792,7 +8806,8 @@ struct ggml_tensor * llm_build_custom_hook(
     result->src[0] = a;

     ggml_set_op_params(result, &lctx.model.cache_ctx, sizeof(lctx.model.cache_ctx)); // 8bytes, 2 x 32bit int
-    ggml_set_op_params_i32(result, 2, il);
+    auto moe_layer_id = llama_model_layer_to_internal_moe_layer(&lctx.model, il);
+    ggml_set_op_params_i32(result, 2, moe_layer_id);
     ggml_set_op_params_i32(result, 3, hook_op);
     ggml_set_op_params_i32(result, 4, n_outputs);

@@ -8860,8 +8875,9 @@ static struct ggml_tensor * llm_build_moe_ffn(
     }
     ggml_tensor * experts = ggml_mul_mat_id_by_id(ctx, gate_exps, up_exps, down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]

+    auto moe_layer_id = llama_model_layer_to_internal_moe_layer(&lctx.model, il);
     ggml_set_op_params(experts, &lctx.model.cache_ctx, sizeof(lctx.model.cache_ctx)); // 8bytes, 2 x 32bit int
-    ggml_set_op_params_i32(experts, 2, il);
+    ggml_set_op_params_i32(experts, 2, moe_layer_id);
     // ggml_set_op_params_i32(experts, 3, hook_op);

     experts = ggml_mul(ctx, experts, weights);
@@ -13623,6 +13639,10 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

+            if ((uint32_t) il >= hparams.n_layer_dense_lead) {
+                // this is a moe layer
+                inpL = llm_build_custom_hook(ctx0, lctx, inpL, il, n_outputs, SPARSE_CACHE_HOOK_TYPE_LAYER_LOGITS);
+            }
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm, NULL,
@@ -13802,6 +13822,8 @@ struct llm_build_context {
             inpL = cur;
         }

+        inpL = llm_build_custom_hook(ctx0, lctx, inpL, n_layer, n_outputs, SPARSE_CACHE_HOOK_TYPE_LAYER_LOGITS);
+
         cur = inpL;

         cur = llm_build_norm(ctx0, cur, hparams,
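Taken together with the previous hunk, the graph builder touched here (presumably the DeepSeek-V2 one, given the commit title and the n_layer_dense_lead check) now runs the custom hook once before every MoE layer (il >= n_layer_dense_lead) and once more after the last layer, passing the layer input as SPARSE_CACHE_HOOK_TYPE_LAYER_LOGITS while skipping the dense lead layers.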
@@ -20409,7 +20431,7 @@ void register_expert_params(llama_model* model) {
     auto capture_func = [cache_ctx, model](ggml_tensor * moe_param, std::string name, int layer_id, int param_idx) {
         auto num_experts = moe_param->ne[2];
         auto moe_layer_id = llama_model_layer_to_internal_moe_layer(model, layer_id);
-        ggml_tensor_extra_split_expert * extra = (ggml_tensor_extra_split_expert *)moe_param->extra;
+        ggml_tensor_extra_moe_cache * extra = (ggml_tensor_extra_moe_cache *)moe_param->extra;
         for (int expert_id = 0; expert_id < num_experts; expert_id++) {
             cache_ctx->register_expert_param(moe_layer_id, expert_id, name, moe_param, extra->get_expert_param(expert_id));
         }
@@ -20427,7 +20449,7 @@ void register_expert_params(llama_model* model) {
         capture_func(layer.ffn_down_exps, "down", layer_id, 2);
     }
 }
-int32_t llama_model_layer_to_internal_moe_layer(llama_model *model, int32_t layer) {
+int32_t llama_model_layer_to_internal_moe_layer(const llama_model *model, int32_t layer) {
     switch (model->arch) {
         case llm_arch::LLM_ARCH_QWEN2MOE:
             return layer;
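The rendered hunk ends before any DeepSeek-V2 case is visible, so the mapping this commit adds for that architecture cannot be read off the page. Below is a sketch of what it plausibly looks like, consistent with the rest of the commit (the hook is only emitted for il >= n_layer_dense_lead, and num_layer is set to n_layer - n_layer_dense_lead); the LLM_ARCH_DEEPSEEK2 branch and the default case are assumptions, not part of this diff.

// Sketch only: the DEEPSEEK2 branch and the fallback are assumed.
int32_t llama_model_layer_to_internal_moe_layer(const llama_model * model, int32_t layer) {
    switch (model->arch) {
        case llm_arch::LLM_ARCH_QWEN2MOE:
            return layer; // every layer is an MoE layer
        case llm_arch::LLM_ARCH_DEEPSEEK2:
            // skip the leading dense layers so MoE layers are numbered from 0
            return layer - (int32_t) model->hparams.n_layer_dense_lead;
        default:
            return layer;
    }
}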
