@@ -2208,7 +2208,7 @@ struct llama_hparams {
     uint32_t n_swa = 0; // sliding window attention (SWA)
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
-    uint32_t n_expert = 0;
+    uint32_t n_expert = 0; // num expert
     uint32_t n_expert_used = 0;
     uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;
@@ -2220,7 +2220,7 @@ struct llama_hparams {
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
-    uint32_t n_ff_exp = 0;
+    uint32_t n_ff_exp = 0; // moe ffn intermediate size
     uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;
@@ -6571,7 +6571,7 @@ static bool llm_load_tensors(
     model.cache_ctx = new sparse_llm_cache_context();

     if (model.moe_cache_params->moe_cache_enabled) {
-        model.expert_buft = ggml_backend_cuda_expert_split_buffer_type(model.cache_ctx);
+        model.expert_buft = ggml_backend_cuda_moe_cache_buffer_type(model.cache_ctx);
     } else {
         switch (model.moe_cache_params->moe_buffer_type) {
             case kEntireHost: { GGML_ABORT("Unimplemented"); }
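
For orientation, the moe_cache_params fields used throughout this diff (moe_cache_enabled, moe_buffer_type, and the counts filled in during llama_model_load further down) suggest a parameter block roughly like the sketch below. This is only an inference from the usages shown in this patch; the actual definition, the field types, and any enum values beyond the three named here are assumptions.

    // Sketch assembled from the usages in this diff, not the real definition.
    enum moe_buffer_type_t { kEntireHost, kLLAMACuda, kLLAMASplit };

    struct moe_cache_params_sketch {
        bool              moe_cache_enabled    = false;      // route expert tensors through the MoE cache buffer type
        moe_buffer_type_t moe_buffer_type      = kLLAMACuda; // placement of expert weights when the cache is disabled
        uint32_t          num_layer            = 0;          // MoE layers only: n_layer - n_layer_dense_lead
        uint32_t          num_expert           = 0;          // routed experts per MoE layer (n_expert)
        uint32_t          num_expert_per_token = 0;          // active experts per token (n_expert_used)
    };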
@@ -7926,6 +7926,16 @@ static bool llm_load_tensors(
                 for (int i = 0; i < n_layer; ++i) {
                     ggml_context * ctx_layer = ctx_for_layer(i);
                     ggml_context * ctx_split = ctx_for_layer_split(i);
+                    ggml_context * ctx_experts = nullptr;
+                    if (model.moe_cache_params->moe_cache_enabled) {
+                        ctx_experts = ctx_map.at(model.expert_buft);
+                    } else if (model.moe_cache_params->moe_buffer_type == kLLAMACuda) {
+                        ctx_experts = ctx_layer;
+                    } else if (model.moe_cache_params->moe_buffer_type == kLLAMASplit) {
+                        ctx_experts = ctx_split;
+                    } else {
+                        ctx_experts = ctx_map.at(model.expert_buft);
+                    }

                     auto & layer = model.layers[i];

@@ -7960,9 +7970,9 @@ static bool llm_load_tensors(
                     GGML_ASSERT(n_expert_used > 0);

                     // MoE branch
-                    layer.ffn_gate_exps = ml.create_tensor(ctx_split,   tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
-                    layer.ffn_down_exps = ml.create_tensor(ctx_split,   tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
-                    layer.ffn_up_exps   = ml.create_tensor(ctx_split,   tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_experts, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
+                    layer.ffn_down_exps = ml.create_tensor(ctx_experts, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
+                    layer.ffn_up_exps   = ml.create_tensor(ctx_experts, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});

                     // Shared expert branch
                     layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared});
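
Each of the three routed-expert tensors above is 3-D, with the experts stacked along the last dimension (ne = {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down). With the standard ggml layout, one expert's weights are an nb[2]-strided slice of the tensor, which is presumably the kind of per-expert view the cache-enabled buffer type hands back through extra->get_expert_param() later in this diff. A minimal sketch for a contiguous tensor:

    // Pointer to the start of a single expert's weights inside a stacked 3-D expert tensor.
    // Assumes the tensor lives contiguously in an ordinary buffer; the MoE cache buffer type
    // may place experts elsewhere and expose them through its tensor 'extra' instead.
    static void * expert_slice(const struct ggml_tensor * t, int expert_id) {
        GGML_ASSERT(expert_id >= 0 && expert_id < (int) t->ne[2]);
        return (char *) t->data + (size_t) expert_id * t->nb[2];
    }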
@@ -8424,6 +8434,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
     }

+    model.moe_cache_params->num_layer            = model.hparams.n_layer - model.hparams.n_layer_dense_lead;
+    model.moe_cache_params->num_expert           = model.hparams.n_expert;
+    model.moe_cache_params->num_expert_per_token = model.hparams.n_expert_used;
+
     llm_load_print_meta(ml, model);

     if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
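
As a concrete example of the values being recorded here, a DeepSeek-V2-Lite-style configuration (hypothetical numbers, not taken from this diff) would resolve to:

    // hypothetical hparams: n_layer = 27, n_layer_dense_lead = 1, n_expert = 64, n_expert_used = 6
    // num_layer            = 27 - 1 = 26   // only the MoE layers are tracked by the cache
    // num_expert           = 64
    // num_expert_per_token = 6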
@@ -8792,7 +8806,8 @@ struct ggml_tensor * llm_build_custom_hook(
     result->src[0] = a;

     ggml_set_op_params(result, &lctx.model.cache_ctx, sizeof(lctx.model.cache_ctx)); // 8bytes, 2 x 32bit int
-    ggml_set_op_params_i32(result, 2, il);
+    auto moe_layer_id = llama_model_layer_to_internal_moe_layer(&lctx.model, il);
+    ggml_set_op_params_i32(result, 2, moe_layer_id);
     ggml_set_op_params_i32(result, 3, hook_op);
     ggml_set_op_params_i32(result, 4, n_outputs);

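
On the consuming side, a backend that executes this hook node would unpack the same op_params layout: the cache_ctx pointer fills the first eight bytes (two 32-bit slots, as the comment notes), so the integer parameters start at index 2. A sketch of the read-back, assuming the internal ggml_get_op_params_i32 helper is available (an equivalent memcpy from node->op_params works as well):

    // Sketch: recover the parameters packed by llm_build_custom_hook from the graph node.
    sparse_llm_cache_context * cache_ctx = nullptr;
    memcpy(&cache_ctx, node->op_params, sizeof(cache_ctx));        // slots 0-1: 8-byte context pointer
    const int32_t moe_layer_id = ggml_get_op_params_i32(node, 2);  // MoE-internal layer id
    const int32_t hook_op      = ggml_get_op_params_i32(node, 3);  // hook type
    const int32_t n_outputs    = ggml_get_op_params_i32(node, 4);  // number of outputs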
@@ -8860,8 +8875,9 @@ static struct ggml_tensor * llm_build_moe_ffn(
     }
     ggml_tensor * experts = ggml_mul_mat_id_by_id(ctx, gate_exps, up_exps, down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]

+    auto moe_layer_id = llama_model_layer_to_internal_moe_layer(&lctx.model, il);
     ggml_set_op_params(experts, &lctx.model.cache_ctx, sizeof(lctx.model.cache_ctx)); // 8bytes, 2 x 32bit int
-    ggml_set_op_params_i32(experts, 2, il);
+    ggml_set_op_params_i32(experts, 2, moe_layer_id);
     // ggml_set_op_params_i32(experts, 3, hook_op);

     experts = ggml_mul(ctx, experts, weights);
@@ -13623,6 +13639,10 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

+            if ((uint32_t) il >= hparams.n_layer_dense_lead) {
+                // this is a moe layer
+                inpL = llm_build_custom_hook(ctx0, lctx, inpL, il, n_outputs, SPARSE_CACHE_HOOK_TYPE_LAYER_LOGITS);
+            }
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm, NULL,
@@ -13802,6 +13822,8 @@ struct llm_build_context {
             inpL = cur;
         }

+        inpL = llm_build_custom_hook(ctx0, lctx, inpL, n_layer, n_outputs, SPARSE_CACHE_HOOK_TYPE_LAYER_LOGITS);
+
         cur = inpL;

         cur = llm_build_norm(ctx0, cur, hparams,
@@ -20409,7 +20431,7 @@ void register_expert_params(llama_model* model) {
     auto capture_func = [cache_ctx, model](ggml_tensor * moe_param, std::string name, int layer_id, int param_idx) {
         auto num_experts = moe_param->ne[2];
         auto moe_layer_id = llama_model_layer_to_internal_moe_layer(model, layer_id);
-        ggml_tensor_extra_split_expert * extra = (ggml_tensor_extra_split_expert *) moe_param->extra;
+        ggml_tensor_extra_moe_cache * extra = (ggml_tensor_extra_moe_cache *) moe_param->extra;
         for (int expert_id = 0; expert_id < num_experts; expert_id++) {
             cache_ctx->register_expert_param(moe_layer_id, expert_id, name, moe_param, extra->get_expert_param(expert_id));
         }
@@ -20427,7 +20449,7 @@ void register_expert_params(llama_model* model) {
         capture_func(layer.ffn_down_exps, "down", layer_id, 2);
     }
 }
-int32_t llama_model_layer_to_internal_moe_layer(llama_model *model, int32_t layer) {
+int32_t llama_model_layer_to_internal_moe_layer(const llama_model *model, int32_t layer) {
     switch (model->arch) {
         case llm_arch::LLM_ARCH_QWEN2MOE:
             return layer;
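
The case visible here returns the graph layer index unchanged, which matches Qwen2-MoE where every layer is an MoE layer. For an architecture with dense lead layers (the DeepSeek-V2-style graph this diff hooks into), the mapping would presumably shift past the dense prefix so that the first MoE layer maps to MoE-internal layer 0. A sketch of such a case, which is an assumption and not part of this hunk:

    // Sketch only: a hypothetical case for an arch with n_layer_dense_lead dense layers
    // before the first MoE layer.
    case llm_arch::LLM_ARCH_DEEPSEEK2:
        return layer - (int32_t) model->hparams.n_layer_dense_lead;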