@@ -3102,6 +3102,65 @@ def prepare_tensors(self):
            if len(experts) > 0:
                raise ValueError(f"Unprocessed experts: {experts}")

+@Model.register("DeepseekForCausalLM")
+class DeepseekMoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEKMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+

@Model.register("DeepseekV2ForCausalLM")
class DeepseekV2Model(Model):
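
For context on set_gguf_parameters() in the patch: the four hyperparameters are read straight from the checkpoint's Hugging Face config (self.hparams) and written out as GGUF MoE metadata. A minimal sketch of that mapping follows; the values are placeholders, not taken from any real DeepSeek checkpoint.

# Sketch only -- the keys are the ones the converter reads; the values are
# placeholders rather than numbers from an actual DeepSeek config.json.
hparams = {
    "first_k_dense_replace": 1,     # leading blocks that keep a dense FFN (no MoE)
    "moe_intermediate_size": 1408,  # hidden size of each routed expert's FFN
    "n_routed_experts": 64,         # routed experts per MoE layer
    "n_shared_experts": 2,          # shared experts that are always active
}
# corresponding gguf_writer calls made by the class, one per key:
#   add_leading_dense_block_count(hparams["first_k_dense_replace"])
#   add_expert_feed_forward_length(hparams["moe_intermediate_size"])
#   add_expert_count(hparams["n_routed_experts"])
#   add_expert_shared_count(hparams["n_shared_experts"])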
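
A standalone sketch of the expert-merging step in modify_tensors(): per-expert projection weights are collected per block and, once all n_experts * 3 of them have arrived, stacked into one 3D tensor per projection. Tensor shapes below are made up purely for illustration.

import torch

n_experts = 4
bid = 0  # hypothetical block id
# fake per-expert weights, keyed the same way the converter keys them
weights = {
    f"model.layers.{bid}.mlp.experts.{xid}.{w}.weight": torch.randn(8, 16)
    for xid in range(n_experts)
    for w in ("down_proj", "gate_proj", "up_proj")
}

for w in ("down_proj", "gate_proj", "up_proj"):
    stacked = torch.stack(
        [weights[f"model.layers.{bid}.mlp.experts.{xid}.{w}.weight"] for xid in range(n_experts)],
        dim=0,
    )
    # one (n_experts, rows, cols) tensor per projection, e.g. torch.Size([4, 8, 16])
    print(w, stacked.shape)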