
Commit e146c30

support deepseek moe model
1 parent e8d247b commit e146c30

File tree

3 files changed: +354 -0 lines changed


convert_hf_to_gguf.py

Lines changed: 59 additions & 0 deletions
@@ -3102,6 +3102,65 @@ def prepare_tensors(self):
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
 
+@Model.register("DeepseekForCausalLM")
+class DeepseekMoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEKMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["n_routed_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
 
 @Model.register("DeepseekV2ForCausalLM")
 class DeepseekV2Model(Model):
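
For reference, the expert merge in modify_tensors() above collects the per-expert 2D matrices that the HF checkpoint stores as model.layers.{bid}.mlp.experts.{xid}.{down,gate,up}_proj.weight, returns an empty list until all n_experts * 3 projections for a block have been seen, and then stacks each projection into a single 3D tensor so GGUF gets one merged expert tensor per layer. A minimal standalone sketch of that stacking step, using hypothetical shapes and a made-up expert count rather than values from a real DeepSeek config:

# Standalone sketch of the expert-stacking idea (illustrative sizes only).
import torch

n_experts = 4        # hypothetical; real configs carry this as "n_routed_experts"
hidden, ffn = 8, 16  # hypothetical hidden size / moe_intermediate_size

# per-expert weights as stored in the HF checkpoint: one 2D matrix per expert
weights = {
    f"model.layers.0.mlp.experts.{xid}.gate_proj.weight": torch.randn(ffn, hidden)
    for xid in range(n_experts)
}

# stack experts 0..n-1 along a new leading dimension -> one 3D tensor per projection
merged = torch.stack(
    [weights[f"model.layers.0.mlp.experts.{xid}.gate_proj.weight"] for xid in range(n_experts)],
    dim=0,
)
print(merged.shape)  # torch.Size([4, 16, 8]) == (n_experts, moe_intermediate_size, hidden)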

gguf-py/gguf/constants.py

Lines changed: 23 additions & 0 deletions
@@ -221,6 +221,7 @@ class MODEL_ARCH(IntEnum):
     T5ENCODER = auto()
     JAIS = auto()
     NEMOTRON = auto()
+    DEEPSEEKMOE = auto()
     EXAONE = auto()
@@ -351,6 +352,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.T5ENCODER: "t5encoder",
     MODEL_ARCH.JAIS: "jais",
     MODEL_ARCH.NEMOTRON: "nemotron",
+    MODEL_ARCH.DEEPSEEKMOE: "deepseek-moe",
     MODEL_ARCH.EXAONE: "exaone",
 }
 
@@ -957,6 +959,27 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.DEEPSEEKMOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
     MODEL_ARCH.DEEPSEEK2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
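
The MODEL_TENSORS entry above only declares which tensor kinds the new architecture uses; the concrete GGUF tensor names come from the TENSOR_NAMES table in the same module. A rough sketch of how the new entry could be inspected once this patch is applied, assuming the gguf-py package from this tree is importable (block index 0 is used only for display):

# Sketch: print the GGUF tensor names implied by the new DEEPSEEKMOE entry.
# Assumes gguf-py from this repository (with this patch) is on the Python path.
from gguf.constants import MODEL_ARCH, MODEL_TENSORS, TENSOR_NAMES

for t in MODEL_TENSORS[MODEL_ARCH.DEEPSEEKMOE]:
    # per-block tensor names contain a "{bid}" placeholder; fill it with block 0
    print(TENSOR_NAMES[t].format(bid=0))

The routed-expert entries should resolve to the merged blk.0.ffn_*_exps names, i.e. the same 3D tensors that the converter's modify_tensors() produces, while the *_shexp entries cover DeepSeek's shared experts.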
