Skip to content

Commit 2dd8944

Browse files
committed
hacked together mixtral-moe conversion script
1 parent bcc0eb4 commit 2dd8944

File tree

3 files changed

+187
-3
lines changed

3 files changed

+187
-3
lines changed

convert.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939

4040
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
4141

42-
ARCH = gguf.MODEL_ARCH.LLAMA
42+
ARCH = gguf.MODEL_ARCH.MOE
4343

4444
DEFAULT_CONCURRENCY = 8
4545
#
@@ -241,7 +241,7 @@ def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
241241
n_ff = config["intermediate_size"],
242242
n_head = (n_head := config["num_attention_heads"]),
243243
n_head_kv = config.get("num_key_value_heads", n_head),
244-
f_norm_eps = config["rms_norm_eps"],
244+
f_norm_eps = config["norm_eps"],
245245
f_rope_freq_base = config.get("rope_theta"),
246246
rope_scaling_type = rope_scaling_type,
247247
f_rope_scale = f_rope_scale,
@@ -271,7 +271,7 @@ def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
271271
n_embd = config["dim"],
272272
n_layer = config["n_layers"],
273273
n_ctx = n_ctx,
274-
n_ff = model["layers.0.feed_forward.w1.weight"].shape[0],
274+
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0],
275275
n_head = (n_head := config["n_heads"]),
276276
n_head_kv = config.get("n_kv_heads", n_head),
277277
f_norm_eps = config["norm_eps"],

gguf-py/gguf/constants.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ class MODEL_ARCH(IntEnum):
9393
BLOOM = auto()
9494
STABLELM = auto()
9595
QWEN = auto()
96+
MOE = auto()
9697

9798

9899
class MODEL_TENSOR(IntEnum):
@@ -117,6 +118,30 @@ class MODEL_TENSOR(IntEnum):
117118
FFN_NORM = auto()
118119
ATTN_Q_NORM = auto()
119120
ATTN_K_NORM = auto()
121+
FFN_EXPERT_0_W1 = auto()
122+
FFN_EXPERT_0_W2 = auto()
123+
FFN_EXPERT_0_W3 = auto()
124+
FFN_EXPERT_1_W1 = auto()
125+
FFN_EXPERT_1_W2 = auto()
126+
FFN_EXPERT_1_W3 = auto()
127+
FFN_EXPERT_2_W1 = auto()
128+
FFN_EXPERT_2_W2 = auto()
129+
FFN_EXPERT_2_W3 = auto()
130+
FFN_EXPERT_3_W1 = auto()
131+
FFN_EXPERT_3_W2 = auto()
132+
FFN_EXPERT_3_W3 = auto()
133+
FFN_EXPERT_4_W1 = auto()
134+
FFN_EXPERT_4_W2 = auto()
135+
FFN_EXPERT_4_W3 = auto()
136+
FFN_EXPERT_5_W1 = auto()
137+
FFN_EXPERT_5_W2 = auto()
138+
FFN_EXPERT_5_W3 = auto()
139+
FFN_EXPERT_6_W1 = auto()
140+
FFN_EXPERT_6_W2 = auto()
141+
FFN_EXPERT_6_W3 = auto()
142+
FFN_EXPERT_7_W1 = auto()
143+
FFN_EXPERT_7_W2 = auto()
144+
FFN_EXPERT_7_W3 = auto()
120145

121146

122147
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -134,6 +159,7 @@ class MODEL_TENSOR(IntEnum):
134159
MODEL_ARCH.BLOOM: "bloom",
135160
MODEL_ARCH.STABLELM: "stablelm",
136161
MODEL_ARCH.QWEN: "qwen",
162+
MODEL_ARCH.MOE: "moe",
137163
}
138164

139165
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -158,6 +184,30 @@ class MODEL_TENSOR(IntEnum):
158184
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
159185
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
160186
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
187+
MODEL_TENSOR.FFN_EXPERT_0_W1: "layers.{bid}.feed_forward.experts.0.w1",
188+
MODEL_TENSOR.FFN_EXPERT_0_W2: "layers.{bid}.feed_forward.experts.0.w2",
189+
MODEL_TENSOR.FFN_EXPERT_0_W3: "layers.{bid}.feed_forward.experts.0.w3",
190+
MODEL_TENSOR.FFN_EXPERT_1_W1: "layers.{bid}.feed_forward.experts.1.w1",
191+
MODEL_TENSOR.FFN_EXPERT_1_W2: "layers.{bid}.feed_forward.experts.1.w2",
192+
MODEL_TENSOR.FFN_EXPERT_1_W3: "layers.{bid}.feed_forward.experts.1.w3",
193+
MODEL_TENSOR.FFN_EXPERT_2_W1: "layers.{bid}.feed_forward.experts.2.w1",
194+
MODEL_TENSOR.FFN_EXPERT_2_W2: "layers.{bid}.feed_forward.experts.2.w2",
195+
MODEL_TENSOR.FFN_EXPERT_2_W3: "layers.{bid}.feed_forward.experts.2.w3",
196+
MODEL_TENSOR.FFN_EXPERT_3_W1: "layers.{bid}.feed_forward.experts.3.w1",
197+
MODEL_TENSOR.FFN_EXPERT_3_W2: "layers.{bid}.feed_forward.experts.3.w2",
198+
MODEL_TENSOR.FFN_EXPERT_3_W3: "layers.{bid}.feed_forward.experts.3.w3",
199+
MODEL_TENSOR.FFN_EXPERT_4_W1: "layers.{bid}.feed_forward.experts.4.w1",
200+
MODEL_TENSOR.FFN_EXPERT_4_W2: "layers.{bid}.feed_forward.experts.4.w2",
201+
MODEL_TENSOR.FFN_EXPERT_4_W3: "layers.{bid}.feed_forward.experts.4.w3",
202+
MODEL_TENSOR.FFN_EXPERT_5_W1: "layers.{bid}.feed_forward.experts.5.w1",
203+
MODEL_TENSOR.FFN_EXPERT_5_W2: "layers.{bid}.feed_forward.experts.5.w2",
204+
MODEL_TENSOR.FFN_EXPERT_5_W3: "layers.{bid}.feed_forward.experts.5.w3",
205+
MODEL_TENSOR.FFN_EXPERT_6_W1: "layers.{bid}.feed_forward.experts.6.w1",
206+
MODEL_TENSOR.FFN_EXPERT_6_W2: "layers.{bid}.feed_forward.experts.6.w2",
207+
MODEL_TENSOR.FFN_EXPERT_6_W3: "layers.{bid}.feed_forward.experts.6.w3",
208+
MODEL_TENSOR.FFN_EXPERT_7_W1: "layers.{bid}.feed_forward.experts.7.w1",
209+
MODEL_TENSOR.FFN_EXPERT_7_W2: "layers.{bid}.feed_forward.experts.7.w2",
210+
MODEL_TENSOR.FFN_EXPERT_7_W3: "layers.{bid}.feed_forward.experts.7.w3",
161211
}
162212

163213
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -333,6 +383,42 @@ class MODEL_TENSOR(IntEnum):
333383
MODEL_TENSOR.FFN_DOWN,
334384
MODEL_TENSOR.FFN_UP,
335385
],
386+
MODEL_ARCH.MOE: [
387+
MODEL_TENSOR.TOKEN_EMBD,
388+
MODEL_TENSOR.OUTPUT_NORM,
389+
MODEL_TENSOR.OUTPUT,
390+
MODEL_TENSOR.ATTN_NORM,
391+
MODEL_TENSOR.ATTN_Q,
392+
MODEL_TENSOR.ATTN_K,
393+
MODEL_TENSOR.ATTN_V,
394+
MODEL_TENSOR.ATTN_OUT,
395+
MODEL_TENSOR.FFN_NORM,
396+
MODEL_TENSOR.FFN_GATE,
397+
MODEL_TENSOR.FFN_EXPERT_0_W1,
398+
MODEL_TENSOR.FFN_EXPERT_0_W2,
399+
MODEL_TENSOR.FFN_EXPERT_0_W3,
400+
MODEL_TENSOR.FFN_EXPERT_1_W1,
401+
MODEL_TENSOR.FFN_EXPERT_1_W2,
402+
MODEL_TENSOR.FFN_EXPERT_1_W3,
403+
MODEL_TENSOR.FFN_EXPERT_2_W1,
404+
MODEL_TENSOR.FFN_EXPERT_2_W2,
405+
MODEL_TENSOR.FFN_EXPERT_2_W3,
406+
MODEL_TENSOR.FFN_EXPERT_3_W1,
407+
MODEL_TENSOR.FFN_EXPERT_3_W2,
408+
MODEL_TENSOR.FFN_EXPERT_3_W3,
409+
MODEL_TENSOR.FFN_EXPERT_4_W1,
410+
MODEL_TENSOR.FFN_EXPERT_4_W2,
411+
MODEL_TENSOR.FFN_EXPERT_4_W3,
412+
MODEL_TENSOR.FFN_EXPERT_5_W1,
413+
MODEL_TENSOR.FFN_EXPERT_5_W2,
414+
MODEL_TENSOR.FFN_EXPERT_5_W3,
415+
MODEL_TENSOR.FFN_EXPERT_6_W1,
416+
MODEL_TENSOR.FFN_EXPERT_6_W2,
417+
MODEL_TENSOR.FFN_EXPERT_6_W3,
418+
MODEL_TENSOR.FFN_EXPERT_7_W1,
419+
MODEL_TENSOR.FFN_EXPERT_7_W2,
420+
MODEL_TENSOR.FFN_EXPERT_7_W3,
421+
],
336422
MODEL_ARCH.GPT2: [
337423
# TODO
338424
],

gguf-py/gguf/tensor_mapping.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ class TensorNameMap:
169169
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
170170
"layers.{bid}.feed_forward.w1", # llama-pth
171171
"transformer.h.{bid}.mlp.w2", # qwen
172+
"layers.{bid}.feed_forward.gate" # moe
172173
),
173174

174175
# Feed-forward down
@@ -196,6 +197,102 @@ class TensorNameMap:
196197
MODEL_TENSOR.ROPE_FREQS: (
197198
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
198199
),
200+
201+
MODEL_TENSOR.FFN_EXPERT_0_W1: (
202+
"layers.{bid}.feed_forward.experts.0.w1",
203+
),
204+
205+
MODEL_TENSOR.FFN_EXPERT_0_W2: (
206+
"layers.{bid}.feed_forward.experts.0.w2",
207+
),
208+
209+
MODEL_TENSOR.FFN_EXPERT_0_W3: (
210+
"layers.{bid}.feed_forward.experts.0.w3",
211+
),
212+
213+
MODEL_TENSOR.FFN_EXPERT_1_W1: (
214+
"layers.{bid}.feed_forward.experts.1.w1",
215+
),
216+
217+
MODEL_TENSOR.FFN_EXPERT_1_W2: (
218+
"layers.{bid}.feed_forward.experts.1.w2",
219+
),
220+
221+
MODEL_TENSOR.FFN_EXPERT_1_W3: (
222+
"layers.{bid}.feed_forward.experts.1.w3",
223+
),
224+
225+
MODEL_TENSOR.FFN_EXPERT_2_W1: (
226+
"layers.{bid}.feed_forward.experts.2.w1",
227+
),
228+
229+
MODEL_TENSOR.FFN_EXPERT_2_W2: (
230+
"layers.{bid}.feed_forward.experts.2.w2",
231+
),
232+
233+
MODEL_TENSOR.FFN_EXPERT_2_W3: (
234+
"layers.{bid}.feed_forward.experts.2.w3",
235+
),
236+
237+
MODEL_TENSOR.FFN_EXPERT_3_W1: (
238+
"layers.{bid}.feed_forward.experts.3.w1",
239+
),
240+
241+
MODEL_TENSOR.FFN_EXPERT_3_W2: (
242+
"layers.{bid}.feed_forward.experts.3.w2",
243+
),
244+
245+
MODEL_TENSOR.FFN_EXPERT_3_W3: (
246+
"layers.{bid}.feed_forward.experts.3.w3",
247+
),
248+
249+
MODEL_TENSOR.FFN_EXPERT_4_W1: (
250+
"layers.{bid}.feed_forward.experts.4.w1",
251+
),
252+
253+
MODEL_TENSOR.FFN_EXPERT_4_W2: (
254+
"layers.{bid}.feed_forward.experts.4.w2",
255+
),
256+
257+
MODEL_TENSOR.FFN_EXPERT_4_W3: (
258+
"layers.{bid}.feed_forward.experts.4.w3",
259+
),
260+
261+
MODEL_TENSOR.FFN_EXPERT_5_W1: (
262+
"layers.{bid}.feed_forward.experts.5.w1",
263+
),
264+
265+
MODEL_TENSOR.FFN_EXPERT_5_W2: (
266+
"layers.{bid}.feed_forward.experts.5.w2",
267+
),
268+
269+
MODEL_TENSOR.FFN_EXPERT_5_W3: (
270+
"layers.{bid}.feed_forward.experts.5.w3",
271+
),
272+
273+
MODEL_TENSOR.FFN_EXPERT_6_W1: (
274+
"layers.{bid}.feed_forward.experts.6.w1",
275+
),
276+
277+
MODEL_TENSOR.FFN_EXPERT_6_W2: (
278+
"layers.{bid}.feed_forward.experts.6.w2",
279+
),
280+
281+
MODEL_TENSOR.FFN_EXPERT_6_W3: (
282+
"layers.{bid}.feed_forward.experts.6.w3",
283+
),
284+
285+
MODEL_TENSOR.FFN_EXPERT_7_W1: (
286+
"layers.{bid}.feed_forward.experts.7.w1",
287+
),
288+
289+
MODEL_TENSOR.FFN_EXPERT_7_W2: (
290+
"layers.{bid}.feed_forward.experts.7.w2",
291+
),
292+
293+
MODEL_TENSOR.FFN_EXPERT_7_W3: (
294+
"layers.{bid}.feed_forward.experts.7.w3",
295+
),
199296
}
200297

201298
mapping: dict[str, tuple[MODEL_TENSOR, str]]
@@ -211,6 +308,7 @@ def __init__(self, arch: MODEL_ARCH, n_blocks: int):
211308
self.mapping[key] = (tensor, tensor_name)
212309
for bid in range(n_blocks):
213310
for tensor, keys in self.block_mappings_cfg.items():
311+
print(tensor, keys)
214312
if tensor not in MODEL_TENSORS[arch]:
215313
continue
216314
tensor_name = TENSOR_NAMES[tensor].format(bid = bid)

0 commit comments

Comments
 (0)