@@ -93,6 +93,7 @@ class MODEL_ARCH(IntEnum):
93
93
BLOOM = auto ()
94
94
STABLELM = auto ()
95
95
QWEN = auto ()
96
+ MOE = auto ()
96
97
97
98
98
99
class MODEL_TENSOR (IntEnum ):
@@ -117,6 +118,30 @@ class MODEL_TENSOR(IntEnum):
117
118
FFN_NORM = auto ()
118
119
ATTN_Q_NORM = auto ()
119
120
ATTN_K_NORM = auto ()
121
+ FFN_EXPERT_0_W1 = auto ()
122
+ FFN_EXPERT_0_W2 = auto ()
123
+ FFN_EXPERT_0_W3 = auto ()
124
+ FFN_EXPERT_1_W1 = auto ()
125
+ FFN_EXPERT_1_W2 = auto ()
126
+ FFN_EXPERT_1_W3 = auto ()
127
+ FFN_EXPERT_2_W1 = auto ()
128
+ FFN_EXPERT_2_W2 = auto ()
129
+ FFN_EXPERT_2_W3 = auto ()
130
+ FFN_EXPERT_3_W1 = auto ()
131
+ FFN_EXPERT_3_W2 = auto ()
132
+ FFN_EXPERT_3_W3 = auto ()
133
+ FFN_EXPERT_4_W1 = auto ()
134
+ FFN_EXPERT_4_W2 = auto ()
135
+ FFN_EXPERT_4_W3 = auto ()
136
+ FFN_EXPERT_5_W1 = auto ()
137
+ FFN_EXPERT_5_W2 = auto ()
138
+ FFN_EXPERT_5_W3 = auto ()
139
+ FFN_EXPERT_6_W1 = auto ()
140
+ FFN_EXPERT_6_W2 = auto ()
141
+ FFN_EXPERT_6_W3 = auto ()
142
+ FFN_EXPERT_7_W1 = auto ()
143
+ FFN_EXPERT_7_W2 = auto ()
144
+ FFN_EXPERT_7_W3 = auto ()
120
145
121
146
122
147
MODEL_ARCH_NAMES : dict [MODEL_ARCH , str ] = {
@@ -134,6 +159,7 @@ class MODEL_TENSOR(IntEnum):
134
159
MODEL_ARCH .BLOOM : "bloom" ,
135
160
MODEL_ARCH .STABLELM : "stablelm" ,
136
161
MODEL_ARCH .QWEN : "qwen" ,
162
+ MODEL_ARCH .MOE : "moe" ,
137
163
}
138
164
139
165
TENSOR_NAMES : dict [MODEL_TENSOR , str ] = {
@@ -158,6 +184,30 @@ class MODEL_TENSOR(IntEnum):
158
184
MODEL_TENSOR .FFN_GATE : "blk.{bid}.ffn_gate" ,
159
185
MODEL_TENSOR .FFN_DOWN : "blk.{bid}.ffn_down" ,
160
186
MODEL_TENSOR .FFN_UP : "blk.{bid}.ffn_up" ,
187
+ MODEL_TENSOR .FFN_EXPERT_0_W1 : "layers.{bid}.feed_forward.experts.0.w1" ,
188
+ MODEL_TENSOR .FFN_EXPERT_0_W2 : "layers.{bid}.feed_forward.experts.0.w2" ,
189
+ MODEL_TENSOR .FFN_EXPERT_0_W3 : "layers.{bid}.feed_forward.experts.0.w3" ,
190
+ MODEL_TENSOR .FFN_EXPERT_1_W1 : "layers.{bid}.feed_forward.experts.1.w1" ,
191
+ MODEL_TENSOR .FFN_EXPERT_1_W2 : "layers.{bid}.feed_forward.experts.1.w2" ,
192
+ MODEL_TENSOR .FFN_EXPERT_1_W3 : "layers.{bid}.feed_forward.experts.1.w3" ,
193
+ MODEL_TENSOR .FFN_EXPERT_2_W1 : "layers.{bid}.feed_forward.experts.2.w1" ,
194
+ MODEL_TENSOR .FFN_EXPERT_2_W2 : "layers.{bid}.feed_forward.experts.2.w2" ,
195
+ MODEL_TENSOR .FFN_EXPERT_2_W3 : "layers.{bid}.feed_forward.experts.2.w3" ,
196
+ MODEL_TENSOR .FFN_EXPERT_3_W1 : "layers.{bid}.feed_forward.experts.3.w1" ,
197
+ MODEL_TENSOR .FFN_EXPERT_3_W2 : "layers.{bid}.feed_forward.experts.3.w2" ,
198
+ MODEL_TENSOR .FFN_EXPERT_3_W3 : "layers.{bid}.feed_forward.experts.3.w3" ,
199
+ MODEL_TENSOR .FFN_EXPERT_4_W1 : "layers.{bid}.feed_forward.experts.4.w1" ,
200
+ MODEL_TENSOR .FFN_EXPERT_4_W2 : "layers.{bid}.feed_forward.experts.4.w2" ,
201
+ MODEL_TENSOR .FFN_EXPERT_4_W3 : "layers.{bid}.feed_forward.experts.4.w3" ,
202
+ MODEL_TENSOR .FFN_EXPERT_5_W1 : "layers.{bid}.feed_forward.experts.5.w1" ,
203
+ MODEL_TENSOR .FFN_EXPERT_5_W2 : "layers.{bid}.feed_forward.experts.5.w2" ,
204
+ MODEL_TENSOR .FFN_EXPERT_5_W3 : "layers.{bid}.feed_forward.experts.5.w3" ,
205
+ MODEL_TENSOR .FFN_EXPERT_6_W1 : "layers.{bid}.feed_forward.experts.6.w1" ,
206
+ MODEL_TENSOR .FFN_EXPERT_6_W2 : "layers.{bid}.feed_forward.experts.6.w2" ,
207
+ MODEL_TENSOR .FFN_EXPERT_6_W3 : "layers.{bid}.feed_forward.experts.6.w3" ,
208
+ MODEL_TENSOR .FFN_EXPERT_7_W1 : "layers.{bid}.feed_forward.experts.7.w1" ,
209
+ MODEL_TENSOR .FFN_EXPERT_7_W2 : "layers.{bid}.feed_forward.experts.7.w2" ,
210
+ MODEL_TENSOR .FFN_EXPERT_7_W3 : "layers.{bid}.feed_forward.experts.7.w3" ,
161
211
}
162
212
163
213
MODEL_TENSORS : dict [MODEL_ARCH , list [MODEL_TENSOR ]] = {
@@ -333,6 +383,42 @@ class MODEL_TENSOR(IntEnum):
333
383
MODEL_TENSOR .FFN_DOWN ,
334
384
MODEL_TENSOR .FFN_UP ,
335
385
],
386
+ MODEL_ARCH .MOE : [
387
+ MODEL_TENSOR .TOKEN_EMBD ,
388
+ MODEL_TENSOR .OUTPUT_NORM ,
389
+ MODEL_TENSOR .OUTPUT ,
390
+ MODEL_TENSOR .ATTN_NORM ,
391
+ MODEL_TENSOR .ATTN_Q ,
392
+ MODEL_TENSOR .ATTN_K ,
393
+ MODEL_TENSOR .ATTN_V ,
394
+ MODEL_TENSOR .ATTN_OUT ,
395
+ MODEL_TENSOR .FFN_NORM ,
396
+ MODEL_TENSOR .FFN_GATE ,
397
+ MODEL_TENSOR .FFN_EXPERT_0_W1 ,
398
+ MODEL_TENSOR .FFN_EXPERT_0_W2 ,
399
+ MODEL_TENSOR .FFN_EXPERT_0_W3 ,
400
+ MODEL_TENSOR .FFN_EXPERT_1_W1 ,
401
+ MODEL_TENSOR .FFN_EXPERT_1_W2 ,
402
+ MODEL_TENSOR .FFN_EXPERT_1_W3 ,
403
+ MODEL_TENSOR .FFN_EXPERT_2_W1 ,
404
+ MODEL_TENSOR .FFN_EXPERT_2_W2 ,
405
+ MODEL_TENSOR .FFN_EXPERT_2_W3 ,
406
+ MODEL_TENSOR .FFN_EXPERT_3_W1 ,
407
+ MODEL_TENSOR .FFN_EXPERT_3_W2 ,
408
+ MODEL_TENSOR .FFN_EXPERT_3_W3 ,
409
+ MODEL_TENSOR .FFN_EXPERT_4_W1 ,
410
+ MODEL_TENSOR .FFN_EXPERT_4_W2 ,
411
+ MODEL_TENSOR .FFN_EXPERT_4_W3 ,
412
+ MODEL_TENSOR .FFN_EXPERT_5_W1 ,
413
+ MODEL_TENSOR .FFN_EXPERT_5_W2 ,
414
+ MODEL_TENSOR .FFN_EXPERT_5_W3 ,
415
+ MODEL_TENSOR .FFN_EXPERT_6_W1 ,
416
+ MODEL_TENSOR .FFN_EXPERT_6_W2 ,
417
+ MODEL_TENSOR .FFN_EXPERT_6_W3 ,
418
+ MODEL_TENSOR .FFN_EXPERT_7_W1 ,
419
+ MODEL_TENSOR .FFN_EXPERT_7_W2 ,
420
+ MODEL_TENSOR .FFN_EXPERT_7_W3 ,
421
+ ],
336
422
MODEL_ARCH .GPT2 : [
337
423
# TODO
338
424
],
0 commit comments