@@ -62,6 +62,7 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian:
    def model_arch(self) -> gguf.MODEL_ARCH:
        pass

+    # TODO: add "default" argument
    def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
        key = next((k for k in keys if k in self.hparams), None)
        if key is not None:
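
The new TODO suggests giving find_hparam a "default" argument. A minimal sketch of how such an argument could behave (hypothetical, not part of this diff), written as a standalone helper over an hparams dict:

from typing import Any, Sequence

# Hypothetical variant of find_hparam with a caller-supplied default
# (illustration only; the real method lives on the Model class).
def find_hparam(hparams: dict, keys: Sequence[str], optional: bool = False, default: Any = None) -> Any:
    key = next((k for k in keys if k in hparams), None)
    if key is not None:
        return hparams[key]
    if optional:
        return default  # fall back to the caller-supplied default
    raise KeyError(f"could not find any of: {keys}")

# usage: find_hparam({"n_ctx": 4096}, ["max_position_embeddings", "n_ctx"], optional=True, default=2048) -> 4096
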
@@ -89,7 +90,12 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
            yield name, data

    def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
+        if (mtype := self.find_hparam(["model_type"], optional=True)) is not None:
+            self.gguf_writer.add_name(mtype)
+            print(f"gguf: model type = {mtype}")
+        else:
+            self.gguf_writer.add_name(self.dir_model.name)
+
        self.gguf_writer.add_block_count(self.block_count)

        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@@ -363,6 +369,13 @@ def _set_vocab_sentencepiece(self):
                    scores.append(-1000.0)
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

+        # pad remaining tokens
+        for i in range(vocab_size - len(tokens)):
+            print(f"gguf: padding token {i}")
+            tokens.append(f"[PAD{i}]")
+            scores.append(-1000.0)
+            toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+
        assert len(tokens) == vocab_size

        self.gguf_writer.add_tokenizer_model("llama")
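
The added padding loop only exists to satisfy the assert that follows it: it fills the token list out to vocab_size with placeholder entries. A standalone illustration of the same idea (toy values, not the converter itself):

# Toy example: pad a token list up to the declared vocab size.
tokens = ["<unk>", "<s>", "</s>"]
scores = [0.0, 0.0, 0.0]
vocab_size = 6

for i in range(vocab_size - len(tokens)):
    tokens.append(f"[PAD{i}]")   # placeholder piece, mirroring the diff's [PAD{i}] naming
    scores.append(-1000.0)       # very low score so the piece is never preferred

assert len(tokens) == vocab_size   # ['<unk>', '<s>', '</s>', '[PAD0]', '[PAD1]', '[PAD2]']
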
@@ -1293,7 +1306,7 @@ def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, l
        self.gguf_writer.add_tensor(new_name, data)


-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
+@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", "Phi3ForCausalLM")
class LlamaModel(Model):
    model_arch = gguf.MODEL_ARCH.LLAMA

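
Registering "Phi3ForCausalLM" on LlamaModel routes Phi-3 checkpoints through the existing Llama conversion path. A generic sketch of the decorator-registry pattern this relies on (illustrative only, not the actual Model.register implementation):

# Illustrative name-to-class registry; the real Model.register may differ.
_model_classes: dict[str, type] = {}

def register(*names: str):
    def wrap(cls: type) -> type:
        for name in names:
            _model_classes[name] = cls   # map each architecture name to its handler class
        return cls
    return wrap

@register("LlamaForCausalLM", "Phi3ForCausalLM")
class LlamaModel:
    pass

assert _model_classes["Phi3ForCausalLM"] is LlamaModel
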
@@ -1322,18 +1335,39 @@ def set_vocab(self):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
+
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])

    # Same as super class, but permuting q_proj, k_proj
    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_embd = self.hparams.get("hidden_size")
        n_head = self.hparams.get("num_attention_heads")
        n_kv_head = self.hparams.get("num_key_value_heads")
        n_experts = self.hparams.get("num_local_experts")
        experts = dict()
-        for name, data_torch in self.get_tensors():
+
+        head_dim = n_embd // n_head
+
+        tensors = dict(self.get_tensors())
+        for i in range(block_count):
+            # Phi-3 transformations
+            # ref: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/8b29aca7bb785d6336fc19819b045bc7bc584b06/modeling_phi3.py#L379-L384
+            if (w := tensors.get(f"model.layers.{i}.self_attn.qkv_proj.weight")) is not None:
+                qpos = n_head * head_dim
+                tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w[:qpos]
+                tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[qpos:qpos + n_kv_head * head_dim]
+                tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[qpos + n_kv_head * head_dim:]
+                del tensors[f"model.layers.{i}.self_attn.qkv_proj.weight"]
+            if (w := tensors.get(f"model.layers.{i}.mlp.gate_up_proj.weight")) is not None:
+                ff_dim = w.shape[0] // 2
+                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
+                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
+                del tensors[f"model.layers.{i}.mlp.gate_up_proj.weight"]
+
+        for name, data_torch in tensors.items():
            # we don't need these
            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
                continue
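
The per-layer loop above slices Phi-3's fused projections back into the separate q/k/v and gate/up tensors that the LLAMA tensor map expects. A small shape check with dummy tensors (hypothetical Phi-3-mini-like sizes, not read from a real checkpoint):

import torch

# Dummy fused weights with assumed sizes: hidden 3072, 32 heads, 32 KV heads, FFN 8192.
n_embd, n_head, n_kv_head, n_ff = 3072, 32, 32, 8192
head_dim = n_embd // n_head

qkv = torch.randn((n_head + 2 * n_kv_head) * head_dim, n_embd)   # stands in for qkv_proj.weight
qpos = n_head * head_dim
q = qkv[:qpos]                               # first n_head * head_dim rows
k = qkv[qpos:qpos + n_kv_head * head_dim]    # next n_kv_head * head_dim rows
v = qkv[qpos + n_kv_head * head_dim:]        # remaining rows

gate_up = torch.randn(2 * n_ff, n_embd)      # stands in for gate_up_proj.weight
ff_dim = gate_up.shape[0] // 2
gate, up = gate_up[:ff_dim], gate_up[ff_dim:]

assert q.shape == (n_head * head_dim, n_embd)
assert k.shape == v.shape == (n_kv_head * head_dim, n_embd)
assert gate.shape == up.shape == (n_ff, n_embd)
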