     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
-from convert import HfVocab
+from convert import LlamaHfVocab
 
 
 ###### MODEL DEFINITIONS ######
@@ -230,7 +230,7 @@ def _get_part_names(self):
     def _set_vocab_gpt2(self):
         dir_model = self.dir_model
         hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
@@ -243,8 +243,7 @@ def _set_vocab_gpt2(self):
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode('utf-8')
-                tokens.append(bytearray(pad_token))
+                tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
@@ -266,7 +265,7 @@ def _set_vocab_gpt2(self):
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
         hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
@@ -291,8 +290,7 @@ def _set_vocab_qwen(self):
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
+                tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
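# Note (not part of the patch): with `tokens` typed as list[str], the pad-filler
# entries can be appended as plain strings; the assumption behind this change is
# that gguf-py's GGUFWriter.add_token_list() also accepts str tokens and handles
# the UTF-8 encoding itself, making the manual .encode()/bytearray() step redundant.
# Minimal sketch of the simplified padding path, reusing the names from the hunks above:
for i in range(vocab_size):
    if i not in reverse_vocab:
        tokens.append(f"[PAD{i}]")  # plain str instead of a UTF-8 bytearray
        toktypes.append(gguf.TokenType.USER_DEFINED)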
@@ -372,12 +370,8 @@ def _set_vocab_sentencepiece(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_hf(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model
-        vocab = HfVocab(
-            path, added_tokens_path if added_tokens_path.exists() else None
-        )
+    def _set_vocab_llama_hf(self):
+        vocab = LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
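# Sketch (not part of the diff): how the renamed helper plausibly continues past
# the context shown above. It is a method on the model class; the continuation
# assumes vocab.all_tokens() yields (text, score, toktype) tuples, as the BERT
# hunk further down suggests, followed by the usual gguf_writer token calls.
def _set_vocab_llama_hf(self):
    vocab = LlamaHfVocab(self.dir_model)
    tokens, scores, toktypes = [], [], []

    # collect token text, score, and type from the HF tokenizer files
    for text, score, toktype in vocab.all_tokens():
        tokens.append(text)
        scores.append(score)
        toktypes.append(toktype)

    assert len(tokens) == vocab.vocab_size

    # write the vocab into the GGUF metadata (mirrors _set_vocab_sentencepiece)
    self.gguf_writer.add_tokenizer_model("llama")
    self.gguf_writer.add_token_list(tokens)
    self.gguf_writer.add_token_scores(scores)
    self.gguf_writer.add_token_types(toktypes)

    special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
    special_vocab.add_to_gguf(self.gguf_writer)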
@@ -1099,7 +1093,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
 
     def set_vocab(self):
-        self._set_vocab_hf()
+        self._set_vocab_llama_hf()
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -1700,11 +1694,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model if self.dir_model.exists() else None
-
         # use huggingface vocab to get all tokens
-        vocab = HfVocab(path, added_tokens_path)
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
         tokens, scores, toktypes = zip(*vocab.all_tokens())
         assert len(tokens) == vocab.vocab_size
         self.vocab_size = vocab.vocab_size