@@ -338,7 +338,12 @@ def _set_vocab_sentencepiece(self):
         tokenizer = SentencePieceProcessor(str(tokenizer_path))
         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
         for token_id in range(tokenizer.vocab_size()):
+
             piece = tokenizer.id_to_piece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.get_score(token_id)
@@ -353,30 +358,24 @@ def _set_vocab_sentencepiece(self):
             elif tokenizer.is_byte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
 
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
 
         added_tokens_file = self.dir_model / 'added_tokens.json'
         if added_tokens_file.is_file():
             with open(added_tokens_file, "r", encoding="utf-8") as f:
                 added_tokens_json = json.load(f)
 
                 for key in added_tokens_json:
-                    key = key.encode("utf-8")
-                    if key not in tokens:
-                        tokens.append(key)
-                        scores.append(-1000.0)
-                        toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
-
-        # pad remaining tokens
-        for i in range(vocab_size - len(tokens)):
-            print(f"gguf: padding token {i}")
-            tokens.append(f"[PAD{i}]")
-            scores.append(-1000.0)
-            toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
-
-        assert len(tokens) == vocab_size
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_token_list(tokens)
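The net effect of the change: instead of appending tokens and padding the lists afterwards, the three vocab lists are pre-filled to `vocab_size` and written by token id, so entries from `added_tokens.json` land at their declared ids and out-of-range ids are skipped rather than growing the lists. Below is a minimal, self-contained sketch of that pattern, not the converter code itself; `FakeSPTokenizer`, `build_vocab`, and the integer token-type constants are invented for illustration.

```python
# Standalone sketch (not converter code) of the index-based vocab fill.
TOKTYPE_UNKNOWN = 2
TOKTYPE_USER_DEFINED = 4


class FakeSPTokenizer:
    """Stand-in for sentencepiece.SentencePieceProcessor with 3 pieces."""
    _pieces = [("<unk>", 0.0), ("hello", -1.5), ("world", -2.0)]

    def vocab_size(self) -> int:
        return len(self._pieces)

    def id_to_piece(self, token_id: int) -> str:
        return self._pieces[token_id][0]

    def get_score(self, token_id: int) -> float:
        return self._pieces[token_id][1]


def build_vocab(tokenizer, vocab_size, added_tokens):
    # Pre-allocate every slot so a token id maps directly to a list index;
    # ids never covered below keep their "[PAD{i}]" placeholder.
    tokens = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
    scores = [-10000.0] * vocab_size
    toktypes = [TOKTYPE_UNKNOWN] * vocab_size

    # Fill the slots defined by the SentencePiece model (the real converter
    # also distinguishes NORMAL/CONTROL/UNUSED/BYTE types here).
    for token_id in range(tokenizer.vocab_size()):
        tokens[token_id] = tokenizer.id_to_piece(token_id).encode("utf-8")
        scores[token_id] = tokenizer.get_score(token_id)

    # Overlay added tokens at their explicit ids, skipping out-of-range ids
    # instead of blindly appending past vocab_size.
    for key, token_id in added_tokens.items():
        if token_id >= vocab_size:
            print(f"ignore token {token_id}: id is out of range, max={vocab_size - 1}")
            continue
        tokens[token_id] = key.encode("utf-8")
        scores[token_id] = -1000.0
        toktypes[token_id] = TOKTYPE_USER_DEFINED

    return tokens, scores, toktypes


if __name__ == "__main__":
    toks, _, _ = build_vocab(FakeSPTokenizer(), 6, {"<pad>": 5, "<bogus>": 9})
    print(toks)  # ids 3 and 4 stay as b"[PAD3]" / b"[PAD4]"; id 9 is ignored
```

One consequence of this layout is that slots never touched by either the SentencePiece model or the added tokens keep a deterministic `[PAD{i}]` placeholder, so the written vocab always has exactly `vocab_size` entries without the separate padding loop and final assert that the old code needed.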