@@ -338,7 +338,12 @@ def _set_vocab_sentencepiece(self):
         tokenizer = SentencePieceProcessor(str(tokenizer_path))
         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
 
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
         for token_id in range(tokenizer.vocab_size()):
+
             piece = tokenizer.id_to_piece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.get_score(token_id)
@@ -353,30 +358,24 @@ def _set_vocab_sentencepiece(self):
             elif tokenizer.is_byte(token_id):
                 toktype = SentencePieceTokenTypes.BYTE
 
-            tokens.append(text)
-            scores.append(score)
-            toktypes.append(toktype)
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
 
         added_tokens_file = self.dir_model / 'added_tokens.json'
         if added_tokens_file.is_file():
             with open(added_tokens_file, "r", encoding="utf-8") as f:
                 added_tokens_json = json.load(f)
 
                 for key in added_tokens_json:
-                    key = key.encode("utf-8")
-                    if key not in tokens:
-                        tokens.append(key)
-                        scores.append(-1000.0)
-                        toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
-
-        # pad remaining tokens
-        for i in range(vocab_size - len(tokens)):
-            print(f"gguf: padding token {i}")
-            tokens.append(f"[PAD{i}]")
-            scores.append(-1000.0)
-            toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
-
-        assert len(tokens) == vocab_size
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_token_list(tokens)
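The patch swaps the old append-then-pad flow for a preallocate-then-overwrite flow: all three lists are created at full `vocab_size` up front (placeholder `[PAD{i}]` entries with `UNKNOWN` type), tokenizer pieces are written into their slots by id, and entries from `added_tokens.json` are also written by id, with out-of-range ids skipped instead of appended. A minimal sketch of that pattern follows; the `pieces` and `added_tokens` inputs and the numeric token-type constants are made-up stand-ins, not the SentencePiece or GGUF API.

```python
# Sketch of the preallocate-then-overwrite pattern from the diff above.
# The inputs are hypothetical; the real code reads them from
# SentencePieceProcessor and added_tokens.json.

vocab_size = 8
pieces = {0: b"<unk>", 1: b"<s>", 2: b"</s>", 3: b"hello"}  # id -> piece text
added_tokens = {"<tool>": 6, "<oops>": 99}                  # name -> id (99 is out of range)

NORMAL, UNKNOWN, USER_DEFINED = 1, 2, 4  # stand-in token-type values

# 1. Preallocate every slot so the lists always end up with exactly vocab_size entries.
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [UNKNOWN] * vocab_size

# 2. Overwrite the slots the tokenizer actually defines, indexed by token id.
for token_id, text in pieces.items():
    tokens[token_id] = text
    scores[token_id] = 0.0
    toktypes[token_id] = NORMAL  # the real code derives the type per token

# 3. Added tokens are written by id too; ids past the end are ignored, not appended.
for name, token_id in added_tokens.items():
    if token_id >= vocab_size:
        print(f"ignore token {token_id}: id is out of range, max={vocab_size - 1}")
        continue
    tokens[token_id] = name.encode("utf-8")
    scores[token_id] = -1000.0
    toktypes[token_id] = USER_DEFINED

# No explicit padding loop or assert is needed: untouched slots keep their
# [PAD{i}] placeholders and the lengths are vocab_size by construction.
assert len(tokens) == len(scores) == len(toktypes) == vocab_size
```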