@@ -357,6 +357,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None:
357357 for tok in self .tokenizer .all_special_tokens
358358 }
359359 self .special_ids : set [int ] = set (self .tokenizer .all_special_ids )
360+ self .reverse_vocab = {id : encoded_tok for encoded_tok , id in self .tokenizer .get_vocab ().items ()}
360361 self .vocab_size_base : int = self .tokenizer .vocab_size
361362 self .vocab_size : int = self .vocab_size_base + len (self .added_tokens_dict )
362363 self .fname_tokenizer : Path = fname_tokenizer
@@ -371,14 +372,13 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None:
371372
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
    """Yield one ``(text, score, token_type)`` triple per base-vocabulary token.

    Ids in ``range(self.vocab_size_base)`` are visited in ascending order.
    Any id that also appears as a value in ``self.added_tokens_dict`` is
    skipped here.

    Yields:
        A tuple of the token text encoded as UTF-8 bytes, the value of
        ``self.get_token_score(token_id)`` and the value of
        ``self.get_token_type(token_id)``.

    Raises:
        KeyError: if a visited id is missing from ``self.reverse_vocab``.
    """
    # Build the set once so the per-id membership test below is O(1).
    added_tokens_ids = set(self.added_tokens_dict.values())

    for token_id in range(self.vocab_size_base):
        if token_id in added_tokens_ids:
            continue

        # reverse_vocab is the id -> token-string map built in __init__.
        text = self.reverse_vocab[token_id].encode("utf-8")
        yield text, self.get_token_score(token_id), self.get_token_type(token_id)
383383
384384 def get_token_type (self , token_id : int ) -> gguf .TokenType :
@@ -394,10 +394,13 @@ def get_token_type(self, token_id: int) -> gguf.TokenType:
394394 if self .spm .is_byte (token_id ):
395395 toktype = gguf .TokenType .BYTE
396396 else :
397+ token = self .reverse_vocab [token_id ]
397398 if token_id == self .unk_token_id :
398399 toktype = gguf .TokenType .UNKNOWN
399- if token_id in self .special_ids :
400+ elif token_id in self .special_ids :
400401 toktype = gguf .TokenType .CONTROL
402+ elif len (token ) == 6 and token .startswith ("<0x" ) and token .endswith (">" ):
403+ toktype = gguf .TokenType .BYTE
401404
402405 return toktype
403406
0 commit comments