@@ -357,6 +357,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None:
357
357
for tok in self .tokenizer .all_special_tokens
358
358
}
359
359
self .special_ids : set [int ] = set (self .tokenizer .all_special_ids )
360
+ self .reverse_vocab = {id : encoded_tok for encoded_tok , id in self .tokenizer .get_vocab ().items ()}
360
361
self .vocab_size_base : int = self .tokenizer .vocab_size
361
362
self .vocab_size : int = self .vocab_size_base + len (self .added_tokens_dict )
362
363
self .fname_tokenizer : Path = fname_tokenizer
@@ -370,15 +371,13 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None:
370
371
self .spm = None
371
372
372
373
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
    """Yield one ``(text, score, type)`` triple per base-vocabulary token.

    Token ids are visited in ascending order over
    ``range(self.vocab_size_base)``. Ids that appear as values of
    ``self.added_tokens_dict`` are skipped.

    Yields:
        A tuple of the token's text encoded as UTF-8 bytes, the value of
        ``self.get_token_score(token_id)`` and the value of
        ``self.get_token_type(token_id)``.

    Raises:
        KeyError: if a non-skipped id has no entry in ``self.reverse_vocab``.
    """
    # Build the skip set once so the membership test in the loop is O(1).
    added_tokens_ids = set(self.added_tokens_dict.values())

    for token_id in range(self.vocab_size_base):
        if token_id in added_tokens_ids:
            continue

        # self.reverse_vocab (id -> token string) is built once in __init__
        # from tokenizer.get_vocab(), so it is not recomputed on every call.
        text = self.reverse_vocab[token_id].encode("utf-8")
        yield text, self.get_token_score(token_id), self.get_token_type(token_id)
383
382
384
383
def get_token_type (self , token_id : int ) -> gguf .TokenType :
@@ -394,10 +393,13 @@ def get_token_type(self, token_id: int) -> gguf.TokenType:
394
393
if self .spm .is_byte (token_id ):
395
394
toktype = gguf .TokenType .BYTE
396
395
else :
396
+ token = self .reverse_vocab [token_id ]
397
397
if token_id == self .unk_token_id :
398
398
toktype = gguf .TokenType .UNKNOWN
399
- if token_id in self .special_ids :
399
+ elif token_id in self .special_ids :
400
400
toktype = gguf .TokenType .CONTROL
401
+ elif len (token ) == 6 and token .startswith ("<0x" ) and token .endswith (">" ):
402
+ toktype = gguf .TokenType .BYTE
401
403
402
404
return toktype
403
405
0 commit comments