@@ -357,6 +357,7 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None:
357
357
for tok in self .tokenizer .all_special_tokens
358
358
}
359
359
self .special_ids : set [int ] = set (self .tokenizer .all_special_ids )
360
+ self .reverse_vocab = {id : encoded_tok for encoded_tok , id in self .tokenizer .get_vocab ().items ()}
360
361
self .vocab_size_base : int = self .tokenizer .vocab_size
361
362
self .vocab_size : int = self .vocab_size_base + len (self .added_tokens_dict )
362
363
self .fname_tokenizer : Path = fname_tokenizer
@@ -371,14 +372,13 @@ def __init__(self, params: Params, fname_tokenizer: Path) -> None:
371
372
372
373
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
    """Yield ``(text, score, type)`` for every base-vocabulary token.

    Walks the ids ``0 .. vocab_size_base - 1`` in ascending order and skips
    any id that also appears as a value in ``self.added_tokens_dict``.

    Yields:
        A 3-tuple of the token string from ``self.reverse_vocab`` encoded as
        UTF-8, ``self.get_token_score(id)`` and ``self.get_token_type(id)``.

    Raises:
        KeyError: if a non-skipped id has no entry in ``self.reverse_vocab``.
    """
    # Build the set once so the per-id membership test below is O(1).
    added_tokens_ids = set(self.added_tokens_dict.values())

    for i in range(self.vocab_size_base):
        if i in added_tokens_ids:
            continue

        # reverse_vocab is the id -> token-string mapping built in __init__,
        # so the vocabulary is not re-inverted on every call.
        text = self.reverse_vocab[i].encode("utf-8")
        yield text, self.get_token_score(i), self.get_token_type(i)
383
383
384
384
def get_token_type (self , token_id : int ) -> gguf .TokenType :
@@ -394,10 +394,13 @@ def get_token_type(self, token_id: int) -> gguf.TokenType:
394
394
if self .spm .is_byte (token_id ):
395
395
toktype = gguf .TokenType .BYTE
396
396
else :
397
+ token = self .reverse_vocab [token_id ]
397
398
if token_id == self .unk_token_id :
398
399
toktype = gguf .TokenType .UNKNOWN
399
- if token_id in self .special_ids :
400
+ elif token_id in self .special_ids :
400
401
toktype = gguf .TokenType .CONTROL
402
+ elif len (token ) == 6 and token .startswith ("<0x" ) and token .endswith (">" ):
403
+ toktype = gguf .TokenType .BYTE
401
404
402
405
return toktype
403
406
0 commit comments