@@ -515,10 +515,14 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
515
515
516
516
# Yield token text, score, and type
517
517
yield token_text , self .get_token_score (token_id ), self .get_token_type (
518
- token_id , self .special_ids # Reuse already stored special IDs
518
+ token_id , token_text , self .special_ids # Reuse already stored special IDs
519
519
)
520
520
521
def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
    """Classify a vocabulary entry as a BYTE, CONTROL, or NORMAL token.

    Args:
        token_id: Vocabulary id of the token being classified.
        token_text: Raw token bytes. Only a value that is, in its
            entirety, ``<0xHH>`` (two hex digits, either case) is treated
            as a byte token; an empty value never matches.
        special_ids: Ids that should be reported as control tokens.

    Returns:
        ``gguf.TokenType.BYTE`` when ``token_text`` has the byte-token
        form, otherwise ``gguf.TokenType.CONTROL`` if ``token_id`` is in
        ``special_ids``, otherwise ``gguf.TokenType.NORMAL``.
    """
    # The byte-token check runs first, so it wins even when the id is
    # also listed in special_ids.
    byte_match = re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text)
    if byte_match is not None:
        return gguf.TokenType.BYTE

    # Anything else is classified purely by special-id membership.
    if token_id in special_ids:
        return gguf.TokenType.CONTROL
    return gguf.TokenType.NORMAL
524
528
@@ -530,7 +534,7 @@ def get_token_score(self, token_id: int) -> float:
530
534
def added_tokens (self ) -> Iterable [tuple [bytes , float , gguf .TokenType ]]:
531
535
for text in self .added_tokens_list :
532
536
if text in self .specials :
533
- toktype = self .get_token_type (self .specials [text ], self .special_ids )
537
+ toktype = self .get_token_type (self .specials [text ], b'' , self .special_ids )
534
538
score = self .get_token_score (self .specials [text ])
535
539
else :
536
540
toktype = gguf .TokenType .USER_DEFINED
0 commit comments