Skip to content

Commit d260ce7

Browse files
ggerganov authored and jordankanter committed
py : handle byte tokens in get_token_type (ggml-org#5341)
* py : handle byte tokens in `get_token_type`
* py : fix empty bytes arg
1 parent f44df47 commit d260ce7

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

convert.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -515,10 +515,14 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
515515

516516
# Yield token text, score, and type
517517
yield token_text, self.get_token_score(token_id), self.get_token_type(
518-
token_id, self.special_ids # Reuse already stored special IDs
518+
token_id, token_text, self.special_ids # Reuse already stored special IDs
519519
)
520520

521-
def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
521+
def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
522+
# Special case for byte tokens
523+
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
524+
return gguf.TokenType.BYTE
525+
522526
# Determine token type based on whether it's a special token
523527
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
524528

@@ -530,7 +534,7 @@ def get_token_score(self, token_id: int) -> float:
530534
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
531535
for text in self.added_tokens_list:
532536
if text in self.specials:
533-
toktype = self.get_token_type(self.specials[text], self.special_ids)
537+
toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
534538
score = self.get_token_score(self.specials[text])
535539
else:
536540
toktype = gguf.TokenType.USER_DEFINED

0 commit comments

Comments
 (0)