ggml-org · Artefact2 · Jan 22, 2024 · slaren · Feb 3, 2024
diff --git a/convert.py b/convert.py
@@ -509,11 +509,13 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
 
             # Convert token text to bytes
             token_text = reverse_vocab[token_id].encode("utf-8")
+            if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+                toktype = gguf.TokenType.BYTE
+            else:
+                toktype = self.get_token_type(token_id, self.special_ids)
 
             # Yield token text, score, and type
-            yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, self.special_ids  # Reuse already stored special IDs
-            )
+            yield token_text, self.get_token_score(token_id), toktype
 
     def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
         # Determine token type based on whether it's a special token