
Commit c69c630

convert_hf : fix Gemma v1 conversion (#8597)
* convert_hf : fix Gemma v1 conversion
* convert_hf : allow renaming tokens, but with a warning
* convert_hf : fix Gemma v1 not setting BOS and EOS tokens
1 parent 69c487f commit c69c630
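
The token-handling hunks in the diff below all make the same change: a hard assert on the token content from tokenizer_config.json becomes a warning, so a model such as Gemma v1 whose config renames a token converts with a logged rename instead of aborting. A minimal, self-contained sketch of that pattern in Python; the helper name, logger name, and sample tokens are illustrative only and are not taken from the script:

import logging

logger = logging.getLogger("hf-to-gguf")

def apply_config_token(tokens: list[bytes], token_id: int, content: str) -> None:
    # Old behaviour: assert tokens[token_id] == content.encode("utf-8"),
    # which aborted the whole conversion on any mismatch.
    new_token = content.encode("utf-8")
    if tokens[token_id] != new_token:
        # New behaviour: log the rename and keep going.
        logger.warning(f'replacing token {token_id}: '
                       f'{tokens[token_id].decode("utf-8")!r} -> {content!r}')
    tokens[token_id] = new_token

# Example: a vocab whose entry 1 is renamed by tokenizer_config.json.
vocab = [b"<pad>", b"<unused_1>", b"<bos>"]
apply_config_token(vocab, 1, "<custom_token>")  # warns, then applies the rename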

File tree

1 file changed: +12 −5 lines changed


convert_hf_to_gguf.py

Lines changed: 12 additions & 5 deletions
@@ -753,7 +753,8 @@ def _create_vocab_sentencepiece(self):
                     token_id = int(token_id)
                     token: str = token_data["content"]
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token.encode("utf-8")
+                        if tokens[token_id] != token.encode("utf-8"):
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
                     if token_data.get("special") or self.does_token_look_special(token):
                         toktypes[token_id] = SentencePieceTokenTypes.CONTROL
                     else:
@@ -1312,6 +1313,7 @@ def set_vocab(self):
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
+        special_vocab.chat_template = None # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

     def set_gguf_parameters(self):
@@ -2014,7 +2016,8 @@ def set_vocab(self):
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2030,7 +2033,8 @@ def set_vocab(self):
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2269,7 +2273,8 @@ def set_vocab(self):
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert(tokens[token_id] == token)
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2288,7 +2293,8 @@ def set_vocab(self):
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert(tokens[token_id] == token)
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2474,6 +2480,7 @@ def set_vocab(self):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep", 70)
         special_vocab._set_special_token("eot", 107)
+        special_vocab.chat_template = None # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)

         self.gguf_writer.add_add_space_prefix(False)
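
The two chat_template = None additions above share a rationale: the chat template is already written elsewhere on these conversion paths (hence the in-diff comment "do not add it twice"), so the attribute is cleared on the SpecialVocab instance before add_to_gguf. A rough sketch of that usage in Python, assuming the gguf package from this repository; the model directory, output file, and the reuse of the "eot" id 107 from the Gemma hunk are placeholders, not the converter's actual values:

from pathlib import Path
import gguf

# Hypothetical, trimmed-down excerpt of a set_vocab() path.
dir_model = Path("models/gemma-2b")                # assumed HF model directory
gguf_writer = gguf.GGUFWriter("gemma-2b.gguf", "gemma")

special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
special_vocab._set_special_token("eot", 107)

# The chat template was already added earlier on this conversion path, so
# clear it here to avoid writing the same metadata key twice.
special_vocab.chat_template = None
special_vocab.add_to_gguf(gguf_writer)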

0 commit comments

Comments
 (0)