@@ -753,7 +753,8 @@ def _create_vocab_sentencepiece(self):
                     token_id = int(token_id)
                     token: str = token_data["content"]
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token.encode("utf-8")
+                        if tokens[token_id] != token.encode("utf-8"):
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}')
                     if token_data.get("special") or self.does_token_look_special(token):
                         toktypes[token_id] = SentencePieceTokenTypes.CONTROL
                     else:
@@ -1312,6 +1313,7 @@ def set_vocab(self):
         special_vocab._set_special_token("prefix", 1)
         special_vocab._set_special_token("suffix", 3)
         special_vocab._set_special_token("middle", 2)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
@@ -2014,7 +2016,8 @@ def set_vocab(self):
                     token_id = int(token_id)
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2030,7 +2033,8 @@ def set_vocab(self):
                     token_id = int(foken_data["id"])
                     token = foken_data["content"].encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert tokens[token_id] == token
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2269,7 +2273,8 @@ def set_vocab(self):
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert(tokens[token_id] == token)
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2288,7 +2293,8 @@ def set_vocab(self):
                         chat_eos_token_id = token_id
                     token = token.encode("utf-8")
                     if toktypes[token_id] != SentencePieceTokenTypes.UNUSED:
-                        assert(tokens[token_id] == token)
+                        if tokens[token_id] != token:
+                            logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}')
                     tokens[token_id] = token
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -2474,6 +2480,7 @@ def set_vocab(self):
         special_vocab._set_special_token("middle", 68)
         special_vocab._set_special_token("fsep", 70)
         special_vocab._set_special_token("eot", 107)
+        special_vocab.chat_template = None  # do not add it twice
         special_vocab.add_to_gguf(self.gguf_writer)
 
         self.gguf_writer.add_add_space_prefix(False)
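For reference, a minimal standalone sketch of the pattern these hunks apply: instead of asserting that the base vocab entry already matches an added-token definition, log a warning and overwrite the entry so conversion can proceed when the tokenizer files disagree. The helper name `apply_added_token` and the logger setup below are hypothetical illustrations, not part of convert_hf_to_gguf.py.

```python
import logging

logger = logging.getLogger("hf-to-gguf")
logging.basicConfig(level=logging.WARNING)


def apply_added_token(tokens: list[bytes], token_id: int, content: str) -> None:
    """Overwrite tokens[token_id] with content, warning on mismatch instead of asserting."""
    token = content.encode("utf-8")
    if tokens[token_id] != token:
        logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {content!r}')
    tokens[token_id] = token


# usage: the base vocab says id 3 is "<unk>", but the added-tokens config says "<pad>"
vocab = [b"<s>", b"</s>", b"<0x00>", b"<unk>"]
apply_added_token(vocab, 3, "<pad>")  # logs a warning, then overwrites the entry
```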