@@ -366,16 +366,19 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         added_tokens = {}

         vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")

-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base = vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens

     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
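The net effect of the hunk: entries in the added-tokens map whose IDs already fall inside the base SentencePiece vocabulary are filtered out instead of tripping the sequential-ID check, and only the genuinely new IDs must form a contiguous run starting at vocab_size. The following is a minimal standalone sketch of that filtering and validation step; the token pieces, IDs, and vocab size are made up for illustration and are not taken from the PR.

# Standalone sketch of the new filtering logic with hypothetical values.
vocab_size = 4                                       # pretend the base vocab covers IDs 0-3
added_tokens = {"<s>": 1, "<tool>": 4, "<eot>": 5}   # "<s>" duplicates a base-vocab ID

# Keep only tokens whose IDs lie past the base vocabulary.
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}

# The surviving IDs must form a contiguous run starting at vocab_size.
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids:
    raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")

added_tokens_list = [new_tokens[id] for id in actual_new_ids]
print(added_tokens_list)                      # ['<tool>', '<eot>']
print(vocab_size + len(added_tokens_list))    # effective vocab size: 6

With these hypothetical inputs, "<s>" is dropped because ID 1 is already covered by the base vocabulary, while "<tool>" and "<eot>" extend it, giving an effective vocabulary size of 6.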