Commit 8a2f2fe

convert : ignore tokens if their IDs are within [0, vocab_size) (#3831)
1 parent bd6d9e2 commit 8a2f2fe

File tree

1 file changed: +12 -9 lines changed


convert.py

Lines changed: 12 additions & 9 deletions
@@ -366,16 +366,19 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
             added_tokens = {}
 
         vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
 
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
+        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base = vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
 
     def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
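To make the change concrete, here is a minimal, self-contained sketch of the new logic. The split_added_tokens helper and the example data are illustrative only and are not part of convert.py, which performs the same steps inline in the vocab class's __init__:

    # Sketch of the added-token handling after this commit (illustrative helper,
    # not the actual convert.py API).
    def split_added_tokens(added_tokens: dict[str, int], vocab_size: int) -> list[str]:
        # Ignore entries whose IDs fall within [0, vocab_size): they duplicate
        # base-vocabulary tokens. Keep only tokens that extend the vocabulary.
        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
        actual_new_ids = sorted(new_tokens.keys())

        # The surviving IDs must still be dense and start exactly at vocab_size.
        if expected_new_ids != actual_new_ids:
            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
        return [new_tokens[id] for id in actual_new_ids]

    # Example: an added_tokens.json that re-lists two base-vocab tokens alongside
    # two genuinely new ones. Before this commit the converter raised an exception
    # because the base-vocab IDs broke the sequential check; now they are skipped.
    added = {"<s>": 1, "</s>": 2, "<custom_0>": 32000, "<custom_1>": 32001}
    print(split_added_tokens(added, vocab_size=32000))  # ['<custom_0>', '<custom_1>']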
