
Commit ef69410

Merge HF tokenizer converter into convert.py
probably works idk, at least the model converted without issues
1 parent ca0dd58 commit ef69410

2 files changed: +49 −154 lines changed


convert.py

Lines changed: 49 additions & 2 deletions
@@ -24,6 +24,8 @@
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
+from transformers import PreTrainedTokenizerFast
+from transformers.convert_slow_tokenizer import TikTokenConverter
 from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
 
 import numpy as np
@@ -1582,6 +1584,52 @@ def do_dump_model(model_plus: ModelPlus) -> None:
     for name, lazy_tensor in model_plus.model.items():
         print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100
 
+# Tokenizer conversion for LLaMA 3
+# Credits: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+class Llama3Converter(TikTokenConverter):
+    def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs):
+        super().__init__(vocab_file, **kwargs)
+        tokenizer = self.converted()
+        chat_template = (
+            "{% set loop_messages = messages %}"
+            "{% for message in loop_messages %}"
+            "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}"
+            "{% if loop.index0 == 0 %}"
+            "{% set content = bos_token + content %}"
+            "{% endif %}"
+            "{{ content }}"
+            "{% endfor %}"
+            "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
+        )
+        num_reserved_special_tokens = 256
+        special_tokens = [
+            "<|begin_of_text|>",
+            "<|end_of_text|>",
+            "<|reserved_special_token_0|>",
+            "<|reserved_special_token_1|>",
+            "<|reserved_special_token_2|>",
+            "<|reserved_special_token_3|>",
+            "<|start_header_id|>",
+            "<|end_header_id|>",
+            "<|reserved_special_token_4|>",
+            "<|eot_id|>",  # end of turn
+        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
+        tokenizer.add_special_tokens(special_tokens)
+
+        self.tokenizer = PreTrainedTokenizerFast(
+            tokenizer_object=tokenizer,
+            bos_token="<|begin_of_text|>",
+            eos_token="<|end_of_text|>",
+            chat_template=chat_template,
+            model_input_names=["input_ids", "attention_mask"],
+        )
+
+def write_llama3_tokenizer(tokenizer_path, input_tokenizer_path):
+    tokenizer = Llama3Converter(input_tokenizer_path).tokenizer
+    print(f"Saving a {tokenizer.__class__.__name__} to {tokenizer_path}.")
+    tokenizer.save_pretrained(tokenizer_path)
+    return tokenizer
+
 
 def main(args_in: list[str] | None = None) -> None:
     output_choices = ["f32", "f16"]
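The special-token list in the hunk above is sized to fill Llama 3's reserved vocabulary exactly: ten tokens are spelled out by name, and the comprehension generates reserved tokens 5 through 250. A quick sanity check of that arithmetic (plain Python, mirroring the list construction in the diff):

num_reserved_special_tokens = 256
explicit = 10  # the ten tokens listed by name, including <|eot_id|>
generated = len(range(5, num_reserved_special_tokens - 5))  # reserved tokens 5..250 -> 246
assert explicit + generated == num_reserved_special_tokens  # 10 + 246 == 256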
@@ -1621,8 +1669,7 @@ def main(args_in: list[str] | None = None) -> None:
     #TODO: add more bandaids for llama 3 detection
     try:
         global is_llama3_model
-        import convert_llama_weights_to_hf
-        convert_llama_weights_to_hf.write_tokenizer(args.model, os.path.join(args.model, "tokenizer.model"), 3)
+        write_llama3_tokenizer(args.model, os.path.join(args.model, "tokenizer.model"))
         is_llama3_model = True
     except:
         pass
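
For reference, a minimal sketch of how the new path could be exercised end to end. The checkpoint directory name is hypothetical, and this assumes convert.py is importable and a transformers version recent enough to ship TikTokenConverter and apply_chat_template:

from transformers import PreTrainedTokenizerFast
from convert import write_llama3_tokenizer  # assumes convert.py is on the import path

model_dir = "models/Meta-Llama-3-8B"  # hypothetical local checkpoint directory

# Build the fast tokenizer from the tiktoken vocab and save tokenizer.json
# (plus its config files) into the checkpoint directory.
write_llama3_tokenizer(model_dir, f"{model_dir}/tokenizer.model")

# Round-trip: reload what was saved and render the chat template.
reloaded = PreTrainedTokenizerFast.from_pretrained(model_dir)
prompt = reloaded.apply_chat_template([{"role": "user", "content": "Hello"}], tokenize=False)
print(prompt)  # expected to start with <|begin_of_text|><|start_header_id|>user

Note that main() keeps the call inside a bare try/except, so models without a Llama 3 tiktoken vocabulary silently fall through to the existing vocab-loading path.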

convert_llama_weights_to_hf.py

Lines changed: 0 additions & 152 deletions
This file was deleted.
