|
24 | 24 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
25 | 25 | from dataclasses import dataclass
|
26 | 26 | from pathlib import Path
|
| 27 | +from transformers import PreTrainedTokenizerFast
| 28 | +from transformers.convert_slow_tokenizer import TikTokenConverter
27 | 29 | from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
|
28 | 30 |
|
29 | 31 | import numpy as np
|
@@ -1582,6 +1584,52 @@ def do_dump_model(model_plus: ModelPlus) -> None:
|
1582 | 1584 |     for name, lazy_tensor in model_plus.model.items():

1583 | 1585 |         print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")  # noqa: NP100
|
1584 | 1586 |
|
| 1587 | +# Tokenizer conversion for Llama 3
| 1588 | +# Credits: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
| 1589 | +class Llama3Converter(TikTokenConverter):
| 1590 | +    def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs):
| 1591 | +        super().__init__(vocab_file, **kwargs)
| 1592 | +        tokenizer = self.converted()
| 1593 | +        chat_template = (
| 1594 | +            "{% set loop_messages = messages %}"
| 1595 | +            "{% for message in loop_messages %}"
| 1596 | +            "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}"
| 1597 | +            "{% if loop.index0 == 0 %}"
| 1598 | +            "{% set content = bos_token + content %}"
| 1599 | +            "{% endif %}"
| 1600 | +            "{{ content }}"
| 1601 | +            "{% endfor %}"
| 1602 | +            "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
| 1603 | +        )
| 1604 | +        # The 10 named tokens below plus the generated range fill all num_reserved_special_tokens slots.
| 1605 | +        special_tokens = [
| 1606 | +            "<|begin_of_text|>",
| 1607 | +            "<|end_of_text|>",
| 1608 | +            "<|reserved_special_token_0|>",
| 1609 | +            "<|reserved_special_token_1|>",
| 1610 | +            "<|reserved_special_token_2|>",
| 1611 | +            "<|reserved_special_token_3|>",
| 1612 | +            "<|start_header_id|>",
| 1613 | +            "<|end_header_id|>",
| 1614 | +            "<|reserved_special_token_4|>",
| 1615 | +            "<|eot_id|>",  # end of turn
| 1616 | +        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
| 1617 | +        tokenizer.add_special_tokens(special_tokens)
| 1618 | +
| 1619 | +        self.tokenizer = PreTrainedTokenizerFast(
| 1620 | +            tokenizer_object=tokenizer,
| 1621 | +            bos_token="<|begin_of_text|>",
| 1622 | +            eos_token="<|end_of_text|>",
| 1623 | +            chat_template=chat_template,
| 1624 | +            model_input_names=["input_ids", "attention_mask"],
| 1625 | +        )
| 1626 | +
| 1627 | +def write_llama3_tokenizer(tokenizer_path: str, input_tokenizer_path: str) -> PreTrainedTokenizerFast:
| 1628 | +    tokenizer = Llama3Converter(input_tokenizer_path).tokenizer
| 1629 | +    print(f"Saving a {tokenizer.__class__.__name__} to {tokenizer_path}.")
| 1630 | +    tokenizer.save_pretrained(tokenizer_path)
| 1631 | +    return tokenizer
| 1632 | +
1585 | 1633 |
|
1586 | 1634 | def main(args_in: list[str] | None = None) -> None:
|
1587 | 1635 |     output_choices = ["f32", "f16"]
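
Not part of the patch, but as a quick sanity check of the converter above, a sketch like the following could be run against a checkpoint directory containing the original tiktoken `tokenizer.model`. The `Meta-Llama-3-8B` path is illustrative, not a fixture of this repo:

```python
# Sketch: sanity-check the Llama3Converter output (assumed local checkpoint path).
from pathlib import Path

model_dir = Path("Meta-Llama-3-8B")  # hypothetical checkpoint directory
hf_tokenizer = Llama3Converter(str(model_dir / "tokenizer.model")).tokenizer

# Plain text should round-trip through encode/decode unchanged.
ids = hf_tokenizer.encode("Hello, world!", add_special_tokens=False)
assert hf_tokenizer.decode(ids) == "Hello, world!"

# Each registered special token should map to a single id, not be split.
assert len(hf_tokenizer.encode("<|eot_id|>", add_special_tokens=False)) == 1
```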
|
@@ -1621,8 +1669,7 @@ def main(args_in: list[str] | None = None) -> None:
|
1621 | 1669 |     # TODO: add more bandaids for llama 3 detection

1622 | 1670 |     try:

1623 | 1671 |         global is_llama3_model

1624 | | -        import convert_llama_weights_to_hf
1625 | | -        convert_llama_weights_to_hf.write_tokenizer(args.model, os.path.join(args.model, "tokenizer.model"), 3)
| 1672 | +        write_llama3_tokenizer(args.model, os.path.join(args.model, "tokenizer.model"))
1626 | 1673 |         is_llama3_model = True

1627 | 1674 |     except:

1628 | 1675 |         pass
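
For reference, a sketch of the end-to-end flow that `main()` now relies on, again with an illustrative checkpoint path. Note that the stored chat template appends the assistant header unconditionally; there is no `add_generation_prompt` guard in the template above:

```python
# Sketch: convert, save, and render a prompt (paths are assumptions).
import os

model_dir = "Meta-Llama-3-8B"  # hypothetical checkpoint directory
tok = write_llama3_tokenizer(model_dir, os.path.join(model_dir, "tokenizer.model"))

prompt = tok.apply_chat_template([{"role": "user", "content": "Hi"}], tokenize=False)
# prompt == "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
#           "Hi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
```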
|
|