diff --git a/examples/convert-legacy-llama.py b/examples/convert-legacy-llama.py
index fd840101569a9..f50a5a60e9704 100755
--- a/examples/convert-legacy-llama.py
+++ b/examples/convert-legacy-llama.py
@@ -7,6 +7,7 @@
 import enum
 import faulthandler
 import functools
+import hashlib
 import itertools
 import json
 import math
@@ -24,7 +25,9 @@
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
+from transformers import PreTrainedTokenizerFast
+from transformers.convert_slow_tokenizer import TikTokenConverter
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
 
 import numpy as np
 
@@ -51,6 +54,7 @@
 ADDED_TOKENS_FILE = 'added_tokens.json'
 FAST_TOKENIZER_FILE = 'tokenizer.json'
 
+is_llama3_model = False
 
 #
 # data types
@@ -523,6 +527,10 @@ def convert(name: str) -> LazyTensor:
         else:
             # split by rows
             axis = 0
+        # LLaMA 3 shards tok_embeddings along rows, so override the column split chosen above
+        global is_llama3_model
+        if name.startswith('tok_embeddings.') and is_llama3_model:
+            axis = 0
         concatenated_shape = list(lazy_tensors[0].shape)
         concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
 
@@ -896,6 +904,12 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
 
         # Add extracted token information for model conversion
+        # Tokenizer for LLaMA 3
+        # Source: trust me bro
+        global is_llama3_model
+        if is_llama3_model:
+            self.gguf.add_tokenizer_model("gpt2")
+            self.gguf.add_tokenizer_pre("llama-bpe")
         self.gguf.add_token_list(tokens)
         self.gguf.add_token_scores(scores)
         self.gguf.add_token_types(toktypes)
@@ -1208,7 +1222,7 @@ def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
             try:
                 vocab = cls(self.path)
                 break
-            except FileNotFoundError:
+            except Exception:
                 pass  # ignore unavailable tokenizers
         else:
             raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
@@ -1274,6 +1288,60 @@ def do_dump_model(model_plus: ModelPlus) -> None:
     for name, lazy_tensor in model_plus.model.items():
         print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")  # noqa: NP100
 
+# Tokenizer conversion for LLaMA 3
+# Credits: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+class Llama3Converter(TikTokenConverter):
+    def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs):
+        super().__init__(vocab_file, **kwargs)
+        tokenizer = self.converted()
+        chat_template = (
+            "{% set loop_messages = messages %}"
+            "{% for message in loop_messages %}"
+            "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}"
+            "{% if loop.index0 == 0 %}"
+            "{% set content = bos_token + content %}"
+            "{% endif %}"
+            "{{ content }}"
+            "{% endfor %}"
+            "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
+        )
+        num_reserved_special_tokens = 256
+        special_tokens = [
+            "<|begin_of_text|>",
+            "<|end_of_text|>",
+            "<|reserved_special_token_0|>",
+            "<|reserved_special_token_1|>",
+            "<|reserved_special_token_2|>",
+            "<|reserved_special_token_3|>",
+            "<|start_header_id|>",
+            "<|end_header_id|>",
+            "<|reserved_special_token_4|>",
+            "<|eot_id|>",  # end of turn
+        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
+        tokenizer.add_special_tokens(special_tokens)
+
+        self.tokenizer = PreTrainedTokenizerFast(
+            tokenizer_object=tokenizer,
+            bos_token="<|begin_of_text|>",
+            eos_token="<|end_of_text|>",
+            chat_template=chat_template,
+            model_input_names=["input_ids", "attention_mask"],
+        )
+
+def write_llama3_tokenizer(tokenizer_path, input_tokenizer_path):
+    tokenizer = Llama3Converter(input_tokenizer_path).tokenizer
+    print(f"Saving a {tokenizer.__class__.__name__} to {tokenizer_path}.")
+    tokenizer.save_pretrained(tokenizer_path)
+    return tokenizer
+
+def is_llama3_tokenizer(tokenizer_path) -> bool:
+    # bail out early so inputs shipped without a tokenizer.model don't crash the script
+    if not os.path.isfile(tokenizer_path):
+        return False
+    llama3_tokenizer_model_hash: str = "82e9d31979e92ab929cd544440f129d9ecd797b69e327f80f17e1c50d5551b55"
+    with open(tokenizer_path, "rb") as f:
+        tokenizer_hash = hashlib.sha256(f.read()).hexdigest()
+    return llama3_tokenizer_model_hash == tokenizer_hash
 
 def main(args_in: list[str] | None = None) -> None:
     output_choices = ["f32", "f16"]
@@ -1287,7 +1355,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
     parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
+    parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft,bpe)", default="spm,hfft,bpe")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
@@ -1298,7 +1366,6 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
     parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
     parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")
-
     args = parser.parse_args(args_in)
 
     if args.verbose:
@@ -1311,6 +1378,12 @@ def main(args_in: list[str] | None = None) -> None:
 
     metadata = Metadata.load(args.metadata)
 
+    # TODO: add more bandaids for llama 3 detection
+    if is_llama3_tokenizer(os.path.join(args.model, "tokenizer.model")):
+        global is_llama3_model
+        write_llama3_tokenizer(args.model, os.path.join(args.model, "tokenizer.model"))
+        is_llama3_model = True
+
     if args.get_outfile:
         model_plus = load_some_model(args.model)
         params = Params.load(model_plus)
@@ -1366,6 +1439,7 @@ def main(args_in: list[str] | None = None) -> None:
 
     logger.info(f"params = {params}")
 
+
     model_parent_path = model_plus.paths[0].parent
     vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
     vocab_factory = VocabFactory(vocab_path)
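
Applied to a checkpoint directory, the detection and conversion path above is self-contained: `is_llama3_tokenizer()` gates on the SHA-256 of `tokenizer.model`, and `write_llama3_tokenizer()` drops a Hugging Face fast tokenizer next to the weights so the `bpe` vocab loader can pick it up. A minimal smoke test, assuming the helpers above are in scope and a local `Meta-Llama-3-8B/` directory holds the original Meta checkpoint (the path is hypothetical):

```python
import os

model_dir = "Meta-Llama-3-8B"  # original Meta checkpoint; path is an assumption
tok_model = os.path.join(model_dir, "tokenizer.model")

if is_llama3_tokenizer(tok_model):
    # writes tokenizer.json / tokenizer_config.json into model_dir
    hf_tokenizer = write_llama3_tokenizer(model_dir, tok_model)
    print(hf_tokenizer.encode("Hello, world!"))
else:
    print("not a LLaMA 3 tokenizer.model")
```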
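For a quick sanity check of the embedded chat template, the converted tokenizer can be reloaded through `AutoTokenizer` (`save_pretrained()` writes a `tokenizer_config.json` carrying the template, so the round trip works). Note the template appends the assistant header unconditionally, i.e. it behaves as if `add_generation_prompt` were always set. A sketch, again with a hypothetical path:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Meta-Llama-3-8B")  # path is an assumption
messages = [{"role": "user", "content": "Hello!"}]
print(tokenizer.apply_chat_template(messages, tokenize=False))
# Expected, given the template string above:
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```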