Merging #7568 with #7430 (Implementing LLaMA 3 torch to gguf conversion) #7651
base: master
@@ -7,6 +7,7 @@
import enum
import faulthandler
import functools
import hashlib
import itertools
import json
import math
@@ -24,7 +25,9 @@
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
from transformers import PreTrainedTokenizerFast
from transformers.convert_slow_tokenizer import TikTokenConverter
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional

import numpy as np
@@ -51,6 +54,7 @@
ADDED_TOKENS_FILE = 'added_tokens.json'
FAST_TOKENIZER_FILE = 'tokenizer.json'
is_llama3_model = False

#
# data types
@@ -523,6 +527,9 @@ def convert(name: str) -> LazyTensor:
        else:
            # split by rows
            axis = 0
        global is_llama3_model
        if name.startswith('tok_embeddings.') and is_llama3_model:
            axis = 0
        concatenated_shape = list(lazy_tensors[0].shape)
        concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
@@ -896,6 +903,12 @@ def add_meta_vocab(self, vocab: Vocab) -> None:
        tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)

        # Add extracted token information for model conversion
        # Tokenizer for LLaMA 3
        # Source: trust me bro
        global is_llama3_model
        if is_llama3_model:
            self.gguf.add_tokenizer_model("gpt2")
            self.gguf.add_tokenizer_pre("llama-bpe")
        self.gguf.add_token_list(tokens)
        self.gguf.add_token_scores(scores)
        self.gguf.add_token_types(toktypes)
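For context, the two new writer calls above set the tokenizer metadata of the output GGUF so that llama.cpp uses a GPT-2 style BPE tokenizer with the llama-bpe pre-tokenizer. A minimal sketch of the corresponding keys, assuming gguf-py exposes its key constants as `gguf.Keys` (worth verifying against the version in this repo):

```python
# Illustration only (not part of the diff): the GGUF metadata keys that the
# add_tokenizer_model()/add_tokenizer_pre() calls above populate, assuming
# gguf-py exposes its key constants as gguf.Keys.
import gguf

print(gguf.Keys.Tokenizer.MODEL)  # "tokenizer.ggml.model" -> written as "gpt2"
print(gguf.Keys.Tokenizer.PRE)    # "tokenizer.ggml.pre"   -> written as "llama-bpe"
print(gguf.Keys.Tokenizer.LIST)   # "tokenizer.ggml.tokens"
```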
@@ -1208,7 +1221,7 @@ def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
            try:
                vocab = cls(self.path)
                break
            except FileNotFoundError:
            except:
                pass # ignore unavailable tokenizers
        else:
            raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
@@ -1274,6 +1287,57 @@ def do_dump_model(model_plus: ModelPlus) -> None:
    for name, lazy_tensor in model_plus.model.items():
        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100


# Tokenizer conversion for LLaMA 3
# Credits: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
class Llama3Converter(TikTokenConverter):
    def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs):
        super().__init__(vocab_file, **kwargs)
        tokenizer = self.converted()
        chat_template = (
            "{% set loop_messages = messages %}"
            "{% for message in loop_messages %}"
            "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}"
            "{% if loop.index0 == 0 %}"
            "{% set content = bos_token + content %}"
            "{% endif %}"
            "{{ content }}"
            "{% endfor %}"
            "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
        )
        num_reserved_special_tokens = 256
        special_tokens = [
            "<|begin_of_text|>",
            "<|end_of_text|>",
            "<|reserved_special_token_0|>",
            "<|reserved_special_token_1|>",
            "<|reserved_special_token_2|>",
            "<|reserved_special_token_3|>",
            "<|start_header_id|>",
            "<|end_header_id|>",
            "<|reserved_special_token_4|>",
            "<|eot_id|>", # end of turn
        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
        tokenizer.add_special_tokens(special_tokens)
Review comment on lines +1296 to +1320:
Not sure about hardcoding the special tokens. These should be read from the […]. It should also be possible to handle the chat template, bos and eos tokens simply and more generally with […].
        self.tokenizer = PreTrainedTokenizerFast(
            tokenizer_object=tokenizer,
            bos_token="<|begin_of_text|>",
            eos_token="<|end_of_text|>",
            chat_template=chat_template,
            model_input_names=["input_ids", "attention_mask"],
        )
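On the review comment above about hardcoding the special tokens: a minimal sketch of one way to read them from a sidecar file instead, assuming a Hugging Face style tokenizer_config.json sits next to the checkpoint. The file name, keys, and the helper itself are my assumptions for illustration, not part of this PR:

```python
# Hypothetical helper (not in the PR): read special tokens from an adjacent
# tokenizer_config.json when one exists, falling back to the hardcoded list.
import json
from pathlib import Path

def load_special_tokens(model_dir: Path, fallback: list[str]) -> list[str]:
    config_path = model_dir / "tokenizer_config.json"
    if not config_path.is_file():
        return fallback
    config = json.loads(config_path.read_text(encoding="utf-8"))
    # In HF tokenizer configs, added_tokens_decoder maps token ids to entries
    # like {"content": "<|eot_id|>", "special": true, ...}.
    added = config.get("added_tokens_decoder", {})
    specials = [tok["content"] for tok in added.values() if tok.get("special")]
    return specials or fallback
```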
def write_llama3_tokenizer(tokenizer_path, input_tokenizer_path):
    tokenizer = Llama3Converter(input_tokenizer_path).tokenizer
    print(f"Saving a {tokenizer.__class__.__name__} to {tokenizer_path}.")
    tokenizer.save_pretrained(tokenizer_path)
    return tokenizer


def is_llama3_tokenizer(tokenizer_path) -> bool:
    llama3_tokenizer_model_hash: str = "82e9d31979e92ab929cd544440f129d9ecd797b69e327f80f17e1c50d5551b55"
    with open(tokenizer_path, "rb") as f:
        tokenizer_hash = hashlib.sha256(f.read()).hexdigest()
    return llama3_tokenizer_model_hash == tokenizer_hash


def main(args_in: list[str] | None = None) -> None:
    output_choices = ["f32", "f16"]
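For readers who want to see what the hardcoded chat template in Llama3Converter produces, here is a standalone render with jinja2 (illustration only, not code from the PR; the example message is made up):

```python
# Standalone illustration: render the chat template defined in Llama3Converter
# above for a single made-up user message, to show the prompt format it yields.
from jinja2 import Template

chat_template = (
    "{% set loop_messages = messages %}"
    "{% for message in loop_messages %}"
    "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}"
    "{% if loop.index0 == 0 %}"
    "{% set content = bos_token + content %}"
    "{% endif %}"
    "{{ content }}"
    "{% endfor %}"
    "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
)

print(Template(chat_template).render(
    messages=[{"role": "user", "content": "Hello"}],
    bos_token="<|begin_of_text|>",
))
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# Hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
```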
@@ -1287,7 +1351,7 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
    parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
    parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft,bpe)", default="spm,hfft,bpe")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
    parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
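The --vocab-type default change above means a BPE vocab is now tried (last) without the user opting in. A hypothetical invocation of the script relying on that default (the model path is made up):

```python
# Hypothetical invocation (path made up): with the new default "spm,hfft,bpe",
# the converter can fall back to a BPE vocab without --vocab-type on the command line.
main(["models/Meta-Llama-3-8B", "--outtype", "f16"])
```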
@@ -1298,7 +1362,6 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
    parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")

    args = parser.parse_args(args_in)

    if args.verbose:
@@ -1311,6 +1374,12 @@ def main(args_in: list[str] | None = None) -> None:

    metadata = Metadata.load(args.metadata)

    #TODO: add more bandaids for llama 3 detection
    if is_llama3_tokenizer(os.path.join(args.model, "tokenizer.model")):
        global is_llama3_model
        write_llama3_tokenizer(args.model, os.path.join(args.model, "tokenizer.model"))
Review comment: What if there's already a […]? The convert scripts don't normally modify the model files, so I think there should be a presence check.
        is_llama3_model = True

    if args.get_outfile:
        model_plus = load_some_model(args.model)
        params = Params.load(model_plus)
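A minimal sketch of the presence check the review comment above asks for, reusing names defined earlier in this file (FAST_TOKENIZER_FILE, is_llama3_tokenizer, write_llama3_tokenizer); the helper name and the skip-instead-of-overwrite behaviour are my assumptions, not the PR's:

```python
# Hypothetical guard (not in the PR): only write a fast tokenizer if the model
# directory does not already contain one, so conversion never clobbers files.
import os

def maybe_write_llama3_tokenizer(model_dir) -> bool:
    fast_tokenizer_path = os.path.join(model_dir, FAST_TOKENIZER_FILE)  # tokenizer.json
    slow_tokenizer_path = os.path.join(model_dir, "tokenizer.model")
    if os.path.exists(fast_tokenizer_path):
        return False  # leave an existing tokenizer.json untouched
    if os.path.exists(slow_tokenizer_path) and is_llama3_tokenizer(slow_tokenizer_path):
        write_llama3_tokenizer(model_dir, slow_tokenizer_path)
        return True
    return False
```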
@@ -1366,6 +1435,7 @@ def main(args_in: list[str] | None = None) -> None:

    logger.info(f"params = {params}")

    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
    vocab_factory = VocabFactory(vocab_path)
Review comment: `convert-legacy-llama.py` did not depend on `transformers` before, I think (I might be mistaken). Is there a way this could be imported only when a Llama 3 model is detected? Maybe by defining the `Llama3Converter` class in `write_llama3_tokenizer`, and importing at the start of that function? Otherwise a comment saying these are for Llama 3 might be helpful.
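A rough sketch of the restructuring suggested in the comment above (my own illustration, not the PR author's code): move the transformers imports and the converter class inside write_llama3_tokenizer, so the dependency is only needed when a Llama 3 model is actually detected. The class body is elided here and stands for the __init__ shown in the diff:

```python
# Sketch of the reviewer's suggestion, not the PR author's code: defer the
# transformers import so converting non-Llama-3 models does not require it.
def write_llama3_tokenizer(tokenizer_path, input_tokenizer_path):
    # Imported here, on first use, instead of at module level.
    from transformers import PreTrainedTokenizerFast
    from transformers.convert_slow_tokenizer import TikTokenConverter

    class Llama3Converter(TikTokenConverter):
        # Same __init__ as in the diff above, ending with the
        # PreTrainedTokenizerFast assigned to self.tokenizer.
        ...

    tokenizer = Llama3Converter(input_tokenizer_path).tokenizer
    print(f"Saving a {tokenizer.__class__.__name__} to {tokenizer_path}.")
    tokenizer.save_pretrained(tokenizer_path)
    return tokenizer
```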