From 4823b9bdcb4cd7c96b62efc30091b6f5c1596c35 Mon Sep 17 00:00:00 2001 From: Galunid Date: Thu, 26 Oct 2023 13:08:41 +0200 Subject: [PATCH 01/42] Initial generic convert script --- convert-generic.py | 77 +++++++++++++ gguf-py/gguf/util.py | 23 ++++ model.py | 261 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 361 insertions(+) create mode 100755 convert-generic.py create mode 100644 gguf-py/gguf/util.py create mode 100644 model.py diff --git a/convert-generic.py b/convert-generic.py new file mode 100755 index 0000000000000..573d208b96478 --- /dev/null +++ b/convert-generic.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# HF stablelm --> gguf conversion + +from __future__ import annotations + +import os +import sys +from pathlib import Path + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) + +import gguf +import model +import util + +args = util.parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + + +print("gguf: loading model " + dir_model.name) + +hparams = model.Model.load_hparams(dir_model) + +model_class = model.Model.from_model_architecture(hparams["architectures"][0]) +model_instance = model_class(dir_model, ftype) +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[model_instance.model_arch]) + +print("gguf: get model metadata") + +model_instance.set_gguf_parameters(gguf_writer) + +# TOKENIZATION +print("gguf: get tokenizer metadata") +gguf_writer.add_tokenizer_model("gpt2") + +print("gguf: get gpt2 tokenizer vocab") + +tokens, toktypes = model.Model.load_vocab_gpt2(model_instance.dir_model, model_instance.hparams) +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) +special_vocab.add_to_gguf(gguf_writer) + +# write model +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + model_instance.write_tensors(gguf_writer) + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/gguf-py/gguf/util.py b/gguf-py/gguf/util.py new file mode 100644 index 0000000000000..fcb83b54933fd --- /dev/null +++ b/gguf-py/gguf/util.py @@ -0,0 +1,23 @@ +import argparse + +from pathlib import Path + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a stablelm model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) + parser.add_argument( + "ftype", type=int, choices=[0, 1], default=1, nargs='?', + help="output format - use 0 for float32, 1 for float16", + ) + return parser.parse_args() diff --git a/model.py b/model.py new file mode 100644 index 0000000000000..19dfc800febc1 --- /dev/null +++ b/model.py @@ -0,0 +1,261 @@ +import os +import re +import sys +import json +import gguf +import torch +import contextlib +import numpy as np + +from pathlib import Path + + +class Model: + def __init__(self, dir_model: Path, ftype: int): + self.dir_model = dir_model + self.ftype = ftype + self.is_safetensors = not self._is_model_safetensors() + self.num_parts = Model.count_model_parts(self.dir_model, ".bin" if self.is_safetensors else ".bin") + self.part_names = self._get_part_names() + self.hparams = Model.load_hparams(self.dir_model) + self.model_arch = self._get_model_architecture() + + def _is_model_safetensors(self) -> bool: + return Model.count_model_parts(self.dir_model, ".safetensors") > 0 + + def _get_part_names(self): + if self.is_safetensors: + if self.num_parts == 1: # there's only one .safetensors file + return ("model.safetensors",) + return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) + else: + if self.num_parts == 1: # there's only one .bin file + return ("pytorch_model.bin",) + return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) + + def _get_model_architecture(self): + arch = self.hparams["architectures"][0] + if arch == "GPTNeoXForCausalLM": + return gguf.MODEL_ARCH.GPTNEOX + if arch == "BloomForCausalLM": + return gguf.MODEL_ARCH.BLOOM + raise NotImplementedError(f'Architecture "{arch}" not supported!') + + def get_tensors(self): + for part_name in self.part_names: + print("gguf: loading model part '" + part_name + "'") + if self.is_safetensors: + from safetensors import safe_open + ctx = safe_open(self.dir_model / part_name, framework="pt", device="cpu") + else: + ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu")) + + with ctx as model_part: + for name in model_part.keys(): + data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] + yield name, data + + def set_gguf_parameters(self, gguf_writer: gguf.GGUFWriter): + gguf_writer.add_name(self.dir_model.name) + gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + if "max_position_embeddings" in self.hparams: + gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + if "hidden_size" in self.hparams: + gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + if "intermediate_size" in self.hparams: + gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + if "num_attention_head" in self.hparams: + gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + gguf_writer.add_parallel_residual(self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) + + def write_tensors(self, gguf_writer: gguf.GGUFWriter): + block_count = self.hparams["num_hidden_layers"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data in self.get_tensors(): + # we don't need these + if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): + continue + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + @staticmethod + def count_model_parts(dir_model: Path, prefix: str) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.endswith(prefix): + num_parts += 1 + + return num_parts + + @staticmethod + def load_hparams(dir_model): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + return hparams + + @staticmethod + def load_vocab_gpt2(dir_model: Path, hparams): + tokens: list[bytearray] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + return tokens, toktypes + + @staticmethod + def from_model_architecture(model_architecture): + if model_architecture == "StableLMEpochForCausalLM": + return StableLMModel + if model_architecture == "GPTNeoXForCausalLM": + return GPTNeoXModel + if model_architecture == "BloomForCausalLM": + return BloomModel + return Model + +class StableLMModel(Model): + def set_gguf_parameters(self, gguf_writer): + super().set_gguf_parameters(gguf_writer) + gguf_writer.add_rope_dimension_count(int(self.hparams["rope_pct"]*(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]))) + gguf_writer.add_layer_norm_eps(1e-5) + + +class GPTNeoXModel(Model): + pass + +class BloomModel(Model): + def set_gguf_parameters(self, gguf_writer: gguf.GGUFWriter): + gguf_writer.add_name("Bloom") + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + gguf_writer.add_embedding_length(n_embed) + gguf_writer.add_feed_forward_length(4 * n_embed) + gguf_writer.add_block_count(self.hparams["n_layer"]) + gguf_writer.add_head_count(n_head) + gguf_writer.add_head_count_kv(n_head) + gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + gguf_writer.add_file_type(self.ftype) + + def write_tensors(self, gguf_writer): + block_count = self.hparams["n_layer"] + tensors = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + has_lm_head = True + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + for name, data in tensors.items(): + if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): + has_lm_head = False + + name = re.sub(r'transformer\.', '', name) + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) + data = np.concatenate( + (qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed))), + axis=0 + ) + print("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) + data = np.concatenate( + (qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,))), + axis=0 + ) + print("re-format attention.linear_qkv.bias") + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + if not has_lm_head and name == "word_embeddings.weight": + gguf_writer.add_tensor("output.weight", data) + print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa From 22201248a093c56e914cc61e3a0b29e562c8fe52 Mon Sep 17 00:00:00 2001 From: Galunid Date: Fri, 27 Oct 2023 02:05:27 +0200 Subject: [PATCH 02/42] Remove comments --- model.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/model.py b/model.py index 19dfc800febc1..aaee9943065f6 100644 --- a/model.py +++ b/model.py @@ -14,8 +14,8 @@ class Model: def __init__(self, dir_model: Path, ftype: int): self.dir_model = dir_model self.ftype = ftype - self.is_safetensors = not self._is_model_safetensors() - self.num_parts = Model.count_model_parts(self.dir_model, ".bin" if self.is_safetensors else ".bin") + self.is_safetensors = self._is_model_safetensors() + self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) self.model_arch = self._get_model_architecture() @@ -39,6 +39,8 @@ def _get_model_architecture(self): return gguf.MODEL_ARCH.GPTNEOX if arch == "BloomForCausalLM": return gguf.MODEL_ARCH.BLOOM + if arch == "MPTForCausalLM": + return gguf.MODEL_ARCH.MPT raise NotImplementedError(f'Architecture "{arch}" not supported!') def get_tensors(self): @@ -57,7 +59,7 @@ def get_tensors(self): def set_gguf_parameters(self, gguf_writer: gguf.GGUFWriter): gguf_writer.add_name(self.dir_model.name) - gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + gguf_writer.add_block_count(self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))) if "max_position_embeddings" in self.hparams: gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) if "hidden_size" in self.hparams: @@ -69,7 +71,7 @@ def set_gguf_parameters(self, gguf_writer: gguf.GGUFWriter): gguf_writer.add_parallel_residual(self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) def write_tensors(self, gguf_writer: gguf.GGUFWriter): - block_count = self.hparams["num_hidden_layers"] + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data in self.get_tensors(): # we don't need these @@ -161,6 +163,8 @@ def from_model_architecture(model_architecture): return GPTNeoXModel if model_architecture == "BloomForCausalLM": return BloomModel + if model_architecture == "MPTForCausalLM": + return MPTModel return Model class StableLMModel(Model): @@ -259,3 +263,19 @@ def write_tensors(self, gguf_writer): if not has_lm_head and name == "word_embeddings.weight": gguf_writer.add_tensor("output.weight", data) print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa + +class MPTModel(Model): + def set_gguf_parameters(self, gguf_writer): + block_count = self.hparams["n_layers"] + gguf_writer.add_name(self.dir_model.name) + gguf_writer.add_context_length(self.hparams["max_seq_len"]) + gguf_writer.add_embedding_length(self.hparams["d_model"]) + gguf_writer.add_block_count(block_count) + gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) + gguf_writer.add_head_count(self.hparams["n_heads"]) + if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): + gguf_writer.add_head_count_kv(kv_n_heads) + gguf_writer.add_layer_norm_eps(1e-05) + if self.hparams["attn_config"]["clip_qkv"] is not None: + gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) + gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) From 0ff237105dee6db2bdc5b3259f9ad36814ae8b44 Mon Sep 17 00:00:00 2001 From: Galunid Date: Sun, 29 Oct 2023 00:33:05 +0200 Subject: [PATCH 03/42] Make gguf_writer member of Model, rework tokenizer export --- convert-generic.py | 24 +++----- model.py | 145 ++++++++++++++++++++++++--------------------- 2 files changed, 85 insertions(+), 84 deletions(-) diff --git a/convert-generic.py b/convert-generic.py index 573d208b96478..344a31dcf695b 100755 --- a/convert-generic.py +++ b/convert-generic.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# HF stablelm --> gguf conversion from __future__ import annotations @@ -41,37 +40,30 @@ hparams = model.Model.load_hparams(dir_model) model_class = model.Model.from_model_architecture(hparams["architectures"][0]) -model_instance = model_class(dir_model, ftype) -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[model_instance.model_arch]) +model_instance = model_class(dir_model, ftype, fname_out) print("gguf: get model metadata") -model_instance.set_gguf_parameters(gguf_writer) +model_instance.set_gguf_parameters() # TOKENIZATION print("gguf: get tokenizer metadata") -gguf_writer.add_tokenizer_model("gpt2") print("gguf: get gpt2 tokenizer vocab") -tokens, toktypes = model.Model.load_vocab_gpt2(model_instance.dir_model, model_instance.hparams) -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) -special_vocab.add_to_gguf(gguf_writer) +model_instance.set_vocab() # write model print("gguf: write header") -gguf_writer.write_header_to_file() +model_instance.gguf_writer.write_header_to_file() print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() +model_instance.gguf_writer.write_kv_data_to_file() if not args.vocab_only: print("gguf: write tensors") - model_instance.write_tensors(gguf_writer) - gguf_writer.write_tensors_to_file() + model_instance.write_tensors() + model_instance.gguf_writer.write_tensors_to_file() -gguf_writer.close() +model_instance.gguf_writer.close() print(f"gguf: model successfully exported to '{fname_out}'") print("") diff --git a/model.py b/model.py index aaee9943065f6..8be445cb809c8 100644 --- a/model.py +++ b/model.py @@ -11,14 +11,16 @@ class Model: - def __init__(self, dir_model: Path, ftype: int): + def __init__(self, dir_model: Path, ftype: int, fname_out: Path): self.dir_model = dir_model self.ftype = ftype + self.fname_out = fname_out self.is_safetensors = self._is_model_safetensors() self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) self.model_arch = self._get_model_architecture() + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch]) def _is_model_safetensors(self) -> bool: return Model.count_model_parts(self.dir_model, ".safetensors") > 0 @@ -43,6 +45,41 @@ def _get_model_architecture(self): return gguf.MODEL_ARCH.MPT raise NotImplementedError(f'Architecture "{arch}" not supported!') + def set_vocab(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytearray] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) + special_vocab.add_to_gguf(self.gguf_writer) + def get_tensors(self): for part_name in self.part_names: print("gguf: loading model part '" + part_name + "'") @@ -57,20 +94,20 @@ def get_tensors(self): data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] yield name, data - def set_gguf_parameters(self, gguf_writer: gguf.GGUFWriter): - gguf_writer.add_name(self.dir_model.name) - gguf_writer.add_block_count(self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))) + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))) if "max_position_embeddings" in self.hparams: - gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) if "hidden_size" in self.hparams: - gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) if "intermediate_size" in self.hparams: - gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) if "num_attention_head" in self.hparams: - gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - gguf_writer.add_parallel_residual(self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) - def write_tensors(self, gguf_writer: gguf.GGUFWriter): + def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data in self.get_tensors(): @@ -109,7 +146,7 @@ def write_tensors(self, gguf_writer: gguf.GGUFWriter): print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(new_name, data) + self.gguf_writer.add_tensor(new_name, data) @staticmethod def count_model_parts(dir_model: Path, prefix: str) -> int: @@ -126,34 +163,6 @@ def load_hparams(dir_model): hparams = json.load(f) return hparams - @staticmethod - def load_vocab_gpt2(dir_model: Path, hparams): - tokens: list[bytearray] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size - - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - return tokens, toktypes @staticmethod def from_model_architecture(model_architecture): @@ -168,30 +177,30 @@ def from_model_architecture(model_architecture): return Model class StableLMModel(Model): - def set_gguf_parameters(self, gguf_writer): - super().set_gguf_parameters(gguf_writer) - gguf_writer.add_rope_dimension_count(int(self.hparams["rope_pct"]*(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]))) - gguf_writer.add_layer_norm_eps(1e-5) + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_dimension_count(int(self.hparams["rope_pct"]*(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]))) + self.gguf_writer.add_layer_norm_eps(1e-5) class GPTNeoXModel(Model): pass class BloomModel(Model): - def set_gguf_parameters(self, gguf_writer: gguf.GGUFWriter): - gguf_writer.add_name("Bloom") + def set_gguf_parameters(self): + self.gguf_writer.add_name("Bloom") n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - gguf_writer.add_embedding_length(n_embed) - gguf_writer.add_feed_forward_length(4 * n_embed) - gguf_writer.add_block_count(self.hparams["n_layer"]) - gguf_writer.add_head_count(n_head) - gguf_writer.add_head_count_kv(n_head) - gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - gguf_writer.add_file_type(self.ftype) - - def write_tensors(self, gguf_writer): + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(4 * n_embed) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): block_count = self.hparams["n_layer"] tensors = dict(self.get_tensors()) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) @@ -258,24 +267,24 @@ def write_tensors(self, gguf_writer): print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(new_name, data) + self.gguf_writer.add_tensor(new_name, data) if not has_lm_head and name == "word_embeddings.weight": - gguf_writer.add_tensor("output.weight", data) + self.gguf_writer.add_tensor("output.weight", data) print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa class MPTModel(Model): - def set_gguf_parameters(self, gguf_writer): + def set_gguf_parameters(self): block_count = self.hparams["n_layers"] - gguf_writer.add_name(self.dir_model.name) - gguf_writer.add_context_length(self.hparams["max_seq_len"]) - gguf_writer.add_embedding_length(self.hparams["d_model"]) - gguf_writer.add_block_count(block_count) - gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) - gguf_writer.add_head_count(self.hparams["n_heads"]) + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): - gguf_writer.add_head_count_kv(kv_n_heads) - gguf_writer.add_layer_norm_eps(1e-05) + self.gguf_writer.add_head_count_kv(kv_n_heads) + self.gguf_writer.add_layer_norm_eps(1e-05) if self.hparams["attn_config"]["clip_qkv"] is not None: - gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) From 8618b4e74c7919ef704ec774eb87c780988327ee Mon Sep 17 00:00:00 2001 From: Galunid Date: Sun, 29 Oct 2023 01:38:35 +0200 Subject: [PATCH 04/42] Add [UNTESTED] Baichuan support --- model.py | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/model.py b/model.py index 8be445cb809c8..9291666c5f92d 100644 --- a/model.py +++ b/model.py @@ -8,7 +8,9 @@ import numpy as np from pathlib import Path +from typing import TypeAlias, Any +NDArray: TypeAlias = 'np.ndarray[Any, Any]' class Model: def __init__(self, dir_model: Path, ftype: int, fname_out: Path): @@ -43,6 +45,8 @@ def _get_model_architecture(self): return gguf.MODEL_ARCH.BLOOM if arch == "MPTForCausalLM": return gguf.MODEL_ARCH.MPT + if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): + return gguf.MODEL_ARCH.BAICHUAN raise NotImplementedError(f'Architecture "{arch}" not supported!') def set_vocab(self): @@ -174,6 +178,8 @@ def from_model_architecture(model_architecture): return BloomModel if model_architecture == "MPTForCausalLM": return MPTModel + if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): + return BaichuanModel return Model class StableLMModel(Model): @@ -288,3 +294,180 @@ def set_gguf_parameters(self): if self.hparams["attn_config"]["clip_qkv"] is not None: self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + +class BaichuanModel(Model): + def set_vocab(self): + from sentencepiece import SentencePieceProcessor # type: ignore[import] + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + tokenizer_model_file = self.dir_model / 'tokenizer.model' + if not tokenizer_model_file.is_file(): + print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) + sys.exit(1) + + # vocab type sentencepiece + print("gguf: get sentencepiece tokenizer vocab, scores and token types") + + tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) + vocab_size = self.hparams.get('vocab_size') + if vocab_size is None: + vocab_size = tokenizer.vocab_size() + + for i in range(vocab_size): + text: bytes + score: float + + piece = tokenizer.id_to_piece(i) + text = piece.encode("utf-8") + score = tokenizer.get_score(i) + + toktype = 1 # defualt to normal token type + if tokenizer.is_unknown(i): + toktype = 2 + if tokenizer.is_control(i): + toktype = 3 + + # toktype = 4 is user-defined = tokens from added_tokens.json + + if tokenizer.is_unused(i): + toktype = 5 + if tokenizer.is_byte(i): + toktype = 6 + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + addtokens_json = json.load(f) + + print("gguf: get added tokens") + + for key in addtokens_json: + tokens.append( key.encode("utf-8") ) + scores.append(-1000.0) + toktypes.append(4) # user-defined token type + + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab = len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + + if "num_key_value_heads" in self.hparams: + head_count_kv = self.hparams["num_key_value_heads"] + else: + head_count_kv = head_count + + if "_name_or_path" in self.hparams: + hf_repo = self.hparams["_name_or_path"] + else: + hf_repo = "" + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + print("gguf: can not find ctx length parameter.") + sys.exit() + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + + if "rope_scaling" in self.hparams and self.hparams["rope_scaling"] != None and "factor" in self.hparams["rope_scaling"]: + if "type" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"]["type"] == "linear": + self.gguf_writer.add_rope_scale_linear(self.hparams["rope_scaling"]["factor"]) + + + def _reverse_hf_permute(self, weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def _reverse_hf_permute_part(self, weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray: + r = weights.shape[0] // 3 + return (self._reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) + + def _reverse_hf_part(self, weights: NDArray, n_part: int) -> NDArray: + r = weights.shape[0] // 3 + return weights[r * n_part : r * n_part + r, ...] + + def write_tensors(self): + # Collect tensors from generator object + model_kv = dict(self.get_tensors()) + block_count = self.hparams["num_hidden_layers"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for i in range(block_count): + if f"model.layers.{i}.self_attn.W_pack.weight" in model_kv: + print(f"Unpacking and permuting layer {i}") + model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = self._reverse_hf_permute_part(model_kv[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count) + model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = self._reverse_hf_permute_part(model_kv[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv) + model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = self._reverse_hf_part(model_kv[f"model.layers.{i}.self_attn.W_pack.weight"],2) + del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] + + for name, data in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + self.gguf_writer.add_tensor(new_name, data) + From 989db341493144fd3c3a49b7fca786eff9a5d9ef Mon Sep 17 00:00:00 2001 From: Galunid Date: Sun, 29 Oct 2023 02:05:28 +0100 Subject: [PATCH 05/42] Missing variable --- model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/model.py b/model.py index 9291666c5f92d..66b350f654aa3 100644 --- a/model.py +++ b/model.py @@ -424,6 +424,7 @@ def write_tensors(self): # Collect tensors from generator object model_kv = dict(self.get_tensors()) block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for i in range(block_count): From 550b925af2281dfe414b75a42589bc6cbad7c725 Mon Sep 17 00:00:00 2001 From: Galunid Date: Sun, 29 Oct 2023 02:06:41 +0100 Subject: [PATCH 06/42] Missing variable --- model.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/model.py b/model.py index 66b350f654aa3..c35fe3bbf361c 100644 --- a/model.py +++ b/model.py @@ -427,6 +427,12 @@ def write_tensors(self): head_count = self.hparams["num_attention_heads"] tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + if "num_key_value_heads" in self.hparams: + head_count_kv = self.hparams["num_key_value_heads"] + else: + head_count_kv = head_count + + for i in range(block_count): if f"model.layers.{i}.self_attn.W_pack.weight" in model_kv: print(f"Unpacking and permuting layer {i}") From 443f7d586ee84cfb614b7f282836b86c5180dfa8 Mon Sep 17 00:00:00 2001 From: Galunid Date: Sun, 29 Oct 2023 20:00:54 +0100 Subject: [PATCH 07/42] Call add_tensor before write_* functions --- convert-generic.py | 15 ++++++++++----- model.py | 13 ++++++++++++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/convert-generic.py b/convert-generic.py index 344a31dcf695b..bd366d2507623 100755 --- a/convert-generic.py +++ b/convert-generic.py @@ -54,14 +54,19 @@ model_instance.set_vocab() # write model -print("gguf: write header") -model_instance.gguf_writer.write_header_to_file() -print("gguf: write metadata") -model_instance.gguf_writer.write_kv_data_to_file() if not args.vocab_only: - print("gguf: write tensors") model_instance.write_tensors() + print("gguf: write header") + model_instance.gguf_writer.write_header_to_file() + print("gguf: write metadata") + model_instance.gguf_writer.write_kv_data_to_file() + print("gguf: write tensors") model_instance.gguf_writer.write_tensors_to_file() +else: + print("gguf: write header") + model_instance.gguf_writer.write_header_to_file() + print("gguf: write metadata") + model_instance.gguf_writer.write_kv_data_to_file() model_instance.gguf_writer.close() diff --git a/model.py b/model.py index c35fe3bbf361c..a3c53fcc0e684 100644 --- a/model.py +++ b/model.py @@ -190,7 +190,18 @@ def set_gguf_parameters(self): class GPTNeoXModel(Model): - pass + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(int(self.hparams["rotary_pct"]*(self.hparams["hidden_size"]//self.hparams["num_attention_heads"]))) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) class BloomModel(Model): def set_gguf_parameters(self): From 08918b700ef90604a8ed0b83696e4007a1db9d72 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 01:52:55 +0100 Subject: [PATCH 08/42] MPT conversion fix --- model.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/model.py b/model.py index a3c53fcc0e684..c60ca10d06563 100644 --- a/model.py +++ b/model.py @@ -95,6 +95,7 @@ def get_tensors(self): with ctx as model_part: for name in model_part.keys(): + print("yield ", name) data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] yield name, data @@ -306,6 +307,54 @@ def set_gguf_parameters(self): self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data in self.get_tensors(): + # we don't need these + if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): + continue + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + self.gguf_writer.add_tensor(new_name, data) + + # note: MPT output is tied to (same as) wte in original model; + # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ + if new_name == "token_embd.weight": + self.gguf_writer.add_tensor("output.weight", data) + + + class BaichuanModel(Model): def set_vocab(self): from sentencepiece import SentencePieceProcessor # type: ignore[import] From 3bb9844de9367c97a2473710961384116c1e2c61 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 01:54:24 +0100 Subject: [PATCH 09/42] Get rid of dumb print --- model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/model.py b/model.py index c60ca10d06563..40ad6869a9e05 100644 --- a/model.py +++ b/model.py @@ -95,7 +95,6 @@ def get_tensors(self): with ctx as model_part: for name in model_part.keys(): - print("yield ", name) data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] yield name, data From 0afa75a9a219f8e37d1727b94864b75ca80bd401 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 02:13:45 +0100 Subject: [PATCH 10/42] Add Falcon support --- model.py | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/model.py b/model.py index 40ad6869a9e05..3b932481ce108 100644 --- a/model.py +++ b/model.py @@ -47,6 +47,9 @@ def _get_model_architecture(self): return gguf.MODEL_ARCH.MPT if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): return gguf.MODEL_ARCH.BAICHUAN + if arch == "FalconForCausalLM": + return gguf.MODEL_ARCH.FALCON + raise NotImplementedError(f'Architecture "{arch}" not supported!') def set_vocab(self): @@ -180,6 +183,8 @@ def from_model_architecture(model_architecture): return MPTModel if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): return BaichuanModel + if model_architecture == "FalconForCausalLM": + return FalconModel return Model class StableLMModel(Model): @@ -537,3 +542,96 @@ def write_tensors(self): print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) self.gguf_writer.add_tensor(new_name, data) + +class FalconModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + self.gguf_writer.add_name("Falcon") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + head_dim = self.hparams["hidden_size"] // n_head + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data in self.get_tensors(): + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + + if "query_key_value" in name: + qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data = torch.cat((q,k,v)).reshape_as(data) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + self.gguf_writer.add_tensor(new_name, data) + From 94ba1db24afa71b6cd4ff661742f86956f230144 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 03:12:25 +0100 Subject: [PATCH 11/42] Add Starcoder and Refact --- model.py | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/model.py b/model.py index 3b932481ce108..aee0ed5fe80f7 100644 --- a/model.py +++ b/model.py @@ -49,6 +49,10 @@ def _get_model_architecture(self): return gguf.MODEL_ARCH.BAICHUAN if arch == "FalconForCausalLM": return gguf.MODEL_ARCH.FALCON + if arch == "GPTBigCodeForCausalLM": + return gguf.MODEL_ARCH.STARCODER + if arch == "GPTRefactForCausalLM": + return gguf.MODEL_ARCH.REFACT raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -185,6 +189,10 @@ def from_model_architecture(model_architecture): return BaichuanModel if model_architecture == "FalconForCausalLM": return FalconModel + if model_architecture == "GPTBigCodeForCausalLM": + return StarCoderModel + if model_architecture == "GPTRefactForCausalLM": + return RefactModel return Model class StableLMModel(Model): @@ -635,3 +643,109 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) +class StarCoderModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("StarCoder") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +class RefactModel(Model): + def set_gguf_parameters(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("Refact") +# refact uses Alibi. So this is from config.json which might be used by training. + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + + self.gguf_writer.add_feed_forward_length(ff_dim) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + n_head = self.hparams["n_head"] + n_head_kv = 1 + head_dim = self.hparams["n_embd"] // n_head + + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + block_count = self.hparams["n_layer"] + tensors = dict(self.get_tensors()) + for i in range(block_count): + if f"transformer.h.{i}.attn.kv.weight" in tensors: + data = tensors[f"transformer.h.{i}.attn.kv.weight"] + tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = data[ + : n_head_kv * head_dim + ] + tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = data[ + n_head_kv * head_dim : + ] + del tensors[f"transformer.h.{i}.attn.kv.weight"] + if f"transformer.h.{i}.attn.q.weight" in tensors: + tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = tensors[ + f"transformer.h.{i}.attn.q.weight" + ] + del tensors[f"transformer.h.{i}.attn.q.weight"] + if f"transformer.h.{i}.mlp.gate_up_proj.weight" in tensors: + data = tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] + tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim] + tensors[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:] + del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] + + for name, data in tensors.items(): + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + self.gguf_writer.add_tensor(new_name, data) + From 6f6856c6ea0b5e39db296bc236884a6e3af23883 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 03:27:04 +0100 Subject: [PATCH 12/42] [Untested] Initial Persimmon support --- model.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/model.py b/model.py index aee0ed5fe80f7..a15682a349876 100644 --- a/model.py +++ b/model.py @@ -53,6 +53,7 @@ def _get_model_architecture(self): return gguf.MODEL_ARCH.STARCODER if arch == "GPTRefactForCausalLM": return gguf.MODEL_ARCH.REFACT + if arch == "" raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -749,3 +750,85 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) +class PersimmonModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["num_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = head_count + ctx_length = self.hparams["seq_length"] + hidden_size = self.hparams["hidden_size"] + + self.gguf_writer.add_name('persimmon-8b-chat') + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["ffn_hidden_size"]) + self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layernorm_epsilon"]) + + def set_vocab(self): + tokens, scores, toktypes = self._get_sentencepiece_tokenizer_info() + self.gguf_writer.add_tokenizer_model('llama') + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_bos_token_id(71013) + # self.gguf_writer.add_eos_token_id(71013) + + def write_tensors(self): + block_count = self.hparams["num_layers"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + print(tensor_map) + for name, data in self.get_tensors(): + if name.endswith(".self_attention.rotary_emb.inv_freq"): + continue + old_dtype = data.dtype + # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) + data = data.to(torch.float32).squeeze().numpy() + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + n_dims = len(data.shape) + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + self.gguf_writer.add_tensor(new_name, data) + + + def _get_sentencepiece_tokenizer_info(self): + from sentencepiece import SentencePieceProcessor + tokenizer_path = self.dir_model / 'tokenizer.model' + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + + print('gguf: getting sentencepiece tokenizer from', tokenizer_path) + print('gguf: adding tokens') + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + for i in range(tokenizer.vocab_size()): + text: bytes + score: float + + piece = tokenizer.id_to_piece(i) + text = piece.encode("utf-8") + score = tokenizer.get_score(i) + + toktype = 1 + if tokenizer.is_unknown(i): + toktype = 2 + if tokenizer.is_control(i): + toktype = 3 + if tokenizer.is_unused(i): + toktype = 5 + if tokenizer.is_byte(i): + toktype = 6 + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + pass + return tokens, scores, toktypes + From b9c664ab2f665a70f53f29cc82753a37418de76d Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 03:42:55 +0100 Subject: [PATCH 13/42] Woops --- model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/model.py b/model.py index a15682a349876..814623f23deb6 100644 --- a/model.py +++ b/model.py @@ -53,7 +53,8 @@ def _get_model_architecture(self): return gguf.MODEL_ARCH.STARCODER if arch == "GPTRefactForCausalLM": return gguf.MODEL_ARCH.REFACT - if arch == "" + if arch == "PersimmonForCausalLM": + return gguf.MODEL_ARCH.PERSIMMON raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -194,6 +195,8 @@ def from_model_architecture(model_architecture): return StarCoderModel if model_architecture == "GPTRefactForCausalLM": return RefactModel + if model_architecture == "PersimmonForCausalLM": + return PersimmonModel return Model class StableLMModel(Model): From 0743f7a9006e72696ddc82472b8a20a575e82c15 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 03:52:52 +0100 Subject: [PATCH 14/42] Fix variable --- model.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/model.py b/model.py index 814623f23deb6..fb6aa8ebe708d 100644 --- a/model.py +++ b/model.py @@ -693,10 +693,10 @@ def write_tensors(self): n_head = self.hparams["n_head"] n_head_kv = 1 head_dim = self.hparams["n_embd"] // n_head + block_count = self.hparams["n_layer"] tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - block_count = self.hparams["n_layer"] tensors = dict(self.get_tensors()) for i in range(block_count): if f"transformer.h.{i}.attn.kv.weight" in tensors: @@ -755,14 +755,12 @@ def write_tensors(self): class PersimmonModel(Model): def set_gguf_parameters(self): - block_count = self.hparams["num_layers"] + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) head_count = self.hparams["num_attention_heads"] head_count_kv = head_count - ctx_length = self.hparams["seq_length"] hidden_size = self.hparams["hidden_size"] self.gguf_writer.add_name('persimmon-8b-chat') - self.gguf_writer.add_context_length(ctx_length) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["ffn_hidden_size"]) @@ -782,7 +780,7 @@ def set_vocab(self): # self.gguf_writer.add_eos_token_id(71013) def write_tensors(self): - block_count = self.hparams["num_layers"] + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) print(tensor_map) for name, data in self.get_tensors(): From dc3115f2a39cbb3bf465878f02a13b16644bc7ce Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 04:20:51 +0100 Subject: [PATCH 15/42] Add another alias to n_layers --- model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model.py b/model.py index fb6aa8ebe708d..1b49ac7eb1402 100644 --- a/model.py +++ b/model.py @@ -109,7 +109,7 @@ def get_tensors(self): def set_gguf_parameters(self): self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))) + self.gguf_writer.add_block_count(self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))) if "max_position_embeddings" in self.hparams: self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) if "hidden_size" in self.hparams: @@ -121,7 +121,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_parallel_residual(self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data in self.get_tensors(): # we don't need these From b2ba44eab247a3c079d3fd581542324ca326022b Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 15:38:24 +0100 Subject: [PATCH 16/42] Flake8 fixes --- convert-generic.py | 14 +++------ model.py | 77 +++++++++++++++++++++++++--------------------- 2 files changed, 46 insertions(+), 45 deletions(-) diff --git a/convert-generic.py b/convert-generic.py index bd366d2507623..0e69d35c90279 100755 --- a/convert-generic.py +++ b/convert-generic.py @@ -1,24 +1,18 @@ #!/usr/bin/env python3 from __future__ import annotations +from util import parse_args -import os import sys -from pathlib import Path - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) - -import gguf import model -import util -args = util.parse_args() + +args = parse_args() dir_model = args.model ftype = args.ftype if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) + print(f'Error: {args.model} is not a directory', file=sys.stderr) sys.exit(1) # possible tensor data types diff --git a/model.py b/model.py index 1b49ac7eb1402..7b31eeaa81612 100644 --- a/model.py +++ b/model.py @@ -12,6 +12,7 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]' + class Model: def __init__(self, dir_model: Path, ftype: int, fname_out: Path): self.dir_model = dir_model @@ -90,7 +91,7 @@ def set_vocab(self): self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) + special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) def get_tensors(self): @@ -109,7 +110,8 @@ def get_tensors(self): def set_gguf_parameters(self): self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))) + self.gguf_writer.add_block_count(self.hparams.get( + "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))) if "max_position_embeddings" in self.hparams: self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) if "hidden_size" in self.hparams: @@ -118,7 +120,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) if "num_attention_head" in self.hparams: self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) + self.gguf_writer.add_parallel_residual( + self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) @@ -137,7 +140,7 @@ def write_tensors(self): data = data.squeeze().numpy() # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: print("Can not map tensor '" + name + "'") sys.exit() @@ -176,7 +179,6 @@ def load_hparams(dir_model): hparams = json.load(f) return hparams - @staticmethod def from_model_architecture(model_architecture): if model_architecture == "StableLMEpochForCausalLM": @@ -199,10 +201,12 @@ def from_model_architecture(model_architecture): return PersimmonModel return Model + class StableLMModel(Model): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_rope_dimension_count(int(self.hparams["rope_pct"]*(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]))) + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rope_pct"]*(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]))) self.gguf_writer.add_layer_norm_eps(1e-5) @@ -215,11 +219,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(int(self.hparams["rotary_pct"]*(self.hparams["hidden_size"]//self.hparams["num_attention_heads"]))) + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rotary_pct"]*(self.hparams["hidden_size"]//self.hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) + self.gguf_writer.add_parallel_residual( + self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + class BloomModel(Model): def set_gguf_parameters(self): self.gguf_writer.add_name("Bloom") @@ -307,6 +314,7 @@ def write_tensors(self): self.gguf_writer.add_tensor("output.weight", data) print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa + class MPTModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layers"] @@ -340,7 +348,7 @@ def write_tensors(self): data = data.squeeze().numpy() # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: print("Can not map tensor '" + name + "'") sys.exit() @@ -370,7 +378,6 @@ def write_tensors(self): self.gguf_writer.add_tensor("output.weight", data) - class BaichuanModel(Model): def set_vocab(self): from sentencepiece import SentencePieceProcessor # type: ignore[import] @@ -380,7 +387,7 @@ def set_vocab(self): tokenizer_model_file = self.dir_model / 'tokenizer.model' if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) + print(f'Error: Missing {tokenizer_model_file}', file=sys.stderr) sys.exit(1) # vocab type sentencepiece @@ -424,17 +431,16 @@ def set_vocab(self): print("gguf: get added tokens") for key in addtokens_json: - tokens.append( key.encode("utf-8") ) + tokens.append(key.encode("utf-8")) scores.append(-1000.0) - toktypes.append(4) # user-defined token type - + toktypes.append(4) # user-defined token type self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab = len(tokens)) + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -474,12 +480,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - if "rope_scaling" in self.hparams and self.hparams["rope_scaling"] != None and "factor" in self.hparams["rope_scaling"]: + if "rope_scaling" in self.hparams and self.hparams["rope_scaling"] is not None and "factor" in self.hparams["rope_scaling"]: if "type" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"]["type"] == "linear": self.gguf_writer.add_rope_scale_linear(self.hparams["rope_scaling"]["factor"]) - def _reverse_hf_permute(self, weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head @@ -488,13 +493,13 @@ def _reverse_hf_permute(self, weights: NDArray, n_head: int, n_kv_head: int | No .swapaxes(1, 2) .reshape(weights.shape)) - def _reverse_hf_permute_part(self, weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray: + def _reverse_hf_permute_part(self, weights: NDArray, n_part: int, n_head: int, n_head_kv: int | None = None) -> NDArray: r = weights.shape[0] // 3 - return (self._reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) + return (self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)) def _reverse_hf_part(self, weights: NDArray, n_part: int) -> NDArray: r = weights.shape[0] // 3 - return weights[r * n_part : r * n_part + r, ...] + return weights[r * n_part:r * n_part + r, ...] def write_tensors(self): # Collect tensors from generator object @@ -508,13 +513,15 @@ def write_tensors(self): else: head_count_kv = head_count - for i in range(block_count): if f"model.layers.{i}.self_attn.W_pack.weight" in model_kv: print(f"Unpacking and permuting layer {i}") - model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = self._reverse_hf_permute_part(model_kv[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count) - model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = self._reverse_hf_permute_part(model_kv[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv) - model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = self._reverse_hf_part(model_kv[f"model.layers.{i}.self_attn.W_pack.weight"],2) + model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = self._reverse_hf_permute_part( + model_kv[f"model.layers.{i}.self_attn.W_pack.weight"], 0, head_count, head_count) + model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = self._reverse_hf_permute_part( + model_kv[f"model.layers.{i}.self_attn.W_pack.weight"], 1, head_count, head_count_kv) + model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = self._reverse_hf_part( + model_kv[f"model.layers.{i}.self_attn.W_pack.weight"], 2) del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] for name, data in model_kv.items(): @@ -531,7 +538,7 @@ def write_tensors(self): data = data.squeeze().numpy() # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: print("Can not map tensor '" + name + "'") sys.exit() @@ -551,7 +558,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) self.gguf_writer.add_tensor(new_name, data) @@ -570,8 +577,8 @@ def set_gguf_parameters(self): n_head_kv = self.hparams.get("n_head_kv", 1) # old name self.gguf_writer.add_name("Falcon") - self.gguf_writer.add_context_length(2048) # not in config.json - self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -615,15 +622,15 @@ def write_tensors(self): if "query_key_value" in name: qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head) + q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data = torch.cat((q,k,v)).reshape_as(data) + data = torch.cat((q, k, v)).reshape_as(data) data = data.squeeze().numpy() # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: print("Can not map tensor '" + name + "'") sys.exit() @@ -647,6 +654,7 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) + class StarCoderModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layer"] @@ -705,7 +713,7 @@ def write_tensors(self): : n_head_kv * head_dim ] tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = data[ - n_head_kv * head_dim : + n_head_kv * head_dim: ] del tensors[f"transformer.h.{i}.attn.kv.weight"] if f"transformer.h.{i}.attn.q.weight" in tensors: @@ -753,6 +761,7 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) + class PersimmonModel(Model): def set_gguf_parameters(self): block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) @@ -789,7 +798,7 @@ def write_tensors(self): old_dtype = data.dtype # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) data = data.to(torch.float32).squeeze().numpy() - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: print("Can not map tensor '" + name + "'") sys.exit() @@ -797,7 +806,6 @@ def write_tensors(self): print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) self.gguf_writer.add_tensor(new_name, data) - def _get_sentencepiece_tokenizer_info(self): from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / 'tokenizer.model' @@ -832,4 +840,3 @@ def _get_sentencepiece_tokenizer_info(self): toktypes.append(toktype) pass return tokens, scores, toktypes - From c94df0973281fd66049cb0f16e62da2fa7d9f68b Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 16:11:08 +0100 Subject: [PATCH 17/42] Rework tokenizer handling --- model.py | 176 ++++++++++++++++++++++--------------------------------- 1 file changed, 71 insertions(+), 105 deletions(-) diff --git a/model.py b/model.py index 7b31eeaa81612..5d431d6bc605d 100644 --- a/model.py +++ b/model.py @@ -7,12 +7,22 @@ import contextlib import numpy as np +from enum import Enum from pathlib import Path from typing import TypeAlias, Any NDArray: TypeAlias = 'np.ndarray[Any, Any]' +class SentencePieceTokenTypes(Enum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + class Model: def __init__(self, dir_model: Path, ftype: int, fname_out: Path): self.dir_model = dir_model @@ -59,7 +69,7 @@ def _get_model_architecture(self): raise NotImplementedError(f'Architecture "{arch}" not supported!') - def set_vocab(self): + def _set_vocab_gpt2(self): dir_model = self.dir_model hparams = self.hparams tokens: list[bytearray] = [] @@ -94,6 +104,62 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_sentencepiece(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_vocab(self): + self._set_vocab_gpt2() + def get_tensors(self): for part_name in self.part_names: print("gguf: loading model part '" + part_name + "'") @@ -380,68 +446,7 @@ def write_tensors(self): class BaichuanModel(Model): def set_vocab(self): - from sentencepiece import SentencePieceProcessor # type: ignore[import] - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - tokenizer_model_file = self.dir_model / 'tokenizer.model' - if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file=sys.stderr) - sys.exit(1) - - # vocab type sentencepiece - print("gguf: get sentencepiece tokenizer vocab, scores and token types") - - tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) - vocab_size = self.hparams.get('vocab_size') - if vocab_size is None: - vocab_size = tokenizer.vocab_size() - - for i in range(vocab_size): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) + self._set_vocab_sentencepiece() def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] @@ -780,18 +785,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layernorm_epsilon"]) def set_vocab(self): - tokens, scores, toktypes = self._get_sentencepiece_tokenizer_info() - self.gguf_writer.add_tokenizer_model('llama') - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_bos_token_id(71013) + self._set_vocab_sentencepiece() + # self.gguf_writer.add_bos_token_id(71013) # self.gguf_writer.add_eos_token_id(71013) def write_tensors(self): block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - print(tensor_map) + for name, data in self.get_tensors(): if name.endswith(".self_attention.rotary_emb.inv_freq"): continue @@ -805,38 +806,3 @@ def write_tensors(self): n_dims = len(data.shape) print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) self.gguf_writer.add_tensor(new_name, data) - - def _get_sentencepiece_tokenizer_info(self): - from sentencepiece import SentencePieceProcessor - tokenizer_path = self.dir_model / 'tokenizer.model' - tokenizer = SentencePieceProcessor(str(tokenizer_path)) - - print('gguf: getting sentencepiece tokenizer from', tokenizer_path) - print('gguf: adding tokens') - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - for i in range(tokenizer.vocab_size()): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - pass - return tokens, scores, toktypes From 235acc18cdd7c3ee7fd1d52e10f76496b1c04b9a Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 16:23:53 +0100 Subject: [PATCH 18/42] Small refactor --- convert-generic.py | 32 ++++++++------------------------ model.py | 18 +++++++++++++++--- 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/convert-generic.py b/convert-generic.py index 0e69d35c90279..141d1fe72741e 100755 --- a/convert-generic.py +++ b/convert-generic.py @@ -28,41 +28,25 @@ # output in the same directory as the model by default fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model " + dir_model.name) +print(f"Loading model: {dir_model.name}") hparams = model.Model.load_hparams(dir_model) model_class = model.Model.from_model_architecture(hparams["architectures"][0]) model_instance = model_class(dir_model, ftype, fname_out) -print("gguf: get model metadata") - +print("Set model parameters") model_instance.set_gguf_parameters() -# TOKENIZATION -print("gguf: get tokenizer metadata") - -print("gguf: get gpt2 tokenizer vocab") - +print("Set model tokenizer") model_instance.set_vocab() -# write model if not args.vocab_only: - model_instance.write_tensors() - print("gguf: write header") - model_instance.gguf_writer.write_header_to_file() - print("gguf: write metadata") - model_instance.gguf_writer.write_kv_data_to_file() - print("gguf: write tensors") - model_instance.gguf_writer.write_tensors_to_file() + print(f"Exporting model to '{fname_out}'") + model_instance.write() else: - print("gguf: write header") - model_instance.gguf_writer.write_header_to_file() - print("gguf: write metadata") - model_instance.gguf_writer.write_kv_data_to_file() - -model_instance.gguf_writer.close() + print(f"Exporting model vocab to '{fname_out}'") + model_instance.write_vocab() -print(f"gguf: model successfully exported to '{fname_out}'") +print(f"Model successfully exported to '{fname_out}'") print("") diff --git a/model.py b/model.py index 5d431d6bc605d..df16de531f076 100644 --- a/model.py +++ b/model.py @@ -9,7 +9,7 @@ from enum import Enum from pathlib import Path -from typing import TypeAlias, Any +from typing import TypeAlias, Any, Generator NDArray: TypeAlias = 'np.ndarray[Any, Any]' @@ -48,7 +48,7 @@ def _get_part_names(self): return ("pytorch_model.bin",) return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) - def _get_model_architecture(self): + def _get_model_architecture(self) -> gguf.MODEL_ARCH: arch = self.hparams["architectures"][0] if arch == "GPTNeoXForCausalLM": return gguf.MODEL_ARCH.GPTNEOX @@ -160,7 +160,7 @@ def _set_vocab_sentencepiece(self): def set_vocab(self): self._set_vocab_gpt2() - def get_tensors(self): + def get_tensors(self) -> Generator[str, Any]: for part_name in self.part_names: print("gguf: loading model part '" + part_name + "'") if self.is_safetensors: @@ -230,6 +230,18 @@ def write_tensors(self): self.gguf_writer.add_tensor(new_name, data) + def write(self): + self.write_tensors() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file() + self.gguf_writer.close() + + def write_vocab(self): + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + @staticmethod def count_model_parts(dir_model: Path, prefix: str) -> int: num_parts = 0 From f4b9a7ea02a6a1f270a06fc4670e120b010966c4 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 16:27:06 +0100 Subject: [PATCH 19/42] Remove 'old' conversion scripts --- convert-baichuan-hf-to-gguf.py | 316 -------------------------------- convert-bloom-hf-to-gguf.py | 247 ------------------------- convert-falcon-hf-to-gguf.py | 253 ------------------------- convert-gptneox-hf-to-gguf.py | 221 ---------------------- convert-mpt-hf-to-gguf.py | 227 ----------------------- convert-refact-hf-to-gguf.py | 272 --------------------------- convert-starcoder-hf-to-gguf.py | 210 --------------------- 7 files changed, 1746 deletions(-) delete mode 100755 convert-baichuan-hf-to-gguf.py delete mode 100755 convert-bloom-hf-to-gguf.py delete mode 100755 convert-falcon-hf-to-gguf.py delete mode 100755 convert-gptneox-hf-to-gguf.py delete mode 100755 convert-mpt-hf-to-gguf.py delete mode 100755 convert-refact-hf-to-gguf.py delete mode 100755 convert-starcoder-hf-to-gguf.py diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py deleted file mode 100755 index 5ee99be73134e..0000000000000 --- a/convert-baichuan-hf-to-gguf.py +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env python3 -# HF baichuan --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any -import itertools -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - -# reverse HF permute back to original pth layout - - -def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - -def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray: - r = weights.shape[0] // 3 - return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) - -def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray: - r = weights.shape[0] // 3 - return weights[r * n_part : r * n_part + r, ...] - -def count_model_parts(dir_model: str) -> int: - num_parts = 0 - - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - - return num_parts - - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -endianess = gguf.GGUFEndian.LITTLE -if args.bigendian: - endianess = gguf.GGUFEndian.BIG -endianess_str = "Big Endian" if args.bigendian else "Little Endian" -print(f"gguf: Conversion Endianess {endianess}") -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) -print("hello print: ",hparams["architectures"][0]) -if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) -print(f"num_parts:{num_parts}\n") -ARCH=gguf.MODEL_ARCH.BAICHUAN -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] -else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -elif "model_max_length" in hparams: - ctx_length = hparams["model_max_length"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab, scores and token types") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) -vocab_size = hparams.get('vocab_size') -if vocab_size is None: - vocab_size = tokenizer.vocab_size() - -for i in range(vocab_size): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - tmp=model_part - for i in range(block_count): - if f"model.layers.{i}.self_attn.W_pack.weight" in model_part: - print(f"Unpacking and permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv) - tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2) - del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] - - for name in model_part.keys(): - data = model_part[name] - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-bloom-hf-to-gguf.py b/convert-bloom-hf-to-gguf.py deleted file mode 100755 index 6e866d9434818..0000000000000 --- a/convert-bloom-hf-to-gguf.py +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env python3 -# HF bloom --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import re -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -# Supported Models: -# https://huggingface.co/bigscience/bloom-1b7 -# https://huggingface.co/bigscience/bloom-3b -# https://huggingface.co/bigscience/bloom-7b1 -# https://huggingface.co/Langboat/bloom-1b4-zh -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "BloomForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.BLOOM -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["n_layer"] - -gguf_writer.add_name("Bloom") -n_embed = hparams.get("hidden_size", hparams.get("n_embed")) -n_head = hparams.get("n_head", hparams.get("num_attention_heads")) -gguf_writer.add_context_length(hparams.get("seq_length", n_embed)) -gguf_writer.add_embedding_length(n_embed) -gguf_writer.add_feed_forward_length(4 * n_embed) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(n_head) -gguf_writer.add_head_count_kv(n_head) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -added_vocab = tokenizer.get_added_vocab() -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH, block_count) - -# params for qkv transform -n_head_kv = hparams.get("n_head_kv", n_head) -head_dim = n_embed // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(dir_model / part_name, map_location="cpu") - - has_lm_head = True - if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys(): - has_lm_head = False - - for original_name in model_part.keys(): - data = model_part[original_name] - name = re.sub(r'transformer\.', '', original_name) - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) - data = np.concatenate( - (qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed))), - axis=0 - ) - print("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) - data = np.concatenate( - (qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,))), - axis=0 - ) - print("re-format attention.linear_qkv.bias") - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - if not has_lm_head and name == "word_embeddings.weight": - gguf_writer.add_tensor("output.weight", data) - print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py deleted file mode 100755 index 8e8f3c3f8f1e0..0000000000000 --- a/convert-falcon-hf-to-gguf.py +++ /dev/null @@ -1,253 +0,0 @@ -#!/usr/bin/env python3 -# HF falcon--> gguf conversion - -from __future__ import annotations - -import argparse -import contextlib -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path, prefix: str) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith(prefix): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"): - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model, "model-00") -if num_parts: - is_safetensors = True - from safetensors import safe_open -else: - is_safetensors = False - num_parts = count_model_parts(dir_model, "pytorch_model-") - -ARCH=gguf.MODEL_ARCH.FALCON -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams.get("num_hidden_layers") -if block_count is None: - block_count = hparams["n_layer"] # old name - -n_head = hparams.get("num_attention_heads") -if n_head is None: - n_head = hparams["n_head"] # old name - -n_head_kv = hparams.get("num_kv_heads") -if n_head_kv is None: - n_head_kv = hparams.get("n_head_kv", 1) # old name - -gguf_writer.add_name("Falcon") -gguf_writer.add_context_length(2048) # not in config.json -gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(n_head) -gguf_writer.add_head_count_kv(n_head_kv) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - tokens.append(reverse_vocab[i]) - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -head_dim = hparams["hidden_size"] // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -elif is_safetensors: - part_names = ( - f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1) - ) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - if is_safetensors: - ctx = safe_open(dir_model / part_name, framework="pt", device="cpu") - else: - ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu")) - - with ctx as model_part: - for name in model_part.keys(): - data = model_part.get_tensor(name) if is_safetensors else model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data = torch.cat((q,k,v)).reshape_as(data) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py deleted file mode 100755 index 02d1fdf164eea..0000000000000 --- a/convert-gptneox-hf-to-gguf.py +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env python3 -# HF gptneox--> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "GPTNeoXForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.GPTNEOX -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_context_length(hparams["max_position_embeddings"]) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))) -gguf_writer.add_head_count(hparams["num_attention_heads"]) -gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"]) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -added_vocab = tokenizer.get_added_vocab() -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - # we don't need these - if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py deleted file mode 100755 index 70d154b3f5c01..0000000000000 --- a/convert-mpt-hf-to-gguf.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/env python3 -# HF mpt--> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "MPTForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.MPT -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["n_layers"] - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_context_length(hparams["max_seq_len"]) -gguf_writer.add_embedding_length(hparams["d_model"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(4 * hparams["d_model"]) -gguf_writer.add_head_count(hparams["n_heads"]) -if kv_n_heads := hparams["attn_config"].get("kv_n_heads"): - gguf_writer.add_head_count_kv(kv_n_heads) -gguf_writer.add_layer_norm_eps(1e-05) -if hparams["attn_config"]["clip_qkv"] is not None: - gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"]) -gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"]) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but -# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to -# accomodate some "reserved" tokens; this is causing problems down the line in -# llama.cpp, so we pad the vocab with dummy tokens: - -vocab_size = hparams["vocab_size"] - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -added_vocab = tokenizer.get_added_vocab() -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Cannot map tensor '" + name + "'") - continue # for the sake of compatibility with some old published models, don't quit - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - # note: MPT output is tied to (same as) wte in original model; - # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ - if new_name == "token_embd.weight": - gguf_writer.add_tensor("output.weight", data) - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-refact-hf-to-gguf.py b/convert-refact-hf-to-gguf.py deleted file mode 100755 index f0cfe84d81c8b..0000000000000 --- a/convert-refact-hf-to-gguf.py +++ /dev/null @@ -1,272 +0,0 @@ -#!/usr/bin/env python3 -# HF refact--> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import sys -from pathlib import Path - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if "NO_LOCAL_GGUF" not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf")) -import gguf - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert a Refact model to a GGML compatible file" - ) - parser.add_argument( - "--vocab-only", - action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", - type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", - type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", - type=int, - choices=[0, 1], - default=1, - nargs="?", - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f"Error: {args.model} is not a directory", file=sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf" - -print("gguf: loading model " + dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "GPTRefactForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH = gguf.MODEL_ARCH.REFACT -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -# Get refact feed forward dimension -hidden_dim = hparams["n_embd"] -inner_dim = 4 * hidden_dim -hidden_dim = int(2 * inner_dim / 3) -multiple_of = 256 -ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - -block_count = hparams["n_layer"] - -gguf_writer.add_name("Refact") -# refact uses Alibi. So this is from config.json which might be used by training. -gguf_writer.add_context_length(hparams["n_positions"]) -gguf_writer.add_embedding_length(hparams["n_embd"]) - -gguf_writer.add_feed_forward_length(ff_dim) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(hparams["n_head"]) -gguf_writer.add_head_count_kv(1) -gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -added_vocab = tokenizer.get_added_vocab() -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH, block_count) - -# params for qkv transform -n_head = hparams["n_head"] -n_head_kv = 1 - -head_dim = hparams["n_embd"] // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(dir_model / part_name, map_location="cpu") - - for i in range(block_count): - if f"transformer.h.{i}.attn.kv.weight" in model_part: - data = model_part[f"transformer.h.{i}.attn.kv.weight"] - model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[ - : n_head_kv * head_dim - ] - model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[ - n_head_kv * head_dim : - ] - del model_part[f"transformer.h.{i}.attn.kv.weight"] - if f"transformer.h.{i}.attn.q.weight" in model_part: - model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[ - f"transformer.h.{i}.attn.q.weight" - ] - del model_part[f"transformer.h.{i}.attn.q.weight"] - if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part: - data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim] - model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:] - del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - - for name in model_part.keys(): - data = model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): - data = data.astype(np.float16) - - print( - new_name - + ", n_dims = " - + str(n_dims) - + ", " - + str(old_dtype) - + " --> " - + str(data.dtype) - ) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py deleted file mode 100755 index a9bfed85e31ba..0000000000000 --- a/convert-starcoder-hf-to-gguf.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python3 -# HF starcoder --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "GPTBigCodeForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.STARCODER -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["n_layer"] - -gguf_writer.add_name("StarCoder") -gguf_writer.add_context_length(hparams["n_positions"]) -gguf_writer.add_embedding_length(hparams["n_embd"]) -gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(hparams["n_head"]) -gguf_writer.add_head_count_kv(1) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -added_vocab = tokenizer.get_added_vocab() -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# params for qkv transform -n_head = hparams["n_head"] -n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1 - -head_dim = hparams["n_embd"] // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(dir_model / part_name, map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") From 3ec89dcc6911b21bf71e0a6667350c79dc680794 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 31 Oct 2023 22:23:26 +0100 Subject: [PATCH 20/42] Use 'IntEnum' instead of 'Enum' --- model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/model.py b/model.py index df16de531f076..97e3b6638e212 100644 --- a/model.py +++ b/model.py @@ -7,14 +7,14 @@ import contextlib import numpy as np -from enum import Enum +from enum import IntEnum from pathlib import Path from typing import TypeAlias, Any, Generator NDArray: TypeAlias = 'np.ndarray[Any, Any]' -class SentencePieceTokenTypes(Enum): +class SentencePieceTokenTypes(IntEnum): NORMAL = 1 UNKNOWN = 2 CONTROL = 3 From 4fdd7cdf2b214428c5443e0474ade0e5311db8b5 Mon Sep 17 00:00:00 2001 From: Galunid Date: Wed, 1 Nov 2023 02:32:49 +0100 Subject: [PATCH 21/42] Review fixes, persimmon fixes --- model.py | 16 +++++++++++----- gguf-py/gguf/util.py => util.py | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) rename gguf-py/gguf/util.py => util.py (85%) diff --git a/model.py b/model.py index 97e3b6638e212..9789577b5bfae 100644 --- a/model.py +++ b/model.py @@ -2,7 +2,6 @@ import re import sys import json -import gguf import torch import contextlib import numpy as np @@ -11,6 +10,12 @@ from pathlib import Path from typing import TypeAlias, Any, Generator +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + + NDArray: TypeAlias = 'np.ndarray[Any, Any]' @@ -160,7 +165,7 @@ def _set_vocab_sentencepiece(self): def set_vocab(self): self._set_vocab_gpt2() - def get_tensors(self) -> Generator[str, Any]: + def get_tensors(self) -> Generator[str, Any, None]: for part_name in self.part_names: print("gguf: loading model part '" + part_name + "'") if self.is_safetensors: @@ -789,12 +794,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_name('persimmon-8b-chat') self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["ffn_hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layernorm_epsilon"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) def set_vocab(self): self._set_vocab_sentencepiece() diff --git a/gguf-py/gguf/util.py b/util.py similarity index 85% rename from gguf-py/gguf/util.py rename to util.py index fcb83b54933fd..d266b54b1b15e 100644 --- a/gguf-py/gguf/util.py +++ b/util.py @@ -3,7 +3,7 @@ from pathlib import Path def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a stablelm model to a GGML compatible file") + parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file") parser.add_argument( "--vocab-only", action="store_true", help="extract only the vocab", From 8f31dc54ec1db981d46cd5aac42f3e2ba478403b Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Wed, 1 Nov 2023 23:22:04 -0400 Subject: [PATCH 22/42] fix mypy errors --- convert.py | 4 +- model.py | 107 +++++++++++++++++++++++++++-------------------------- mypy.ini | 1 + 3 files changed, 57 insertions(+), 55 deletions(-) diff --git a/convert.py b/convert.py index 0680f71ea73e8..fef9d0ce8be31 100755 --- a/convert.py +++ b/convert.py @@ -26,7 +26,7 @@ from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar import numpy as np -from sentencepiece import SentencePieceProcessor # type: ignore[import] +from sentencepiece import SentencePieceProcessor # type: ignore[import-untyped] import os if 'NO_LOCAL_GGUF' not in os.environ: @@ -337,7 +337,7 @@ def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> No def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.bpe_tokenizer - from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import] + from transformers.models.gpt2 import tokenization_gpt2 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()} for i, _ in enumerate(tokenizer): diff --git a/model.py b/model.py index 9789577b5bfae..94f090b2d15e1 100644 --- a/model.py +++ b/model.py @@ -8,17 +8,16 @@ from enum import IntEnum from pathlib import Path -from typing import TypeAlias, Any, Generator +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, TypeAlias, cast + +if TYPE_CHECKING: + from torch import Tensor if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) import gguf - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - - class SentencePieceTokenTypes(IntEnum): NORMAL = 1 UNKNOWN = 2 @@ -80,7 +79,7 @@ def _set_vocab_gpt2(self): tokens: list[bytearray] = [] toktypes: list[int] = [] - from transformers import AutoTokenizer + from transformers import AutoTokenizer # type: ignore[attr-defined] tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size @@ -90,7 +89,8 @@ def _set_vocab_gpt2(self): for i in range(vocab_size): if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") + pad_token = f"[PAD{i}]".encode('utf-8') + tokens.append(bytearray(pad_token)) toktypes.append(gguf.TokenType.USER_DEFINED) elif reverse_vocab[i] in added_vocab: tokens.append(reverse_vocab[i]) @@ -165,12 +165,13 @@ def _set_vocab_sentencepiece(self): def set_vocab(self): self._set_vocab_gpt2() - def get_tensors(self) -> Generator[str, Any, None]: + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: for part_name in self.part_names: print("gguf: loading model part '" + part_name + "'") + ctx: ContextManager[Any] if self.is_safetensors: from safetensors import safe_open - ctx = safe_open(self.dir_model / part_name, framework="pt", device="cpu") + ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) else: ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu")) @@ -197,18 +198,18 @@ def set_gguf_parameters(self): def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data in self.get_tensors(): + for name, data_torch in self.get_tensors(): # we don't need these if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): continue - old_dtype = data.dtype + old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) + if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + data_torch = data_torch.to(torch.float32) - data = data.squeeze().numpy() + data = data_torch.squeeze().numpy() # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) @@ -332,19 +333,19 @@ def write_tensors(self): n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - for name, data in tensors.items(): + for name, data_torch in tensors.items(): if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): has_lm_head = False name = re.sub(r'transformer\.', '', name) - old_dtype = data.dtype + old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) + if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + data_torch = data_torch.to(torch.float32) - data = data.squeeze().numpy() + data = data_torch.squeeze().numpy() if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): # Map bloom-style qkv_linear to gpt-style qkv_linear @@ -417,18 +418,18 @@ def set_gguf_parameters(self): def write_tensors(self): block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data in self.get_tensors(): + for name, data_torch in self.get_tensors(): # we don't need these if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): continue - old_dtype = data.dtype + old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) + if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + data_torch = data_torch.to(torch.float32) - data = data.squeeze().numpy() + data = data_torch.squeeze().numpy() # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) @@ -507,7 +508,7 @@ def set_gguf_parameters(self): if self.hparams["rope_scaling"]["type"] == "linear": self.gguf_writer.add_rope_scale_linear(self.hparams["rope_scaling"]["factor"]) - def _reverse_hf_permute(self, weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head @@ -515,11 +516,11 @@ def _reverse_hf_permute(self, weights: NDArray, n_head: int, n_kv_head: int | No .swapaxes(1, 2) .reshape(weights.shape)) - def _reverse_hf_permute_part(self, weights: NDArray, n_part: int, n_head: int, n_head_kv: int | None = None) -> NDArray: + def _reverse_hf_permute_part(self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None) -> Tensor: r = weights.shape[0] // 3 return (self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)) - def _reverse_hf_part(self, weights: NDArray, n_part: int) -> NDArray: + def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: r = weights.shape[0] // 3 return weights[r * n_part:r * n_part + r, ...] @@ -546,18 +547,18 @@ def write_tensors(self): model_kv[f"model.layers.{i}.self_attn.W_pack.weight"], 2) del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] - for name, data in model_kv.items(): + for name, data_torch in model_kv.items(): # we don't need these if name.endswith(".rotary_emb.inv_freq"): continue - old_dtype = data.dtype + old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) + if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + data_torch = data_torch.to(torch.float32) - data = data.squeeze().numpy() + data = data_torch.squeeze().numpy() # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) @@ -625,12 +626,12 @@ def write_tensors(self): head_dim = self.hparams["hidden_size"] // n_head tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data in self.get_tensors(): - old_dtype = data.dtype + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) + if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + data_torch = data_torch.to(torch.float32) # QKV tensor transform # The original query_key_value tensor contains n_head_kv "kv groups", @@ -643,13 +644,13 @@ def write_tensors(self): # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py if "query_key_value" in name: - qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data = torch.cat((q, k, v)).reshape_as(data) + data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - data = data.squeeze().numpy() + data = data_torch.squeeze().numpy() # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) @@ -730,11 +731,11 @@ def write_tensors(self): tensors = dict(self.get_tensors()) for i in range(block_count): if f"transformer.h.{i}.attn.kv.weight" in tensors: - data = tensors[f"transformer.h.{i}.attn.kv.weight"] - tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = data[ + weight = tensors[f"transformer.h.{i}.attn.kv.weight"] + tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = weight[ : n_head_kv * head_dim ] - tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = data[ + tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = weight[ n_head_kv * head_dim: ] del tensors[f"transformer.h.{i}.attn.kv.weight"] @@ -744,19 +745,19 @@ def write_tensors(self): ] del tensors[f"transformer.h.{i}.attn.q.weight"] if f"transformer.h.{i}.mlp.gate_up_proj.weight" in tensors: - data = tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim] - tensors[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:] + weight = tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] + tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = weight[:ff_dim] + tensors[f"model.layers.{i}.mlp.up_proj.weight"] = weight[ff_dim:] del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - for name, data in tensors.items(): - old_dtype = data.dtype + for name, data_torch in tensors.items(): + old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) + if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + data_torch = data_torch.to(torch.float32) - data = data.squeeze().numpy() + data = data_torch.squeeze().numpy() # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) @@ -811,12 +812,12 @@ def write_tensors(self): block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data in self.get_tensors(): + for name, data_torch in self.get_tensors(): if name.endswith(".self_attention.rotary_emb.inv_freq"): continue - old_dtype = data.dtype + old_dtype = data_torch.dtype # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) - data = data.to(torch.float32).squeeze().numpy() + data = data_torch.to(torch.float32).squeeze().numpy() new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: print("Can not map tensor '" + name + "'") diff --git a/mypy.ini b/mypy.ini index 55c168f2d7d12..7215a05dd2516 100644 --- a/mypy.ini +++ b/mypy.ini @@ -3,3 +3,4 @@ strict = true allow_untyped_calls = true allow_untyped_defs = true allow_incomplete_defs = true +disable_error_code = import-untyped From 66ccd621026a3311d58415a83f8eac57243832aa Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Wed, 1 Nov 2023 23:26:28 -0400 Subject: [PATCH 23/42] sort imports --- convert-generic.py | 4 ++-- model.py | 10 +++++----- util.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/convert-generic.py b/convert-generic.py index 141d1fe72741e..addc8fdccf9f1 100755 --- a/convert-generic.py +++ b/convert-generic.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 from __future__ import annotations -from util import parse_args import sys -import model +import model +from util import parse_args args = parse_args() diff --git a/model.py b/model.py index 94f090b2d15e1..9ad5e46fbf92b 100644 --- a/model.py +++ b/model.py @@ -1,15 +1,15 @@ +import contextlib +import json import os import re import sys -import json -import torch -import contextlib -import numpy as np - from enum import IntEnum from pathlib import Path from typing import TYPE_CHECKING, Any, ContextManager, Iterator, TypeAlias, cast +import numpy as np +import torch + if TYPE_CHECKING: from torch import Tensor diff --git a/util.py b/util.py index d266b54b1b15e..3388638dedb99 100644 --- a/util.py +++ b/util.py @@ -1,7 +1,7 @@ import argparse - from pathlib import Path + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file") parser.add_argument( From e9abcc9c7c3a3f9129482d1a0f874c9bd952d16d Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Thu, 2 Nov 2023 00:06:32 -0400 Subject: [PATCH 24/42] fix linter complaints --- convert-generic.py | 8 +- model.py | 427 ++++++++++++++++++++++----------------------- 2 files changed, 211 insertions(+), 224 deletions(-) diff --git a/convert-generic.py b/convert-generic.py index addc8fdccf9f1..2336559c673f2 100755 --- a/convert-generic.py +++ b/convert-generic.py @@ -41,12 +41,12 @@ print("Set model tokenizer") model_instance.set_vocab() -if not args.vocab_only: - print(f"Exporting model to '{fname_out}'") - model_instance.write() -else: +if args.vocab_only: print(f"Exporting model vocab to '{fname_out}'") model_instance.write_vocab() +else: + print(f"Exporting model to '{fname_out}'") + model_instance.write() print(f"Model successfully exported to '{fname_out}'") print("") diff --git a/model.py b/model.py index 9ad5e46fbf92b..ef72f9c092520 100644 --- a/model.py +++ b/model.py @@ -5,7 +5,7 @@ import sys from enum import IntEnum from pathlib import Path -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, TypeAlias, cast +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast import numpy as np import torch @@ -39,6 +39,128 @@ def __init__(self, dir_model: Path, ftype: int, fname_out: Path): self.model_arch = self._get_model_architecture() self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch]) + def set_vocab(self): + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for part_name in self.part_names: + print(f"gguf: loading model part '{part_name}'") + ctx: ContextManager[Any] + if self.is_safetensors: + from safetensors import safe_open + ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) + else: + ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu")) + + with ctx as model_part: + for name in model_part.keys(): + data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] + yield name, data + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams.get( + "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + )) + if (n_ctx := self.hparams.get("max_position_embeddings")) is not None: + self.gguf_writer.add_context_length(n_ctx) + if (n_embd := self.hparams.get("hidden_size")) is not None: + self.gguf_writer.add_embedding_length(n_embd) + if (n_ff := self.hparams.get("intermediate_size")) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + if (n_head := self.hparams.get("num_attention_head")) is not None: + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + def write(self): + self.write_tensors() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file() + self.gguf_writer.close() + + def write_vocab(self): + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + + @staticmethod + def count_model_parts(dir_model: Path, prefix: str) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.endswith(prefix): + num_parts += 1 + + return num_parts + + @staticmethod + def load_hparams(dir_model): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + return json.load(f) + + @staticmethod + def from_model_architecture(model_architecture): + if model_architecture == "StableLMEpochForCausalLM": + return StableLMModel + if model_architecture == "GPTNeoXForCausalLM": + return GPTNeoXModel + if model_architecture == "BloomForCausalLM": + return BloomModel + if model_architecture == "MPTForCausalLM": + return MPTModel + if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): + return BaichuanModel + if model_architecture == "FalconForCausalLM": + return FalconModel + if model_architecture == "GPTBigCodeForCausalLM": + return StarCoderModel + if model_architecture == "GPTRefactForCausalLM": + return RefactModel + if model_architecture == "PersimmonForCausalLM": + return PersimmonModel + return Model + def _is_model_safetensors(self) -> bool: return Model.count_model_parts(self.dir_model, ".safetensors") > 0 @@ -47,10 +169,10 @@ def _get_part_names(self): if self.num_parts == 1: # there's only one .safetensors file return ("model.safetensors",) return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) - else: - if self.num_parts == 1: # there's only one .bin file - return ("pytorch_model.bin",) - return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) + + if self.num_parts == 1: # there's only one .bin file + return ("pytorch_model.bin",) + return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) def _get_model_architecture(self) -> gguf.MODEL_ARCH: arch = self.hparams["architectures"][0] @@ -84,7 +206,7 @@ def _set_vocab_gpt2(self): vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() for i in range(vocab_size): @@ -162,135 +284,13 @@ def _set_vocab_sentencepiece(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def set_vocab(self): - self._set_vocab_gpt2() - - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for part_name in self.part_names: - print("gguf: loading model part '" + part_name + "'") - ctx: ContextManager[Any] - if self.is_safetensors: - from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) - else: - ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu")) - - with ctx as model_part: - for name in model_part.keys(): - data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] - yield name, data - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams.get( - "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))) - if "max_position_embeddings" in self.hparams: - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - if "hidden_size" in self.hparams: - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - if "intermediate_size" in self.hparams: - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - if "num_attention_head" in self.hparams: - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual( - self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - self.gguf_writer.add_tensor(new_name, data) - - def write(self): - self.write_tensors() - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file() - self.gguf_writer.close() - - def write_vocab(self): - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - - @staticmethod - def count_model_parts(dir_model: Path, prefix: str) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.endswith(prefix): - num_parts += 1 - - return num_parts - - @staticmethod - def load_hparams(dir_model): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - return hparams - - @staticmethod - def from_model_architecture(model_architecture): - if model_architecture == "StableLMEpochForCausalLM": - return StableLMModel - if model_architecture == "GPTNeoXForCausalLM": - return GPTNeoXModel - if model_architecture == "BloomForCausalLM": - return BloomModel - if model_architecture == "MPTForCausalLM": - return MPTModel - if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return BaichuanModel - if model_architecture == "FalconForCausalLM": - return FalconModel - if model_architecture == "GPTBigCodeForCausalLM": - return StarCoderModel - if model_architecture == "GPTRefactForCausalLM": - return RefactModel - if model_architecture == "PersimmonForCausalLM": - return PersimmonModel - return Model - class StableLMModel(Model): def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rope_pct"]*(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]))) + int(self.hparams["rope_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) self.gguf_writer.add_layer_norm_eps(1e-5) @@ -304,10 +304,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"]*(self.hparams["hidden_size"]//self.hparams["num_attention_heads"]))) + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual( - self.hparams["use_parallel_residual"] if "use_parallel_residual" in self.hparams else True) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) @@ -342,7 +342,7 @@ def write_tensors(self): old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) data = data_torch.squeeze().numpy() @@ -353,26 +353,30 @@ def write_tensors(self): # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) data = np.concatenate( - (qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed))), - axis=0 + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + axis=0, ) print("re-format attention.linear_qkv.weight") elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) data = np.concatenate( - (qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,))), - axis=0 + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + axis=0, ) print("re-format attention.linear_qkv.bias") # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print("Can not map tensor '" + name + "'") + print(f"Can not map tensor {name!r}") sys.exit() n_dims = len(data.shape) @@ -390,13 +394,13 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) if not has_lm_head and name == "word_embeddings.weight": self.gguf_writer.add_tensor("output.weight", data) - print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa + print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") class MPTModel(Model): @@ -410,7 +414,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(self.hparams["n_heads"]) if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): self.gguf_writer.add_head_count_kv(kv_n_heads) - self.gguf_writer.add_layer_norm_eps(1e-05) + self.gguf_writer.add_layer_norm_eps(1e-5) if self.hparams["attn_config"]["clip_qkv"] is not None: self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) @@ -420,13 +424,13 @@ def write_tensors(self): tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): continue old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) data = data_torch.squeeze().numpy() @@ -434,7 +438,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print("Can not map tensor '" + name + "'") + print(f"Can not map tensor {name!r}") sys.exit() n_dims = len(data.shape) @@ -452,7 +456,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -469,16 +473,8 @@ def set_vocab(self): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] - - if "num_key_value_heads" in self.hparams: - head_count_kv = self.hparams["num_key_value_heads"] - else: - head_count_kv = head_count - - if "_name_or_path" in self.hparams: - hf_repo = self.hparams["_name_or_path"] - else: - hf_repo = "" + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") ctx_length = 0 if "max_sequence_length" in self.hparams: @@ -503,26 +499,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - if "rope_scaling" in self.hparams and self.hparams["rope_scaling"] is not None and "factor" in self.hparams["rope_scaling"]: - if "type" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"]["type"] == "linear": - self.gguf_writer.add_rope_scale_linear(self.hparams["rope_scaling"]["factor"]) - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - def _reverse_hf_permute_part(self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None) -> Tensor: - r = weights.shape[0] // 3 - return (self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)) - - def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: - r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scale_linear(self.hparams["rope_scaling"]["factor"]) def write_tensors(self): # Collect tensors from generator object @@ -530,21 +509,17 @@ def write_tensors(self): block_count = self.hparams["num_hidden_layers"] head_count = self.hparams["num_attention_heads"] tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - if "num_key_value_heads" in self.hparams: - head_count_kv = self.hparams["num_key_value_heads"] - else: - head_count_kv = head_count + head_count_kv = self.hparams.get("num_key_value_heads", head_count) for i in range(block_count): - if f"model.layers.{i}.self_attn.W_pack.weight" in model_kv: + if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: print(f"Unpacking and permuting layer {i}") - model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = self._reverse_hf_permute_part( - model_kv[f"model.layers.{i}.self_attn.W_pack.weight"], 0, head_count, head_count) - model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = self._reverse_hf_permute_part( - model_kv[f"model.layers.{i}.self_attn.W_pack.weight"], 1, head_count, head_count_kv) - model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = self._reverse_hf_part( - model_kv[f"model.layers.{i}.self_attn.W_pack.weight"], 2) + model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ + self._reverse_hf_permute_part(w, 0, head_count, head_count) + model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ + self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) + model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \ + self._reverse_hf_part(w, 2) del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] for name, data_torch in model_kv.items(): @@ -555,7 +530,7 @@ def write_tensors(self): old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) data = data_torch.squeeze().numpy() @@ -563,7 +538,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print("Can not map tensor '" + name + "'") + print(f"Can not map tensor {name!r}") sys.exit() n_dims = len(data.shape) @@ -581,9 +556,29 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def _reverse_hf_permute_part( + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + ) -> Tensor: + r = weights.shape[0] // 3 + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + + def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: + r = weights.shape[0] // 3 + return weights[r * n_part:r * n_part + r, ...] + class FalconModel(Model): def set_gguf_parameters(self): @@ -630,7 +625,7 @@ def write_tensors(self): old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) # QKV tensor transform @@ -655,7 +650,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print("Can not map tensor '" + name + "'") + print(f"Can not map tensor {name!r}") sys.exit() n_dims = len(data.shape) @@ -673,7 +668,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -704,7 +699,7 @@ def set_gguf_parameters(self): block_count = self.hparams["n_layer"] self.gguf_writer.add_name("Refact") -# refact uses Alibi. So this is from config.json which might be used by training. + # refact uses Alibi. So this is from config.json which might be used by training. self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -730,31 +725,23 @@ def write_tensors(self): tensors = dict(self.get_tensors()) for i in range(block_count): - if f"transformer.h.{i}.attn.kv.weight" in tensors: - weight = tensors[f"transformer.h.{i}.attn.kv.weight"] - tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = weight[ - : n_head_kv * head_dim - ] - tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = weight[ - n_head_kv * head_dim: - ] + if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None: + tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim] + tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:] del tensors[f"transformer.h.{i}.attn.kv.weight"] - if f"transformer.h.{i}.attn.q.weight" in tensors: - tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = tensors[ - f"transformer.h.{i}.attn.q.weight" - ] + if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None: + tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w del tensors[f"transformer.h.{i}.attn.q.weight"] - if f"transformer.h.{i}.mlp.gate_up_proj.weight" in tensors: - weight = tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = weight[:ff_dim] - tensors[f"model.layers.{i}.mlp.up_proj.weight"] = weight[ff_dim:] + if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None: + tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim] + tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:] del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] for name, data_torch in tensors.items(): old_dtype = data_torch.dtype # convert any unsupported data types to float32 - if data_torch.dtype != torch.float16 and data_torch.dtype != torch.float32: + if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) data = data_torch.squeeze().numpy() @@ -762,7 +749,7 @@ def write_tensors(self): # map tensor names new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) if new_name is None: - print("Can not map tensor '" + name + "'") + print(f"Can not map tensor {name!r}") sys.exit() n_dims = len(data.shape) @@ -780,7 +767,7 @@ def write_tensors(self): if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) @@ -820,8 +807,8 @@ def write_tensors(self): data = data_torch.to(torch.float32).squeeze().numpy() new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) if new_name is None: - print("Can not map tensor '" + name + "'") + print(f"Can not map tensor {name!r}") sys.exit() n_dims = len(data.shape) - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") self.gguf_writer.add_tensor(new_name, data) From 007be85087f9a3942c2af46f54e0bf5419b0a45d Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Thu, 2 Nov 2023 12:08:44 -0400 Subject: [PATCH 25/42] model.py : add missing future import --- model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/model.py b/model.py index ef72f9c092520..2ebd6881e9c09 100644 --- a/model.py +++ b/model.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import contextlib import json import os From 03c9683eb7008a1b14dd04ebe8a97640c3cf674d Mon Sep 17 00:00:00 2001 From: Galunid Date: Sat, 4 Nov 2023 20:43:29 +0100 Subject: [PATCH 26/42] Restore support for RWForCausalLM --- model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model.py b/model.py index 2ebd6881e9c09..c8ae1d51a5910 100644 --- a/model.py +++ b/model.py @@ -153,7 +153,7 @@ def from_model_architecture(model_architecture): return MPTModel if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): return BaichuanModel - if model_architecture == "FalconForCausalLM": + if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): return FalconModel if model_architecture == "GPTBigCodeForCausalLM": return StarCoderModel From fd30850576bddc3b6e49ef66fd13ae5318969bc1 Mon Sep 17 00:00:00 2001 From: Galunid Date: Sat, 4 Nov 2023 23:01:38 +0100 Subject: [PATCH 27/42] Add big endian support --- convert-generic.py | 6 +++++- model.py | 6 ++++-- util.py | 2 ++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/convert-generic.py b/convert-generic.py index 2336559c673f2..a1829a85609c5 100755 --- a/convert-generic.py +++ b/convert-generic.py @@ -28,12 +28,16 @@ # output in the same directory as the model by default fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' +is_big_endian = False +if args.bigendian is not None: + is_big_endian = True + print(f"Loading model: {dir_model.name}") hparams = model.Model.load_hparams(dir_model) model_class = model.Model.from_model_architecture(hparams["architectures"][0]) -model_instance = model_class(dir_model, ftype, fname_out) +model_instance = model_class(dir_model, ftype, fname_out, is_big_endian) print("Set model parameters") model_instance.set_gguf_parameters() diff --git a/model.py b/model.py index c8ae1d51a5910..d5b72882f6cf8 100644 --- a/model.py +++ b/model.py @@ -30,16 +30,18 @@ class SentencePieceTokenTypes(IntEnum): class Model: - def __init__(self, dir_model: Path, ftype: int, fname_out: Path): + def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out + self.is_big_endian = is_big_endian + self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.is_safetensors = self._is_model_safetensors() self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) self.model_arch = self._get_model_architecture() - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch]) + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess) def set_vocab(self): self._set_vocab_gpt2() diff --git a/util.py b/util.py index 3388638dedb99..d44ae183fdff1 100644 --- a/util.py +++ b/util.py @@ -20,4 +20,6 @@ def parse_args() -> argparse.Namespace: "ftype", type=int, choices=[0, 1], default=1, nargs='?', help="output format - use 0 for float32, 1 for float16", ) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + return parser.parse_args() From e64f4de189aaa29b3123b751a190179bd4ae9396 Mon Sep 17 00:00:00 2001 From: Galunid Date: Sat, 4 Nov 2023 23:10:39 +0100 Subject: [PATCH 28/42] Revert "Remove 'old' conversion scripts" - needed for testing This reverts commit f4b9a7ea02a6a1f270a06fc4670e120b010966c4. --- convert-baichuan-hf-to-gguf.py | 316 ++++++++++++++++++++++++++++++++ convert-bloom-hf-to-gguf.py | 247 +++++++++++++++++++++++++ convert-falcon-hf-to-gguf.py | 253 +++++++++++++++++++++++++ convert-gptneox-hf-to-gguf.py | 221 ++++++++++++++++++++++ convert-mpt-hf-to-gguf.py | 227 +++++++++++++++++++++++ convert-refact-hf-to-gguf.py | 272 +++++++++++++++++++++++++++ convert-starcoder-hf-to-gguf.py | 210 +++++++++++++++++++++ 7 files changed, 1746 insertions(+) create mode 100755 convert-baichuan-hf-to-gguf.py create mode 100755 convert-bloom-hf-to-gguf.py create mode 100755 convert-falcon-hf-to-gguf.py create mode 100755 convert-gptneox-hf-to-gguf.py create mode 100755 convert-mpt-hf-to-gguf.py create mode 100755 convert-refact-hf-to-gguf.py create mode 100755 convert-starcoder-hf-to-gguf.py diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py new file mode 100755 index 0000000000000..5ee99be73134e --- /dev/null +++ b/convert-baichuan-hf-to-gguf.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +# HF baichuan --> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import struct +import sys +from pathlib import Path +from typing import TYPE_CHECKING, Any +import itertools +import numpy as np +import torch +from sentencepiece import SentencePieceProcessor # type: ignore[import] + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +if TYPE_CHECKING: + from typing import TypeAlias + +NDArray: TypeAlias = 'np.ndarray[Any, Any]' + +# reverse HF permute back to original pth layout + + +def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + +def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray: + r = weights.shape[0] // 3 + return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) + +def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray: + r = weights.shape[0] // 3 + return weights[r * n_part : r * n_part + r, ...] + +def count_model_parts(dir_model: str) -> int: + num_parts = 0 + + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + + return num_parts + + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) + parser.add_argument( + "ftype", type=int, choices=[0, 1], default=1, nargs='?', + help="output format - use 0 for float32, 1 for float16", + ) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +endianess = gguf.GGUFEndian.LITTLE +if args.bigendian: + endianess = gguf.GGUFEndian.BIG +endianess_str = "Big Endian" if args.bigendian else "Little Endian" +print(f"gguf: Conversion Endianess {endianess}") +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) +print("hello print: ",hparams["architectures"][0]) +if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit() + +# get number of model parts +num_parts = count_model_parts(dir_model) +print(f"num_parts:{num_parts}\n") +ARCH=gguf.MODEL_ARCH.BAICHUAN +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) + +print("gguf: get model metadata") + +block_count = hparams["num_hidden_layers"] +head_count = hparams["num_attention_heads"] + +if "num_key_value_heads" in hparams: + head_count_kv = hparams["num_key_value_heads"] +else: + head_count_kv = head_count + +if "_name_or_path" in hparams: + hf_repo = hparams["_name_or_path"] +else: + hf_repo = "" + +if "max_sequence_length" in hparams: + ctx_length = hparams["max_sequence_length"] +elif "max_position_embeddings" in hparams: + ctx_length = hparams["max_position_embeddings"] +elif "model_max_length" in hparams: + ctx_length = hparams["model_max_length"] +else: + print("gguf: can not find ctx length parameter.") + + sys.exit() + + +gguf_writer.add_name(dir_model.name) +gguf_writer.add_source_hf_repo(hf_repo) +gguf_writer.add_tensor_data_layout("Meta AI original pth") +gguf_writer.add_context_length(ctx_length) +gguf_writer.add_embedding_length(hparams["hidden_size"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) +gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) +gguf_writer.add_head_count(head_count) +gguf_writer.add_head_count_kv(head_count_kv) +gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + +if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: + if "type" in hparams["rope_scaling"]: + if hparams["rope_scaling"]["type"] == "linear": + gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) + + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytes] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +tokenizer_model_file = dir_model / 'tokenizer.model' +if not tokenizer_model_file.is_file(): + print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) + sys.exit(1) + +# vocab type sentencepiece +print("gguf: get sentencepiece tokenizer vocab, scores and token types") + +tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) +vocab_size = hparams.get('vocab_size') +if vocab_size is None: + vocab_size = tokenizer.vocab_size() + +for i in range(vocab_size): + text: bytes + score: float + + piece = tokenizer.id_to_piece(i) + text = piece.encode("utf-8") + score = tokenizer.get_score(i) + + toktype = 1 # defualt to normal token type + if tokenizer.is_unknown(i): + toktype = 2 + if tokenizer.is_control(i): + toktype = 3 + + # toktype = 4 is user-defined = tokens from added_tokens.json + + if tokenizer.is_unused(i): + toktype = 5 + if tokenizer.is_byte(i): + toktype = 6 + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + +added_tokens_file = dir_model / 'added_tokens.json' +if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + addtokens_json = json.load(f) + + print("gguf: get added tokens") + + for key in addtokens_json: + tokens.append( key.encode("utf-8") ) + scores.append(-1000.0) + toktypes.append(4) # user-defined token type + + +gguf_writer.add_tokenizer_model("llama") +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_scores(scores) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens)) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH,block_count) + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") + + tmp=model_part + for i in range(block_count): + if f"model.layers.{i}.self_attn.W_pack.weight" in model_part: + print(f"Unpacking and permuting layer {i}") + tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count) + tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv) + tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2) + del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] + + for name in model_part.keys(): + data = model_part[name] + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + gguf_writer.add_tensor(new_name, data) + + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/convert-bloom-hf-to-gguf.py b/convert-bloom-hf-to-gguf.py new file mode 100755 index 0000000000000..6e866d9434818 --- /dev/null +++ b/convert-bloom-hf-to-gguf.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +# HF bloom --> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import re +import struct +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +# Supported Models: +# https://huggingface.co/bigscience/bloom-1b7 +# https://huggingface.co/bigscience/bloom-3b +# https://huggingface.co/bigscience/bloom-7b1 +# https://huggingface.co/Langboat/bloom-1b4-zh +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "BloomForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + sys.exit(1) + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.BLOOM +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["n_layer"] + +gguf_writer.add_name("Bloom") +n_embed = hparams.get("hidden_size", hparams.get("n_embed")) +n_head = hparams.get("n_head", hparams.get("num_attention_heads")) +gguf_writer.add_context_length(hparams.get("seq_length", n_embed)) +gguf_writer.add_embedding_length(n_embed) +gguf_writer.add_feed_forward_length(4 * n_embed) +gguf_writer.add_block_count(block_count) +gguf_writer.add_head_count(n_head) +gguf_writer.add_head_count_kv(n_head) +gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) +gguf_writer.add_file_type(ftype) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +print("gguf: get gpt2 tokenizer vocab") + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +# The number of tokens in tokenizer.json can differ from the expected vocab size. +# This causes downstream issues with mismatched tensor sizes when running the inference +vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) +assert max(tokenizer.vocab.values()) < vocab_size + +added_vocab = tokenizer.get_added_vocab() +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + +for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens)) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH, block_count) + +# params for qkv transform +n_head_kv = hparams.get("n_head_kv", n_head) +head_dim = n_embed // n_head + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(dir_model / part_name, map_location="cpu") + + has_lm_head = True + if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys(): + has_lm_head = False + + for original_name in model_part.keys(): + data = model_part[original_name] + name = re.sub(r'transformer\.', '', original_name) + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) + data = np.concatenate( + (qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed))), + axis=0 + ) + print("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) + data = np.concatenate( + (qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,))), + axis=0 + ) + print("re-format attention.linear_qkv.bias") + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + if not has_lm_head and name == "word_embeddings.weight": + gguf_writer.add_tensor("output.weight", data) + print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa + + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py new file mode 100755 index 0000000000000..8e8f3c3f8f1e0 --- /dev/null +++ b/convert-falcon-hf-to-gguf.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +# HF falcon--> gguf conversion + +from __future__ import annotations + +import argparse +import contextlib +import json +import os +import struct +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +def count_model_parts(dir_model: Path, prefix: str) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith(prefix): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) + parser.add_argument( + "ftype", type=int, choices=[0, 1], default=1, nargs='?', + help="output format - use 0 for float32, 1 for float16", + ) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"): + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit(1) + +# get number of model parts +num_parts = count_model_parts(dir_model, "model-00") +if num_parts: + is_safetensors = True + from safetensors import safe_open +else: + is_safetensors = False + num_parts = count_model_parts(dir_model, "pytorch_model-") + +ARCH=gguf.MODEL_ARCH.FALCON +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams.get("num_hidden_layers") +if block_count is None: + block_count = hparams["n_layer"] # old name + +n_head = hparams.get("num_attention_heads") +if n_head is None: + n_head = hparams["n_head"] # old name + +n_head_kv = hparams.get("num_kv_heads") +if n_head_kv is None: + n_head_kv = hparams.get("n_head_kv", 1) # old name + +gguf_writer.add_name("Falcon") +gguf_writer.add_context_length(2048) # not in config.json +gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform +gguf_writer.add_embedding_length(hparams["hidden_size"]) +gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_head_count(n_head) +gguf_writer.add_head_count_kv(n_head_kv) +gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) +gguf_writer.add_file_type(ftype) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +print("gguf: get gpt2 tokenizer vocab") + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +# The number of tokens in tokenizer.json can differ from the expected vocab size. +# This causes downstream issues with mismatched tensor sizes when running the inference +vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) +assert max(tokenizer.vocab.values()) < vocab_size + +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + +for i in range(vocab_size): + tokens.append(reverse_vocab[i]) + scores.append(0.0) # dummy + toktypes.append(gguf.TokenType.NORMAL) + +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_scores(scores) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH,block_count) + +head_dim = hparams["hidden_size"] // n_head + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +elif is_safetensors: + part_names = ( + f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1) + ) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + if is_safetensors: + ctx = safe_open(dir_model / part_name, framework="pt", device="cpu") + else: + ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu")) + + with ctx as model_part: + for name in model_part.keys(): + data = model_part.get_tensor(name) if is_safetensors else model_part[name] + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + + if "query_key_value" in name: + qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data = torch.cat((q,k,v)).reshape_as(data) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py new file mode 100755 index 0000000000000..02d1fdf164eea --- /dev/null +++ b/convert-gptneox-hf-to-gguf.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python3 +# HF gptneox--> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import struct +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) + parser.add_argument( + "ftype", type=int, choices=[0, 1], default=1, nargs='?', + help="output format - use 0 for float32, 1 for float16", + ) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "GPTNeoXForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit() + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.GPTNEOX +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["num_hidden_layers"] + +gguf_writer.add_name(dir_model.name) +gguf_writer.add_context_length(hparams["max_position_embeddings"]) +gguf_writer.add_embedding_length(hparams["hidden_size"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) +gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))) +gguf_writer.add_head_count(hparams["num_attention_heads"]) +gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) +gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"]) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +print("gguf: get gpt2 tokenizer vocab") + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +# The number of tokens in tokenizer.json can differ from the expected vocab size. +# This causes downstream issues with mismatched tensor sizes when running the inference +vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) +assert max(tokenizer.vocab.values()) < vocab_size + +added_vocab = tokenizer.get_added_vocab() +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + +for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH,block_count) + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") + + for name in model_part.keys(): + data = model_part[name] + + # we don't need these + if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): + continue + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py new file mode 100755 index 0000000000000..70d154b3f5c01 --- /dev/null +++ b/convert-mpt-hf-to-gguf.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 +# HF mpt--> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import struct +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) + parser.add_argument( + "ftype", type=int, choices=[0, 1], default=1, nargs='?', + help="output format - use 0 for float32, 1 for float16", + ) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "MPTForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit() + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.MPT +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["n_layers"] + +gguf_writer.add_name(dir_model.name) +gguf_writer.add_context_length(hparams["max_seq_len"]) +gguf_writer.add_embedding_length(hparams["d_model"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_feed_forward_length(4 * hparams["d_model"]) +gguf_writer.add_head_count(hparams["n_heads"]) +if kv_n_heads := hparams["attn_config"].get("kv_n_heads"): + gguf_writer.add_head_count_kv(kv_n_heads) +gguf_writer.add_layer_norm_eps(1e-05) +if hparams["attn_config"]["clip_qkv"] is not None: + gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"]) +gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"]) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +print("gguf: get gpt2 tokenizer vocab") + +# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but +# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to +# accomodate some "reserved" tokens; this is causing problems down the line in +# llama.cpp, so we pad the vocab with dummy tokens: + +vocab_size = hparams["vocab_size"] + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +added_vocab = tokenizer.get_added_vocab() +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + +for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH,block_count) + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") + + for name in model_part.keys(): + data = model_part[name] + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Cannot map tensor '" + name + "'") + continue # for the sake of compatibility with some old published models, don't quit + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + # note: MPT output is tied to (same as) wte in original model; + # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ + if new_name == "token_embd.weight": + gguf_writer.add_tensor("output.weight", data) + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/convert-refact-hf-to-gguf.py b/convert-refact-hf-to-gguf.py new file mode 100755 index 0000000000000..f0cfe84d81c8b --- /dev/null +++ b/convert-refact-hf-to-gguf.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +# HF refact--> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import sys +from pathlib import Path + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if "NO_LOCAL_GGUF" not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf")) +import gguf + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a Refact model to a GGML compatible file" + ) + parser.add_argument( + "--vocab-only", + action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", + type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "model", + type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) + parser.add_argument( + "ftype", + type=int, + choices=[0, 1], + default=1, + nargs="?", + help="output format - use 0 for float32, 1 for float16", + ) + return parser.parse_args() + + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f"Error: {args.model} is not a directory", file=sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf" + +print("gguf: loading model " + dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "GPTRefactForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit(1) + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH = gguf.MODEL_ARCH.REFACT +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +# Get refact feed forward dimension +hidden_dim = hparams["n_embd"] +inner_dim = 4 * hidden_dim +hidden_dim = int(2 * inner_dim / 3) +multiple_of = 256 +ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + +block_count = hparams["n_layer"] + +gguf_writer.add_name("Refact") +# refact uses Alibi. So this is from config.json which might be used by training. +gguf_writer.add_context_length(hparams["n_positions"]) +gguf_writer.add_embedding_length(hparams["n_embd"]) + +gguf_writer.add_feed_forward_length(ff_dim) +gguf_writer.add_block_count(block_count) +gguf_writer.add_head_count(hparams["n_head"]) +gguf_writer.add_head_count_kv(1) +gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"]) +gguf_writer.add_file_type(ftype) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +print("gguf: get gpt2 tokenizer vocab") + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +# The number of tokens in tokenizer.json can differ from the expected vocab size. +# This causes downstream issues with mismatched tensor sizes when running the inference +vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) +assert max(tokenizer.vocab.values()) < vocab_size + +added_vocab = tokenizer.get_added_vocab() +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + +for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens)) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH, block_count) + +# params for qkv transform +n_head = hparams["n_head"] +n_head_kv = 1 + +head_dim = hparams["n_embd"] // n_head + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(dir_model / part_name, map_location="cpu") + + for i in range(block_count): + if f"transformer.h.{i}.attn.kv.weight" in model_part: + data = model_part[f"transformer.h.{i}.attn.kv.weight"] + model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[ + : n_head_kv * head_dim + ] + model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[ + n_head_kv * head_dim : + ] + del model_part[f"transformer.h.{i}.attn.kv.weight"] + if f"transformer.h.{i}.attn.q.weight" in model_part: + model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[ + f"transformer.h.{i}.attn.q.weight" + ] + del model_part[f"transformer.h.{i}.attn.q.weight"] + if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part: + data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"] + model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim] + model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:] + del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"] + + for name in model_part.keys(): + data = model_part[name] + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ( + ftype == 1 + and data_dtype == np.float32 + and name.endswith(".weight") + and n_dims == 2 + ): + data = data.astype(np.float16) + + print( + new_name + + ", n_dims = " + + str(n_dims) + + ", " + + str(old_dtype) + + " --> " + + str(data.dtype) + ) + + gguf_writer.add_tensor(new_name, data) + + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py new file mode 100755 index 0000000000000..a9bfed85e31ba --- /dev/null +++ b/convert-starcoder-hf-to-gguf.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +# HF starcoder --> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import struct +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "GPTBigCodeForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit(1) + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.STARCODER +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["n_layer"] + +gguf_writer.add_name("StarCoder") +gguf_writer.add_context_length(hparams["n_positions"]) +gguf_writer.add_embedding_length(hparams["n_embd"]) +gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_head_count(hparams["n_head"]) +gguf_writer.add_head_count_kv(1) +gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) +gguf_writer.add_file_type(ftype) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +print("gguf: get gpt2 tokenizer vocab") + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +# The number of tokens in tokenizer.json can differ from the expected vocab size. +# This causes downstream issues with mismatched tensor sizes when running the inference +vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) +assert max(tokenizer.vocab.values()) < vocab_size + +added_vocab = tokenizer.get_added_vocab() +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + +for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_types(toktypes) +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH,block_count) + +# params for qkv transform +n_head = hparams["n_head"] +n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1 + +head_dim = hparams["n_embd"] // n_head + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(dir_model / part_name, map_location="cpu") + + for name in model_part.keys(): + data = model_part[name] + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") From 2120195bb12f81aa0e425aaa6123072d7a3c01e2 Mon Sep 17 00:00:00 2001 From: Galunid Date: Sat, 4 Nov 2023 23:15:41 +0100 Subject: [PATCH 29/42] Yarn rope for baichuan --- model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/model.py b/model.py index d5b72882f6cf8..65641b4d028e1 100644 --- a/model.py +++ b/model.py @@ -505,7 +505,8 @@ def set_gguf_parameters(self): if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scale_linear(self.hparams["rope_scaling"]["factor"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) def write_tensors(self): # Collect tensors from generator object From 087f88cc157717ccf762e3b1e1f584ecee3350f3 Mon Sep 17 00:00:00 2001 From: Galunid Date: Sun, 5 Nov 2023 00:37:00 +0100 Subject: [PATCH 30/42] Rename convert-generic -> convert-hf-to-gguf --- convert-generic.py => convert-hf-to-gguf.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename convert-generic.py => convert-hf-to-gguf.py (100%) diff --git a/convert-generic.py b/convert-hf-to-gguf.py similarity index 100% rename from convert-generic.py rename to convert-hf-to-gguf.py From f7de892ee58fc67c004e6818a203970424bac752 Mon Sep 17 00:00:00 2001 From: Galunid Date: Sun, 5 Nov 2023 00:43:56 +0100 Subject: [PATCH 31/42] Move util to gguf-py/gguf --- convert-hf-to-gguf.py | 7 ++++++- util.py => gguf-py/gguf/util.py | 0 2 files changed, 6 insertions(+), 1 deletion(-) rename util.py => gguf-py/gguf/util.py (100%) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a1829a85609c5..869eec72f60d9 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1,10 +1,15 @@ #!/usr/bin/env python3 from __future__ import annotations +from pathlib import Path +import os import sys - import model + + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) from util import parse_args args = parse_args() diff --git a/util.py b/gguf-py/gguf/util.py similarity index 100% rename from util.py rename to gguf-py/gguf/util.py From 781bc54986e4e124ed6ba60f0c656085ee52529c Mon Sep 17 00:00:00 2001 From: Galunid Date: Sun, 5 Nov 2023 08:42:11 +0100 Subject: [PATCH 32/42] Move everything to convert-hf-to-gguf.py --- convert-hf-to-gguf.py | 846 +++++++++++++++++++++++++++++++++++++++++- gguf-py/gguf/util.py | 25 -- model.py | 819 ---------------------------------------- 3 files changed, 842 insertions(+), 848 deletions(-) delete mode 100644 gguf-py/gguf/util.py delete mode 100644 model.py diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 869eec72f60d9..a30725be4d7b0 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2,15 +2,852 @@ from __future__ import annotations from pathlib import Path +from enum import IntEnum +from pathlib import Path +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast + import os import sys -import model +import contextlib +import re +import json +import argparse +import numpy as np +import torch +if TYPE_CHECKING: + from torch import Tensor if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -from util import parse_args +import gguf + + +###### MODEL DEFINITIONS ###### + +class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + +class Model: + def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): + self.dir_model = dir_model + self.ftype = ftype + self.fname_out = fname_out + self.is_big_endian = is_big_endian + self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.is_safetensors = self._is_model_safetensors() + self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") + self.part_names = self._get_part_names() + self.hparams = Model.load_hparams(self.dir_model) + self.model_arch = self._get_model_architecture() + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess) + + def set_vocab(self): + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for part_name in self.part_names: + print(f"gguf: loading model part '{part_name}'") + ctx: ContextManager[Any] + if self.is_safetensors: + from safetensors import safe_open + ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) + else: + ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu")) + + with ctx as model_part: + for name in model_part.keys(): + data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] + yield name, data + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams.get( + "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), + )) + if (n_ctx := self.hparams.get("max_position_embeddings")) is not None: + self.gguf_writer.add_context_length(n_ctx) + if (n_embd := self.hparams.get("hidden_size")) is not None: + self.gguf_writer.add_embedding_length(n_embd) + if (n_ff := self.hparams.get("intermediate_size")) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + if (n_head := self.hparams.get("num_attention_head")) is not None: + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + def write(self): + self.write_tensors() + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file() + self.gguf_writer.close() + + def write_vocab(self): + self.gguf_writer.write_header_to_file() + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + + @staticmethod + def count_model_parts(dir_model: Path, prefix: str) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.endswith(prefix): + num_parts += 1 + + return num_parts + + @staticmethod + def load_hparams(dir_model): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + return json.load(f) + + @staticmethod + def from_model_architecture(model_architecture): + if model_architecture == "StableLMEpochForCausalLM": + return StableLMModel + if model_architecture == "GPTNeoXForCausalLM": + return GPTNeoXModel + if model_architecture == "BloomForCausalLM": + return BloomModel + if model_architecture == "MPTForCausalLM": + return MPTModel + if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): + return BaichuanModel + if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): + return FalconModel + if model_architecture == "GPTBigCodeForCausalLM": + return StarCoderModel + if model_architecture == "GPTRefactForCausalLM": + return RefactModel + if model_architecture == "PersimmonForCausalLM": + return PersimmonModel + return Model + + def _is_model_safetensors(self) -> bool: + return Model.count_model_parts(self.dir_model, ".safetensors") > 0 + + def _get_part_names(self): + if self.is_safetensors: + if self.num_parts == 1: # there's only one .safetensors file + return ("model.safetensors",) + return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) + + if self.num_parts == 1: # there's only one .bin file + return ("pytorch_model.bin",) + return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) + + def _get_model_architecture(self) -> gguf.MODEL_ARCH: + arch = self.hparams["architectures"][0] + if arch == "GPTNeoXForCausalLM": + return gguf.MODEL_ARCH.GPTNEOX + if arch == "BloomForCausalLM": + return gguf.MODEL_ARCH.BLOOM + if arch == "MPTForCausalLM": + return gguf.MODEL_ARCH.MPT + if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): + return gguf.MODEL_ARCH.BAICHUAN + if arch == "FalconForCausalLM": + return gguf.MODEL_ARCH.FALCON + if arch == "GPTBigCodeForCausalLM": + return gguf.MODEL_ARCH.STARCODER + if arch == "GPTRefactForCausalLM": + return gguf.MODEL_ARCH.REFACT + if arch == "PersimmonForCausalLM": + return gguf.MODEL_ARCH.PERSIMMON + + raise NotImplementedError(f'Architecture "{arch}" not supported!') + + def _set_vocab_gpt2(self): + dir_model = self.dir_model + hparams = self.hparams + tokens: list[bytearray] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer # type: ignore[attr-defined] + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + pad_token = f"[PAD{i}]".encode('utf-8') + tokens.append(bytearray(pad_token)) + toktypes.append(gguf.TokenType.USER_DEFINED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def _set_vocab_sentencepiece(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + +class StableLMModel(Model): + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rope_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) + self.gguf_writer.add_layer_norm_eps(1e-5) + + +class GPTNeoXModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count( + int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), + ) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + + +class BloomModel(Model): + def set_gguf_parameters(self): + self.gguf_writer.add_name("Bloom") + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) + self.gguf_writer.add_embedding_length(n_embed) + self.gguf_writer.add_feed_forward_length(4 * n_embed) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams["n_layer"] + tensors = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + has_lm_head = True + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) + n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) + + for name, data_torch in tensors.items(): + if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): + has_lm_head = False + + name = re.sub(r'transformer\.', '', name) + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) + data = np.concatenate( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + axis=0, + ) + print("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) + data = np.concatenate( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + axis=0, + ) + print("re-format attention.linear_qkv.bias") + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + if not has_lm_head and name == "word_embeddings.weight": + self.gguf_writer.add_tensor("output.weight", data) + print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + + +class MPTModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layers"] + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): + self.gguf_writer.add_head_count_kv(kv_n_heads) + self.gguf_writer.add_layer_norm_eps(1e-5) + if self.hparams["attn_config"]["clip_qkv"] is not None: + self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + # note: MPT output is tied to (same as) wte in original model; + # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ + if new_name == "token_embd.weight": + self.gguf_writer.add_tensor("output.weight", data) + + +class BaichuanModel(Model): + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + print("gguf: can not find ctx length parameter.") + sys.exit() + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def write_tensors(self): + # Collect tensors from generator object + model_kv = dict(self.get_tensors()) + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + for i in range(block_count): + if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: + print(f"Unpacking and permuting layer {i}") + model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ + self._reverse_hf_permute_part(w, 0, head_count, head_count) + model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ + self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) + model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \ + self._reverse_hf_part(w, 2) + del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] + + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def _reverse_hf_permute_part( + self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, + ) -> Tensor: + r = weights.shape[0] // 3 + return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) + + def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: + r = weights.shape[0] // 3 + return weights[r * n_part:r * n_part + r, ...] + + +class FalconModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + self.gguf_writer.add_name("Falcon") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head_kv) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams.get("num_hidden_layers") + if block_count is None: + block_count = self.hparams["n_layer"] # old name + + n_head = self.hparams.get("num_attention_heads") + if n_head is None: + n_head = self.hparams["n_head"] # old name + + n_head_kv = self.hparams.get("num_kv_heads") + if n_head_kv is None: + n_head_kv = self.hparams.get("n_head_kv", 1) # old name + + head_dim = self.hparams["hidden_size"] // n_head + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py + + if "query_key_value" in name: + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data_torch = torch.cat((q, k, v)).reshape_as(data_torch) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class StarCoderModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("StarCoder") + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + +class RefactModel(Model): + def set_gguf_parameters(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + block_count = self.hparams["n_layer"] + + self.gguf_writer.add_name("Refact") + # refact uses Alibi. So this is from config.json which might be used by training. + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + + self.gguf_writer.add_feed_forward_length(ff_dim) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_head_count_kv(1) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + hidden_dim = self.hparams["n_embd"] + inner_dim = 4 * hidden_dim + hidden_dim = int(2 * inner_dim / 3) + multiple_of = 256 + ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + n_head = self.hparams["n_head"] + n_head_kv = 1 + head_dim = self.hparams["n_embd"] // n_head + block_count = self.hparams["n_layer"] + + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + tensors = dict(self.get_tensors()) + for i in range(block_count): + if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None: + tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim] + tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:] + del tensors[f"transformer.h.{i}.attn.kv.weight"] + if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None: + tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w + del tensors[f"transformer.h.{i}.attn.q.weight"] + if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None: + tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim] + tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:] + del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] + + for name, data_torch in tensors.items(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + +class PersimmonModel(Model): + def set_gguf_parameters(self): + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + head_count = self.hparams["num_attention_heads"] + head_count_kv = head_count + hidden_size = self.hparams["hidden_size"] + + self.gguf_writer.add_name('persimmon-8b-chat') + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + + def set_vocab(self): + self._set_vocab_sentencepiece() + # self.gguf_writer.add_bos_token_id(71013) + # self.gguf_writer.add_eos_token_id(71013) + + def write_tensors(self): + block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + for name, data_torch in self.get_tensors(): + if name.endswith(".self_attention.rotary_emb.inv_freq"): + continue + old_dtype = data_torch.dtype + # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) + data = data_torch.to(torch.float32).squeeze().numpy() + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + n_dims = len(data.shape) + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + +###### CONVERSION LOGIC ###### + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) + parser.add_argument( + "ftype", type=int, choices=[0, 1], default=1, nargs='?', + help="output format - use 0 for float32, 1 for float16", + ) + parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + + return parser.parse_args() args = parse_args() @@ -39,9 +876,9 @@ print(f"Loading model: {dir_model.name}") -hparams = model.Model.load_hparams(dir_model) +hparams = Model.load_hparams(dir_model) -model_class = model.Model.from_model_architecture(hparams["architectures"][0]) +model_class = Model.from_model_architecture(hparams["architectures"][0]) model_instance = model_class(dir_model, ftype, fname_out, is_big_endian) print("Set model parameters") @@ -59,3 +896,4 @@ print(f"Model successfully exported to '{fname_out}'") print("") + diff --git a/gguf-py/gguf/util.py b/gguf-py/gguf/util.py deleted file mode 100644 index d44ae183fdff1..0000000000000 --- a/gguf-py/gguf/util.py +++ /dev/null @@ -1,25 +0,0 @@ -import argparse -from pathlib import Path - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") - - return parser.parse_args() diff --git a/model.py b/model.py deleted file mode 100644 index 65641b4d028e1..0000000000000 --- a/model.py +++ /dev/null @@ -1,819 +0,0 @@ -from __future__ import annotations - -import contextlib -import json -import os -import re -import sys -from enum import IntEnum -from pathlib import Path -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast - -import numpy as np -import torch - -if TYPE_CHECKING: - from torch import Tensor - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -class SentencePieceTokenTypes(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - - -class Model: - def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): - self.dir_model = dir_model - self.ftype = ftype - self.fname_out = fname_out - self.is_big_endian = is_big_endian - self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.is_safetensors = self._is_model_safetensors() - self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") - self.part_names = self._get_part_names() - self.hparams = Model.load_hparams(self.dir_model) - self.model_arch = self._get_model_architecture() - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess) - - def set_vocab(self): - self._set_vocab_gpt2() - - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for part_name in self.part_names: - print(f"gguf: loading model part '{part_name}'") - ctx: ContextManager[Any] - if self.is_safetensors: - from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) - else: - ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu")) - - with ctx as model_part: - for name in model_part.keys(): - data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] - yield name, data - - def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_block_count(self.hparams.get( - "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")), - )) - if (n_ctx := self.hparams.get("max_position_embeddings")) is not None: - self.gguf_writer.add_context_length(n_ctx) - if (n_embd := self.hparams.get("hidden_size")) is not None: - self.gguf_writer.add_embedding_length(n_embd) - if (n_ff := self.hparams.get("intermediate_size")) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - if (n_head := self.hparams.get("num_attention_head")) is not None: - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - def write(self): - self.write_tensors() - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file() - self.gguf_writer.close() - - def write_vocab(self): - self.gguf_writer.write_header_to_file() - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - - @staticmethod - def count_model_parts(dir_model: Path, prefix: str) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.endswith(prefix): - num_parts += 1 - - return num_parts - - @staticmethod - def load_hparams(dir_model): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) - - @staticmethod - def from_model_architecture(model_architecture): - if model_architecture == "StableLMEpochForCausalLM": - return StableLMModel - if model_architecture == "GPTNeoXForCausalLM": - return GPTNeoXModel - if model_architecture == "BloomForCausalLM": - return BloomModel - if model_architecture == "MPTForCausalLM": - return MPTModel - if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return BaichuanModel - if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): - return FalconModel - if model_architecture == "GPTBigCodeForCausalLM": - return StarCoderModel - if model_architecture == "GPTRefactForCausalLM": - return RefactModel - if model_architecture == "PersimmonForCausalLM": - return PersimmonModel - return Model - - def _is_model_safetensors(self) -> bool: - return Model.count_model_parts(self.dir_model, ".safetensors") > 0 - - def _get_part_names(self): - if self.is_safetensors: - if self.num_parts == 1: # there's only one .safetensors file - return ("model.safetensors",) - return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) - - if self.num_parts == 1: # there's only one .bin file - return ("pytorch_model.bin",) - return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) - - def _get_model_architecture(self) -> gguf.MODEL_ARCH: - arch = self.hparams["architectures"][0] - if arch == "GPTNeoXForCausalLM": - return gguf.MODEL_ARCH.GPTNEOX - if arch == "BloomForCausalLM": - return gguf.MODEL_ARCH.BLOOM - if arch == "MPTForCausalLM": - return gguf.MODEL_ARCH.MPT - if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return gguf.MODEL_ARCH.BAICHUAN - if arch == "FalconForCausalLM": - return gguf.MODEL_ARCH.FALCON - if arch == "GPTBigCodeForCausalLM": - return gguf.MODEL_ARCH.STARCODER - if arch == "GPTRefactForCausalLM": - return gguf.MODEL_ARCH.REFACT - if arch == "PersimmonForCausalLM": - return gguf.MODEL_ARCH.PERSIMMON - - raise NotImplementedError(f'Architecture "{arch}" not supported!') - - def _set_vocab_gpt2(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytearray] = [] - toktypes: list[int] = [] - - from transformers import AutoTokenizer # type: ignore[attr-defined] - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - assert max(tokenizer.vocab.values()) < vocab_size - - reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} - added_vocab = tokenizer.get_added_vocab() - - for i in range(vocab_size): - if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode('utf-8') - tokens.append(bytearray(pad_token)) - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) - special_vocab.add_to_gguf(self.gguf_writer) - - def _set_vocab_sentencepiece(self): - from sentencepiece import SentencePieceProcessor - - tokenizer_path = self.dir_model / 'tokenizer.model' - - tokens: list[bytes] = [] - scores: list[float] = [] - toktypes: list[int] = [] - - if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) - sys.exit(1) - - tokenizer = SentencePieceProcessor(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - - for token_id in range(vocab_size): - piece = tokenizer.id_to_piece(token_id) - text = piece.encode("utf-8") - score = tokenizer.get_score(token_id) - - toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.is_unknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.is_control(token_id): - toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.is_unused(token_id): - toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.is_byte(token_id): - toktype = SentencePieceTokenTypes.BYTE - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - added_tokens_file = self.dir_model / 'added_tokens.json' - if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - added_tokens_json = json.load(f) - - for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) - - self.gguf_writer.add_tokenizer_model("llama") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - - special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) - special_vocab.add_to_gguf(self.gguf_writer) - - -class StableLMModel(Model): - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rope_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), - ) - self.gguf_writer.add_layer_norm_eps(1e-5) - - -class GPTNeoXModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])), - ) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - - -class BloomModel(Model): - def set_gguf_parameters(self): - self.gguf_writer.add_name("Bloom") - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed)) - self.gguf_writer.add_embedding_length(n_embed) - self.gguf_writer.add_feed_forward_length(4 * n_embed) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - block_count = self.hparams["n_layer"] - tensors = dict(self.get_tensors()) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - has_lm_head = True - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) - n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - - for name, data_torch in tensors.items(): - if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): - has_lm_head = False - - name = re.sub(r'transformer\.', '', name) - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) - data = np.concatenate( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - axis=0, - ) - print("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) - data = np.concatenate( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - axis=0, - ) - print("re-format attention.linear_qkv.bias") - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - if not has_lm_head and name == "word_embeddings.weight": - self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") - - -class MPTModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["n_layers"] - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"]) - self.gguf_writer.add_head_count(self.hparams["n_heads"]) - if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"): - self.gguf_writer.add_head_count_kv(kv_n_heads) - self.gguf_writer.add_layer_norm_eps(1e-5) - if self.hparams["attn_config"]["clip_qkv"] is not None: - self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) - - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - # note: MPT output is tied to (same as) wte in original model; - # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ - if new_name == "token_embd.weight": - self.gguf_writer.add_tensor("output.weight", data) - - -class BaichuanModel(Model): - def set_vocab(self): - self._set_vocab_sentencepiece() - - def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - hf_repo = self.hparams.get("_name_or_path", "") - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - print("gguf: can not find ctx length parameter.") - sys.exit() - - self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_source_hf_repo(hf_repo) - self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - - if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: - if self.hparams["rope_scaling"].get("type") == "linear": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - - def write_tensors(self): - # Collect tensors from generator object - model_kv = dict(self.get_tensors()) - block_count = self.hparams["num_hidden_layers"] - head_count = self.hparams["num_attention_heads"] - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - for i in range(block_count): - if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: - print(f"Unpacking and permuting layer {i}") - model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ - self._reverse_hf_permute_part(w, 0, head_count, head_count) - model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ - self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) - model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \ - self._reverse_hf_part(w, 2) - del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] - - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return ( - weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def _reverse_hf_permute_part( - self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None, - ) -> Tensor: - r = weights.shape[0] // 3 - return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv) - - def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: - r = weights.shape[0] // 3 - return weights[r * n_part:r * n_part + r, ...] - - -class FalconModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams.get("num_hidden_layers") - if block_count is None: - block_count = self.hparams["n_layer"] # old name - - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - self.gguf_writer.add_name("Falcon") - self.gguf_writer.add_context_length(2048) # not in config.json - self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(n_head) - self.gguf_writer.add_head_count_kv(n_head_kv) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - block_count = self.hparams.get("num_hidden_layers") - if block_count is None: - block_count = self.hparams["n_layer"] # old name - - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name - - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name - - head_dim = self.hparams["hidden_size"] // n_head - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in self.get_tensors(): - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - -class StarCoderModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("StarCoder") - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - -class RefactModel(Model): - def set_gguf_parameters(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - - block_count = self.hparams["n_layer"] - - self.gguf_writer.add_name("Refact") - # refact uses Alibi. So this is from config.json which might be used by training. - self.gguf_writer.add_context_length(self.hparams["n_positions"]) - self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) - - self.gguf_writer.add_feed_forward_length(ff_dim) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(self.hparams["n_head"]) - self.gguf_writer.add_head_count_kv(1) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - - def write_tensors(self): - hidden_dim = self.hparams["n_embd"] - inner_dim = 4 * hidden_dim - hidden_dim = int(2 * inner_dim / 3) - multiple_of = 256 - ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - n_head = self.hparams["n_head"] - n_head_kv = 1 - head_dim = self.hparams["n_embd"] // n_head - block_count = self.hparams["n_layer"] - - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - tensors = dict(self.get_tensors()) - for i in range(block_count): - if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None: - tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim] - tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:] - del tensors[f"transformer.h.{i}.attn.kv.weight"] - if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None: - tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w - del tensors[f"transformer.h.{i}.attn.q.weight"] - if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None: - tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim] - tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:] - del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - - for name, data_torch in tensors.items(): - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - -class PersimmonModel(Model): - def set_gguf_parameters(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) - head_count = self.hparams["num_attention_heads"] - head_count_kv = head_count - hidden_size = self.hparams["hidden_size"] - - self.gguf_writer.add_name('persimmon-8b-chat') - self.gguf_writer.add_embedding_length(hidden_size) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - - def set_vocab(self): - self._set_vocab_sentencepiece() - # self.gguf_writer.add_bos_token_id(71013) - # self.gguf_writer.add_eos_token_id(71013) - - def write_tensors(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in self.get_tensors(): - if name.endswith(".self_attention.rotary_emb.inv_freq"): - continue - old_dtype = data_torch.dtype - # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) - data = data_torch.to(torch.float32).squeeze().numpy() - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - n_dims = len(data.shape) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) From fefc3db527ea58587a989c850b29374e45285bc7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Sun, 5 Nov 2023 16:24:48 -0500 Subject: [PATCH 33/42] address review comments --- convert-hf-to-gguf.py | 2 -- convert.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a30725be4d7b0..33c461850b573 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -895,5 +895,3 @@ def parse_args() -> argparse.Namespace: model_instance.write() print(f"Model successfully exported to '{fname_out}'") -print("") - diff --git a/convert.py b/convert.py index fef9d0ce8be31..27adaf2c467a8 100755 --- a/convert.py +++ b/convert.py @@ -26,7 +26,7 @@ from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar import numpy as np -from sentencepiece import SentencePieceProcessor # type: ignore[import-untyped] +from sentencepiece import SentencePieceProcessor import os if 'NO_LOCAL_GGUF' not in os.environ: From 648252ecda50f9f700f04a8b89e1fcdda6d07da2 Mon Sep 17 00:00:00 2001 From: Galunid Date: Mon, 6 Nov 2023 06:28:21 +0100 Subject: [PATCH 34/42] Fix flake8 complaints --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 33c461850b573..599dda20789df 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -3,7 +3,6 @@ from __future__ import annotations from pathlib import Path from enum import IntEnum -from pathlib import Path from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast @@ -849,6 +848,7 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() + args = parse_args() dir_model = args.model From 05fb6f4e8c57c0cb44e21006c0b24d17acac14f9 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Mon, 6 Nov 2023 15:57:54 -0500 Subject: [PATCH 35/42] sort imports --- convert-hf-to-gguf.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 599dda20789df..7628043e9e30d 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1,17 +1,17 @@ #!/usr/bin/env python3 from __future__ import annotations -from pathlib import Path -from enum import IntEnum -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast - -import os -import sys +import argparse import contextlib -import re import json -import argparse +import os +import re +import sys +from enum import IntEnum +from pathlib import Path +from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast + import numpy as np import torch From 7a3433b4b673e637fd301828f2f7f198f100e4ff Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 7 Nov 2023 07:25:43 +0100 Subject: [PATCH 36/42] store_true defaults to False, not None --- convert-hf-to-gguf.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 7628043e9e30d..c77c8b158957a 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -870,16 +870,12 @@ def parse_args() -> argparse.Namespace: # output in the same directory as the model by default fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' -is_big_endian = False -if args.bigendian is not None: - is_big_endian = True - print(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) model_class = Model.from_model_architecture(hparams["architectures"][0]) -model_instance = model_class(dir_model, ftype, fname_out, is_big_endian) +model_instance = model_class(dir_model, ftype, fname_out, args.bigendian) print("Set model parameters") model_instance.set_gguf_parameters() From 73780f59392d08d9457d433458bd08334ac75ac4 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 7 Nov 2023 10:34:25 +0100 Subject: [PATCH 37/42] Change ftype from int value to str value --- convert-hf-to-gguf.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index c77c8b158957a..34f3e25f6ffe7 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -841,7 +841,7 @@ def parse_args() -> argparse.Namespace: help="directory containing model file, or model file itself (*.bin)", ) parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', + "ftype", type=str, choices=["f32", "f16"], default="f16", nargs='?', help="output format - use 0 for float32, 1 for float16", ) parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") @@ -857,25 +857,20 @@ def parse_args() -> argparse.Namespace: print(f'Error: {args.model} is not a directory', file=sys.stderr) sys.exit(1) -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string ftype_str = ["f32", "f16"] if args.outfile is not None: fname_out = args.outfile else: # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + fname_out = dir_model / f'ggml-model-{ftype}.gguf' print(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) model_class = Model.from_model_architecture(hparams["architectures"][0]) -model_instance = model_class(dir_model, ftype, fname_out, args.bigendian) +model_instance = model_class(dir_model, ftype_str.index(ftype), fname_out, args.bigendian) print("Set model parameters") model_instance.set_gguf_parameters() From 88b0d9effce24114426ffb34c11231db7c7f1fc2 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 7 Nov 2023 23:14:58 +0100 Subject: [PATCH 38/42] Review fixes --- convert-hf-to-gguf.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 34f3e25f6ffe7..5c57473a7cb86 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -837,14 +837,14 @@ def parse_args() -> argparse.Namespace: help="path to write to; default: based on input", ) parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=str, choices=["f32", "f16"], default="f16", nargs='?', + "--outtype", type=str, choices=["f32", "f16"], default="f16", nargs='?', help="output format - use 0 for float32, 1 for float16", ) parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + parser.add_argument( + "model", type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) return parser.parse_args() @@ -852,25 +852,27 @@ def parse_args() -> argparse.Namespace: args = parse_args() dir_model = args.model -ftype = args.ftype if not dir_model.is_dir(): print(f'Error: {args.model} is not a directory', file=sys.stderr) sys.exit(1) -ftype_str = ["f32", "f16"] +ftype = { + "f32": gguf.GGMLQuantizationType.F32, + "f16": gguf.GGMLQuantizationType.F16, +} if args.outfile is not None: fname_out = args.outfile else: # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype}.gguf' + fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' print(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) model_class = Model.from_model_architecture(hparams["architectures"][0]) -model_instance = model_class(dir_model, ftype_str.index(ftype), fname_out, args.bigendian) +model_instance = model_class(dir_model, ftype[args.outtype], fname_out, args.bigendian) print("Set model parameters") model_instance.set_gguf_parameters() From b7148838f5748c02382716309a09c221e171471f Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 7 Nov 2023 23:16:43 +0100 Subject: [PATCH 39/42] Rename variable --- convert-hf-to-gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 5c57473a7cb86..778f285e6799c 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -856,7 +856,7 @@ def parse_args() -> argparse.Namespace: print(f'Error: {args.model} is not a directory', file=sys.stderr) sys.exit(1) -ftype = { +ftype_map = { "f32": gguf.GGMLQuantizationType.F32, "f16": gguf.GGMLQuantizationType.F16, } @@ -872,7 +872,7 @@ def parse_args() -> argparse.Namespace: hparams = Model.load_hparams(dir_model) model_class = Model.from_model_architecture(hparams["architectures"][0]) -model_instance = model_class(dir_model, ftype[args.outtype], fname_out, args.bigendian) +model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) print("Set model parameters") model_instance.set_gguf_parameters() From 6a83bce114f06ba5f4c84725925c7af3d6052f16 Mon Sep 17 00:00:00 2001 From: Galunid Date: Tue, 7 Nov 2023 23:48:17 +0100 Subject: [PATCH 40/42] parse_args improvements --- convert-hf-to-gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 778f285e6799c..4b87a3c52351d 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -837,13 +837,13 @@ def parse_args() -> argparse.Namespace: help="path to write to; default: based on input", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16"], default="f16", nargs='?', - help="output format - use 0 for float32, 1 for float16", + "--outtype", type=str, choices=["f32", "f16"], default="f16", + help="output format - use f32 for float32, f16 for float16", ) parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") parser.add_argument( "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", + help="directory containing model file", ) return parser.parse_args() From 2862d16facdea1a7656d7a0d64a0d54c82ea6879 Mon Sep 17 00:00:00 2001 From: Galunid Date: Wed, 8 Nov 2023 03:47:14 +0100 Subject: [PATCH 41/42] Remove outdated scripts --- convert-baichuan-hf-to-gguf.py | 316 -------------------------------- convert-bloom-hf-to-gguf.py | 247 ------------------------- convert-falcon-hf-to-gguf.py | 253 ------------------------- convert-gptneox-hf-to-gguf.py | 221 ---------------------- convert-mpt-hf-to-gguf.py | 227 ----------------------- convert-refact-hf-to-gguf.py | 272 --------------------------- convert-starcoder-hf-to-gguf.py | 210 --------------------- 7 files changed, 1746 deletions(-) delete mode 100755 convert-baichuan-hf-to-gguf.py delete mode 100755 convert-bloom-hf-to-gguf.py delete mode 100755 convert-falcon-hf-to-gguf.py delete mode 100755 convert-gptneox-hf-to-gguf.py delete mode 100755 convert-mpt-hf-to-gguf.py delete mode 100755 convert-refact-hf-to-gguf.py delete mode 100755 convert-starcoder-hf-to-gguf.py diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py deleted file mode 100755 index 5ee99be73134e..0000000000000 --- a/convert-baichuan-hf-to-gguf.py +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env python3 -# HF baichuan --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any -import itertools -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - -# reverse HF permute back to original pth layout - - -def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - -def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray: - r = weights.shape[0] // 3 - return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) - -def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray: - r = weights.shape[0] // 3 - return weights[r * n_part : r * n_part + r, ...] - -def count_model_parts(dir_model: str) -> int: - num_parts = 0 - - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - - return num_parts - - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -endianess = gguf.GGUFEndian.LITTLE -if args.bigendian: - endianess = gguf.GGUFEndian.BIG -endianess_str = "Big Endian" if args.bigendian else "Little Endian" -print(f"gguf: Conversion Endianess {endianess}") -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) -print("hello print: ",hparams["architectures"][0]) -if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) -print(f"num_parts:{num_parts}\n") -ARCH=gguf.MODEL_ARCH.BAICHUAN -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] -else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -elif "model_max_length" in hparams: - ctx_length = hparams["model_max_length"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab, scores and token types") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) -vocab_size = hparams.get('vocab_size') -if vocab_size is None: - vocab_size = tokenizer.vocab_size() - -for i in range(vocab_size): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - tmp=model_part - for i in range(block_count): - if f"model.layers.{i}.self_attn.W_pack.weight" in model_part: - print(f"Unpacking and permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv) - tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2) - del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] - - for name in model_part.keys(): - data = model_part[name] - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-bloom-hf-to-gguf.py b/convert-bloom-hf-to-gguf.py deleted file mode 100755 index 6e866d9434818..0000000000000 --- a/convert-bloom-hf-to-gguf.py +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env python3 -# HF bloom --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import re -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -# Supported Models: -# https://huggingface.co/bigscience/bloom-1b7 -# https://huggingface.co/bigscience/bloom-3b -# https://huggingface.co/bigscience/bloom-7b1 -# https://huggingface.co/Langboat/bloom-1b4-zh -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "BloomForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.BLOOM -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["n_layer"] - -gguf_writer.add_name("Bloom") -n_embed = hparams.get("hidden_size", hparams.get("n_embed")) -n_head = hparams.get("n_head", hparams.get("num_attention_heads")) -gguf_writer.add_context_length(hparams.get("seq_length", n_embed)) -gguf_writer.add_embedding_length(n_embed) -gguf_writer.add_feed_forward_length(4 * n_embed) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(n_head) -gguf_writer.add_head_count_kv(n_head) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -added_vocab = tokenizer.get_added_vocab() -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH, block_count) - -# params for qkv transform -n_head_kv = hparams.get("n_head_kv", n_head) -head_dim = n_embed // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(dir_model / part_name, map_location="cpu") - - has_lm_head = True - if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys(): - has_lm_head = False - - for original_name in model_part.keys(): - data = model_part[original_name] - name = re.sub(r'transformer\.', '', original_name) - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) - data = np.concatenate( - (qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed))), - axis=0 - ) - print("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) - data = np.concatenate( - (qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,))), - axis=0 - ) - print("re-format attention.linear_qkv.bias") - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - if not has_lm_head and name == "word_embeddings.weight": - gguf_writer.add_tensor("output.weight", data) - print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py deleted file mode 100755 index 8e8f3c3f8f1e0..0000000000000 --- a/convert-falcon-hf-to-gguf.py +++ /dev/null @@ -1,253 +0,0 @@ -#!/usr/bin/env python3 -# HF falcon--> gguf conversion - -from __future__ import annotations - -import argparse -import contextlib -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path, prefix: str) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith(prefix): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"): - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model, "model-00") -if num_parts: - is_safetensors = True - from safetensors import safe_open -else: - is_safetensors = False - num_parts = count_model_parts(dir_model, "pytorch_model-") - -ARCH=gguf.MODEL_ARCH.FALCON -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams.get("num_hidden_layers") -if block_count is None: - block_count = hparams["n_layer"] # old name - -n_head = hparams.get("num_attention_heads") -if n_head is None: - n_head = hparams["n_head"] # old name - -n_head_kv = hparams.get("num_kv_heads") -if n_head_kv is None: - n_head_kv = hparams.get("n_head_kv", 1) # old name - -gguf_writer.add_name("Falcon") -gguf_writer.add_context_length(2048) # not in config.json -gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(n_head) -gguf_writer.add_head_count_kv(n_head_kv) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - tokens.append(reverse_vocab[i]) - scores.append(0.0) # dummy - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -head_dim = hparams["hidden_size"] // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -elif is_safetensors: - part_names = ( - f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1) - ) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - if is_safetensors: - ctx = safe_open(dir_model / part_name, framework="pt", device="cpu") - else: - ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu")) - - with ctx as model_part: - for name in model_part.keys(): - data = model_part.get_tensor(name) if is_safetensors else model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data = torch.cat((q,k,v)).reshape_as(data) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py deleted file mode 100755 index 02d1fdf164eea..0000000000000 --- a/convert-gptneox-hf-to-gguf.py +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env python3 -# HF gptneox--> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "GPTNeoXForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.GPTNEOX -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_context_length(hparams["max_position_embeddings"]) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))) -gguf_writer.add_head_count(hparams["num_attention_heads"]) -gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"]) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -added_vocab = tokenizer.get_added_vocab() -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - # we don't need these - if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py deleted file mode 100755 index 70d154b3f5c01..0000000000000 --- a/convert-mpt-hf-to-gguf.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/env python3 -# HF mpt--> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file") - parser.add_argument( - "--vocab-only", action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", type=int, choices=[0, 1], default=1, nargs='?', - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "MPTForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.MPT -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["n_layers"] - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_context_length(hparams["max_seq_len"]) -gguf_writer.add_embedding_length(hparams["d_model"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(4 * hparams["d_model"]) -gguf_writer.add_head_count(hparams["n_heads"]) -if kv_n_heads := hparams["attn_config"].get("kv_n_heads"): - gguf_writer.add_head_count_kv(kv_n_heads) -gguf_writer.add_layer_norm_eps(1e-05) -if hparams["attn_config"]["clip_qkv"] is not None: - gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"]) -gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"]) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but -# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to -# accomodate some "reserved" tokens; this is causing problems down the line in -# llama.cpp, so we pad the vocab with dummy tokens: - -vocab_size = hparams["vocab_size"] - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -added_vocab = tokenizer.get_added_vocab() -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Cannot map tensor '" + name + "'") - continue # for the sake of compatibility with some old published models, don't quit - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - # note: MPT output is tied to (same as) wte in original model; - # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ - if new_name == "token_embd.weight": - gguf_writer.add_tensor("output.weight", data) - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-refact-hf-to-gguf.py b/convert-refact-hf-to-gguf.py deleted file mode 100755 index f0cfe84d81c8b..0000000000000 --- a/convert-refact-hf-to-gguf.py +++ /dev/null @@ -1,272 +0,0 @@ -#!/usr/bin/env python3 -# HF refact--> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import sys -from pathlib import Path - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if "NO_LOCAL_GGUF" not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf")) -import gguf - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert a Refact model to a GGML compatible file" - ) - parser.add_argument( - "--vocab-only", - action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", - type=Path, - help="path to write to; default: based on input", - ) - parser.add_argument( - "model", - type=Path, - help="directory containing model file, or model file itself (*.bin)", - ) - parser.add_argument( - "ftype", - type=int, - choices=[0, 1], - default=1, - nargs="?", - help="output format - use 0 for float32, 1 for float16", - ) - return parser.parse_args() - - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f"Error: {args.model} is not a directory", file=sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf" - -print("gguf: loading model " + dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "GPTRefactForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH = gguf.MODEL_ARCH.REFACT -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -# Get refact feed forward dimension -hidden_dim = hparams["n_embd"] -inner_dim = 4 * hidden_dim -hidden_dim = int(2 * inner_dim / 3) -multiple_of = 256 -ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - -block_count = hparams["n_layer"] - -gguf_writer.add_name("Refact") -# refact uses Alibi. So this is from config.json which might be used by training. -gguf_writer.add_context_length(hparams["n_positions"]) -gguf_writer.add_embedding_length(hparams["n_embd"]) - -gguf_writer.add_feed_forward_length(ff_dim) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(hparams["n_head"]) -gguf_writer.add_head_count_kv(1) -gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -added_vocab = tokenizer.get_added_vocab() -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH, block_count) - -# params for qkv transform -n_head = hparams["n_head"] -n_head_kv = 1 - -head_dim = hparams["n_embd"] // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(dir_model / part_name, map_location="cpu") - - for i in range(block_count): - if f"transformer.h.{i}.attn.kv.weight" in model_part: - data = model_part[f"transformer.h.{i}.attn.kv.weight"] - model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[ - : n_head_kv * head_dim - ] - model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[ - n_head_kv * head_dim : - ] - del model_part[f"transformer.h.{i}.attn.kv.weight"] - if f"transformer.h.{i}.attn.q.weight" in model_part: - model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[ - f"transformer.h.{i}.attn.q.weight" - ] - del model_part[f"transformer.h.{i}.attn.q.weight"] - if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part: - data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim] - model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:] - del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"] - - for name in model_part.keys(): - data = model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ( - ftype == 1 - and data_dtype == np.float32 - and name.endswith(".weight") - and n_dims == 2 - ): - data = data.astype(np.float16) - - print( - new_name - + ", n_dims = " - + str(n_dims) - + ", " - + str(old_dtype) - + " --> " - + str(data.dtype) - ) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py deleted file mode 100755 index a9bfed85e31ba..0000000000000 --- a/convert-starcoder-hf-to-gguf.py +++ /dev/null @@ -1,210 +0,0 @@ -#!/usr/bin/env python3 -# HF starcoder --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import Any - -import numpy as np -import torch -from transformers import AutoTokenizer # type: ignore[import] - -if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) -import gguf - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "GPTBigCodeForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - - sys.exit(1) - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.STARCODER -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["n_layer"] - -gguf_writer.add_name("StarCoder") -gguf_writer.add_context_length(hparams["n_positions"]) -gguf_writer.add_embedding_length(hparams["n_embd"]) -gguf_writer.add_feed_forward_length(4 * hparams["n_embd"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_head_count(hparams["n_head"]) -gguf_writer.add_head_count_kv(1) -gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) -gguf_writer.add_file_type(ftype) - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytearray] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -# gpt2 tokenizer -gguf_writer.add_tokenizer_model("gpt2") - -print("gguf: get gpt2 tokenizer vocab") - -# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py -tokenizer = AutoTokenizer.from_pretrained(dir_model) - -# The number of tokens in tokenizer.json can differ from the expected vocab size. -# This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) -assert max(tokenizer.vocab.values()) < vocab_size - -added_vocab = tokenizer.get_added_vocab() -reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - -for i in range(vocab_size): - if i not in reverse_vocab: - tokens.append(f"[PAD{i}]") - toktypes.append(gguf.TokenType.USER_DEFINED) - elif reverse_vocab[i] in added_vocab: - tokens.append(reverse_vocab[i]) - if tokenizer.added_tokens_decoder[i].special: - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - tokens.append(reverse_vocab[i]) - toktypes.append(gguf.TokenType.NORMAL) - -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_types(toktypes) -special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens)) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# params for qkv transform -n_head = hparams["n_head"] -n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1 - -head_dim = hparams["n_embd"] // n_head - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(dir_model / part_name, map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") From 2789ad939298168127716f5c00824067b045213c Mon Sep 17 00:00:00 2001 From: Galunid Date: Thu, 9 Nov 2023 04:37:44 +0100 Subject: [PATCH 42/42] Fix import path --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 4b87a3c52351d..f7fe29fd4262a 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -19,7 +19,7 @@ from torch import Tensor if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf