Models without Vocabulary #5798


Merged · 10 commits · Mar 14, 2024
Changes from 7 commits
132 changes: 77 additions & 55 deletions convert.py
@@ -332,6 +332,9 @@ def load(model_plus: ModelPlus) -> Params:
#

class BpeVocab:
tokenizer_model = "gpt2"
name = "bpe"

def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
if isinstance(self.bpe_tokenizer.get('model'), dict):
@@ -390,6 +393,9 @@ def __repr__(self) -> str:


class SentencePieceVocab:
tokenizer_model = "llama"
name = "spm"

def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
added_tokens: dict[str, int]
@@ -453,6 +459,9 @@ def __repr__(self) -> str:


class HfVocab:
tokenizer_model = "llama"
name = "hfft"

def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
try:
from transformers import AutoTokenizer
@@ -553,7 +562,15 @@ def __repr__(self) -> str:
return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
class NoVocab:
tokenizer_model = "no_vocab"
name = "no_vocab"

def __repr__(self) -> str:
return "<NoVocab for a model without integrated vocabulary>"


Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"


#
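
Each vocab class above now carries its GGUF tokenizer model and a short name as plain class attributes. A minimal, self-contained sketch of that pattern (class and function names here are illustrative, not part of the diff):

class GptLikeVocab:                   # stands in for BpeVocab
    tokenizer_model = "gpt2"          # written as tokenizer.ggml.model
    name = "bpe"

class NoVocab:
    tokenizer_model = "no_vocab"
    name = "no_vocab"

def tokenizer_model_for(vocab) -> str:
    # data-driven dispatch; replaces the type(vocab) lookup removed further down
    return vocab.tokenizer_model

assert tokenizer_model_for(GptLikeVocab()) == "gpt2"
assert tokenizer_model_for(NoVocab()) == "no_vocab"
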
@@ -931,13 +948,19 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
yield result


def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
def check_vocab_size(params: Params, vocab: Vocab) -> None:
# Handle special case where the model's vocab size is not set
if params.n_vocab == -1:
raise ValueError(
f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
)


def prepare_vocab(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
check_vocab_size(params, vocab)
if vocab.name == "no_vocab":
return

# Check for a vocab size mismatch
if params.n_vocab == vocab.vocab_size:
print("Ignoring added_tokens.json since model matches vocab size without it.")
@@ -977,6 +1000,7 @@ def add_meta_arch(self, params: Params) -> None:
name = str(params.path_model.parent).split('/')[-1]

self.gguf.add_name (name)
self.gguf.add_vocab_size (params.n_vocab)
self.gguf.add_context_length (params.n_ctx)
self.gguf.add_embedding_length (params.n_embd)
self.gguf.add_block_count (params.n_layer)
@@ -1013,20 +1037,6 @@ def add_meta_arch(self, params: Params) -> None:
if params.ftype is not None:
self.gguf.add_file_type(params.ftype)

def handle_tokenizer_model(self, vocab: Vocab) -> str:
# Map the vocab types to the supported tokenizer models
tokenizer_model = {
SentencePieceVocab: "llama",
HfVocab: "llama",
BpeVocab: "gpt2",
}.get(type(vocab))

# Block if vocab type is not predefined
if tokenizer_model is None:
raise ValueError("Unknown vocab type: Not supported")

return tokenizer_model

def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
tokens = []
scores = []
@@ -1043,11 +1053,8 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list
return tokens, scores, toktypes

def add_meta_vocab(self, vocab: Vocab) -> None:
# Handle the tokenizer model
tokenizer_model = self.handle_tokenizer_model(vocab)

# Ensure that tokenizer_model is added to the GGUF model
self.gguf.add_tokenizer_model(tokenizer_model)
self.gguf.add_tokenizer_model(vocab.tokenizer_model)

# Extract model vocabulary for model conversion
tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
@@ -1074,6 +1081,26 @@ def write_meta(self) -> None:
def write_tensor_info(self) -> None:
self.gguf.write_ti_data_to_file()

def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
if ftype == GGMLFileType.MostlyQ8_0:
ndarrays = bounded_parallel_map(
OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
use_processpool_executor=True,
)
else:
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)

start = time.time()
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
elapsed = time.time() - start
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model)))
print(
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
)
self.gguf.write_tensor_data(ndarray)

def close(self) -> None:
self.gguf.close()

@@ -1082,7 +1109,7 @@ def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
prepare_vocab(params, vocab, pad_vocab=pad_vocab)

of = OutputFile(fname_out, endianess=endianess)

@@ -1114,14 +1141,17 @@ def write_all(
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
prepare_vocab(params, vocab, pad_vocab=pad_vocab)

of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_arch(params)
of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)
if vocab.name == "no_vocab":
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
else:
of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)

# tensor info
for name, lazy_tensor in model.items():
@@ -1131,24 +1161,7 @@ def write_all(
of.write_tensor_info()

# tensor data
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
if ftype == GGMLFileType.MostlyQ8_0:
ndarrays = bounded_parallel_map(
OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
use_processpool_executor=True,
)
else:
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)

start = time.time()
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
elapsed = time.time() - start
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model)))
print(
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
)
of.gguf.write_tensor_data(ndarray)
of.write_tensor_data(ftype, model, concurrency)

of.close()

@@ -1309,8 +1322,8 @@ def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
return vtype, path
raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")

def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
load_merges = vocabtype == "bpe"
def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
load_merges = vocab.name == "bpe"
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
return gguf.SpecialVocab(
model_parent_path,
@@ -1319,30 +1332,34 @@ def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path:
n_vocab=n_vocab,
)

def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
vocab_type, path = self._select_file(vocab_types)
print(f"Loading vocab file {path!r}, type {vocab_type!r}")

added_tokens_path = path.parent / "added_tokens.json"
vocab: Vocab
if vocab_type == "bpe":
vocab = BpeVocab(
return BpeVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
elif vocab_type == "spm":
vocab = SentencePieceVocab(
if vocab_type == "spm":
return SentencePieceVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
elif vocab_type == "hfft":
vocab = HfVocab(
if vocab_type == "hfft":
return HfVocab(
path.parent, added_tokens_path if added_tokens_path.exists() else None
)
raise ValueError(vocab_type)

def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
vocab: Vocab
if len(vocab_types) == 1 and "no_vocab" in vocab_types:
vocab = NoVocab()
else:
raise ValueError(vocab_type)
vocab = self._create_vocab_by_path(vocab_types)
# FIXME: Respect --vocab-dir?
special_vocab = self._create_special_vocab(
vocab,
vocab_type,
model_parent_path,
)
return vocab, special_vocab
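
Assuming load_vocab and its helpers live on convert.py's VocabFactory and that its constructor takes the model path (neither is shown in this hunk), the new branch could be exercised roughly like so — a hedged sketch, not output from the PR:

factory = VocabFactory(model_parent_path)                  # assumed constructor
vocab, special_vocab = factory.load_vocab(["no_vocab"], model_parent_path)
print(vocab)    # <NoVocab for a model without integrated vocabulary>
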
@@ -1380,6 +1397,7 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
@@ -1392,6 +1410,10 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")

args = parser.parse_args(args_in)
if args.no_vocab:
if args.vocab_only:
raise ValueError("no need to specify --vocab-only if using --no-vocab")
args.vocab_type = "no_vocab"

if args.dump_single:
model_plus = lazy_load_file(args.model)
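
With the flag wired up, a tensors-only conversion might be invoked like this (paths illustrative, and --outfile assumed from the script's existing options):

python convert.py models/mymodel --no-vocab --outfile models/mymodel/ggml-model-f16.gguf
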
@@ -1442,7 +1464,7 @@ def main(args_in: list[str] | None = None) -> None:
print(f"Wrote {outfile}")
return

if model_plus.vocab is not None and args.vocab_dir is None:
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
vocab = model_plus.vocab

print(f"Vocab info: {vocab}")
2 changes: 2 additions & 0 deletions gguf-py/gguf/constants.py
@@ -32,6 +32,7 @@ class General:
FILE_TYPE = "general.file_type"

class LLM:
VOCAB_SIZE = "{arch}.vocab_size"
CONTEXT_LENGTH = "{arch}.context_length"
EMBEDDING_LENGTH = "{arch}.embedding_length"
BLOCK_COUNT = "{arch}.block_count"
@@ -711,6 +712,7 @@ def get_type(val: Any) -> GGUFValueType:
KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE

# LLM
KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE
KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH
KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
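
The {arch} placeholder makes the key architecture-specific; a quick sketch of the expansion:

from gguf.constants import Keys

# "llama.vocab_size" for a llama-architecture model
assert Keys.LLM.VOCAB_SIZE.format(arch="llama") == "llama.vocab_size"
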
3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
@@ -313,6 +313,9 @@ def add_custom_alignment(self, alignment: int) -> None:
self.data_alignment = alignment
self.add_uint32(Keys.General.ALIGNMENT, alignment)

def add_vocab_size(self, size: int) -> None:
self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)

def add_context_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)
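
A short sketch of the new add_vocab_size call in isolation (file name and architecture are illustrative; only the vocab-size key is written here):

from gguf import GGUFWriter

writer = GGUFWriter("model.gguf", arch="llama")
writer.add_vocab_size(32000)      # stored as a uint32 under "llama.vocab_size"
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()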
