Skip to content

Commit fb8e2fe

Browse files
committed
Add support for loading merges.txt
Add --padvocab option to convert.py. Other minor cleanups.
1 parent 2833a6f commit fb8e2fe

File tree

3 files changed

+62
-8
lines changed

3 files changed

+62
-8
lines changed

convert.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -779,20 +779,29 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
779779
break
780780
yield result
781781

782-
def check_vocab_size(params: Params, vocab: Vocab) -> None:
782+
def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
783783
if params.n_vocab != vocab.vocab_size:
784784
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
785785
if params.n_vocab == vocab.vocab_size_base:
786786
print("Ignoring added_tokens.json since model matches vocab size without it.")
787787
vocab.added_tokens_list = []
788788
vocab.vocab_size = vocab.vocab_size_base
789789
return
790+
if pad_vocab and params.n_vocab > vocab.vocab_size:
791+
pad_count = params.n_vocab - vocab.vocab_size
792+
print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
793+
for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
794+
vocab.added_tokens_list.append(f'<dummy{i:05}>')
795+
vocab.vocab_size = params.n_vocab
796+
return
790797
msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
791798
if vocab.fname_added_tokens is not None:
792799
msg += f" combined with {vocab.fname_added_tokens}"
793800
msg += f" has {vocab.vocab_size})."
794801
if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
795802
msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
803+
if vocab.vocab_size < params.n_vocab:
804+
msg += " Possibly try using the --padvocab option."
796805
raise Exception(msg)
797806

798807

@@ -877,8 +886,12 @@ def close(self) -> None:
877886
self.gguf.close()
878887

879888
@staticmethod
880-
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
881-
check_vocab_size(params, vocab)
889+
def write_vocab_only(
890+
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
891+
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
892+
pad_vocab: bool = False,
893+
) -> None:
894+
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
882895

883896
of = OutputFile(fname_out, endianess=endianess)
884897

@@ -905,8 +918,14 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
905918
return dt.quantize(arr)
906919

907920
@staticmethod
908-
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
909-
check_vocab_size(params, vocab)
921+
def write_all(
922+
fname_out : Path, ftype: GGMLFileType, params: Params,
923+
model : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
924+
concurrency: int = DEFAULT_CONCURRENCY,
925+
endianess : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
926+
pad_vocab : bool = False,
927+
) -> None:
928+
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
910929

911930
of = OutputFile(fname_out, endianess=endianess)
912931

@@ -1126,6 +1145,7 @@ def main(args_in: list[str] | None = None) -> None:
11261145
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
11271146
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
11281147
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
1148+
parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
11291149

11301150
args = parser.parse_args(args_in)
11311151
if args.dump_single:
@@ -1173,7 +1193,8 @@ def main(args_in: list[str] | None = None) -> None:
11731193
load_merges = args.vocabtype == 'bpe',
11741194
n_vocab = vocab.vocab_size)
11751195
outfile = args.outfile
1176-
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
1196+
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
1197+
endianess = endianess, pad_vocab = args.padvocab)
11771198
print(f"Wrote {outfile}")
11781199
return
11791200

@@ -1196,7 +1217,8 @@ def main(args_in: list[str] | None = None) -> None:
11961217
params.ftype = ftype
11971218
print(f"Writing {outfile}, format {ftype}")
11981219

1199-
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
1220+
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
1221+
concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
12001222
print(f"Wrote {outfile}")
12011223

12021224

gguf-py/gguf/gguf.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,35 @@ def __init__(
10231023
def _load(self, path: Path) -> None:
10241024
if not self._try_load_from_tokenizer_json(path):
10251025
self._try_load_from_config_json(path)
1026+
if self.load_merges and len(self.merges) == 0:
1027+
self._try_load_merges_txt(path)
1028+
1029+
def _try_load_merges_txt(self, path: Path) -> bool:
1030+
merges_file = path / 'merges.txt'
1031+
if not merges_file.is_file():
1032+
return False
1033+
with open(merges_file, 'r') as fp:
1034+
first_line = next(fp, '').strip()
1035+
if not first_line.startswith('#'):
1036+
fp.seek(0)
1037+
line_num = 0
1038+
else:
1039+
line_num = 1
1040+
merges = []
1041+
for line in fp:
1042+
line_num += 1
1043+
line = line.strip()
1044+
if len(line) == 0:
1045+
continue
1046+
parts = line.split(None, 3)
1047+
if len(parts) != 2:
1048+
print(f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
1049+
file = sys.stderr)
1050+
continue
1051+
merges.append(f'{parts[0]} {parts[1]}')
1052+
self.merges = merges
1053+
return True
1054+
10261055

10271056
def _set_special_token(self, typ: str, tid: Any):
10281057
if not isinstance(tid, int) or tid < 0:
@@ -1083,6 +1112,9 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
10831112
if not quiet:
10841113
print(f'gguf: Adding {len(self.merges)} merge(s).')
10851114
gw.add_token_merges(self.merges)
1115+
elif self.load_merges:
1116+
print('gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
1117+
file = sys.stderr)
10861118
for typ, tokid in self.special_token_ids.items():
10871119
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
10881120
if handler is None:

gguf-py/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "gguf"
3-
version = "0.4.5"
3+
version = "0.4.6"
44
description = "Write ML models in GGUF for GGML"
55
authors = ["GGML <[email protected]>"]
66
packages = [

0 commit comments

Comments (0)