Skip to content

Commit d0aaff5

Browse files
mentjxhor and Jakub Horak authored
py : add temporary script to convert old ggml files to newer version (#539)
Co-authored-by: Jakub Horak <[email protected]>
1 parent d0330fd commit d0aaff5

File tree

2 files changed

+101
-1
lines changed

2 files changed

+101
-1
lines changed

convert-unversioned-ggml-to-ggml.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#!/usr/bin/env python3
2+
# Original by https://github.com/eiz
3+
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
4+
import argparse
5+
import glob
6+
import os
7+
import struct
8+
import sys
9+
from sentencepiece import SentencePieceProcessor
10+
11+
# Hyperparameter field names in the order they appear in a ggml header.
# (The original line also bound an unused alias `keys` via a chained
# assignment — clearly accidental, so it is dropped here.)
HPARAMS = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
12+
13+
def parse_args():
    """Build the CLI parser and return the parsed arguments.

    Positional arguments:
      dir_model       -- directory containing ggml .bin files
      tokenizer_model -- path to LLaMA tokenizer.model file
    """
    ap = argparse.ArgumentParser(
        description='Upgrade old ggml model files to the current format'
    )
    ap.add_argument('dir_model', help='directory containing ggml .bin files')
    ap.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
    return ap.parse_args()
18+
19+
def read_header(f_in):
    """Read and unpack the old-style header from f_in.

    Layout is 3 + len(HPARAMS) native int32 fields: magic, the five
    hparams, then rot and ftype (see write_header, which unpacks the
    same 8-tuple). Returns the unpacked tuple of ints.
    """
    fmt = "i" * (3 + len(HPARAMS))
    raw = f_in.read(struct.calcsize(fmt))
    return struct.unpack(fmt, raw)
24+
25+
def write_header(f_out, header):
    """Validate an old-style header and write the new versioned one.

    header -- 8-tuple as produced by read_header:
              (magic, vocab_size, dim, multiple_of, n_heads, n_layers,
               rot, ftype)

    Raises ValueError (a narrower type than the original bare
    Exception; any `except Exception` caller still works) if the magic
    is not the old unversioned 'ggml' value.
    """
    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header

    if magic != 0x67676d6c:  # 'ggml' — the old, unversioned magic
        raise ValueError('Invalid file magic. Must be an old style ggml file.')

    values = [
        0x67676d66,  # magic: 'ggmf' in hex (the new versioned format)
        1,           # file version
        vocab_size,
        dim,
        multiple_of,
        n_heads,
        n_layers,
        rot,
        ftype,
    ]
    f_out.write(struct.pack("i" * len(values), *values))
43+
44+
def write_tokens(fout, tokenizer):
    """Write the vocabulary in the new format.

    For each token id: int32 byte-length, the token bytes, then a
    float32 score. Exits the process on a malformed byte-token piece
    (same behavior as the original).
    """
    n_vocab = tokenizer.vocab_size()
    for tok_id in range(n_vocab):
        if tokenizer.is_unknown(tok_id):
            # unknown token rendered as " ⁇ "
            data = " \u2047 ".encode("utf-8")
        elif tokenizer.is_control(tok_id):
            data = b""
        elif tokenizer.is_byte(tok_id):
            piece = tokenizer.id_to_piece(tok_id)
            # byte tokens look like "<0xNN>" — exactly 6 characters
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            data = struct.pack("B", int(piece[3:-1], 16))
        else:
            # sentencepiece uses U+2581 for the leading-space marker
            data = tokenizer.id_to_piece(tok_id).replace("\u2581", " ").encode("utf-8")
        fout.write(struct.pack("i", len(data)))
        fout.write(data)
        fout.write(struct.pack("f", tokenizer.get_score(tok_id)))
62+
63+
def read_tokens(f_in, tokenizer):
    """Skip the old-format vocabulary in f_in.

    The old format stores each token as an int32 length followed by
    that many bytes (no score), so this just advances the file
    position past vocab_size() such records.
    """
    for _ in range(tokenizer.vocab_size()):
        (length,) = struct.unpack("i", f_in.read(4))
        f_in.read(length)
68+
69+
def copy_all_data(f_out, f_in):
    """Stream the remainder of f_in into f_out in 1 MiB chunks."""
    while chunk := f_in.read(1024 * 1024):
        f_out.write(chunk)
75+
76+
def convert_one_file(path_in, tokenizer):
    """Upgrade one model file in place.

    Writes the converted data to <path_in>.tmp, then keeps the
    untouched input as <path_in>.orig and moves the converted file
    into the original's place.
    """
    path_tmp = path_in + ".tmp"
    path_orig = path_in + ".orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
        # Order matters: header, then vocabulary, then the raw tensor
        # data that follows it.
        write_header(f_out, read_header(f_in))
        read_tokens(f_in, tokenizer)
        write_tokens(f_out, tokenizer)
        copy_all_data(f_out, f_in)
    os.rename(path_in, path_orig)
    os.rename(path_tmp, path_in)
87+
88+
def main():
    """Convert every ggml .bin file (and .bin.* shards) in dir_model."""
    args = parse_args()
    base = f"{args.dir_model}/*.bin"
    # Both whole files (*.bin) and multi-part shards (*.bin.*).
    files = glob.glob(base) + glob.glob(base + ".*")

    tokenizer = SentencePieceProcessor(args.tokenizer_model)

    for path in files:
        convert_one_file(path, tokenizer)

if __name__ == "__main__":
    main()

llama.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ static bool llama_model_load(
320320
uint32_t magic;
321321
fin.read((char *) &magic, sizeof(magic));
322322
if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
323-
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
323+
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
324324
__func__, fname.c_str());
325325
return false;
326326
}

0 commit comments

Comments
 (0)