
Commit 53635c0

py : add GPT4All conversion script
For now: copy-paste. Too much time for me to deduplicate the python code.
1 parent 41318d7 commit 53635c0

2 files changed (+108 -1 lines)


convert-gpt4all-to-ggml.py

Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
#!/usr/bin/env python3

#
# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
#

# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
import argparse
import glob
import os
import struct
import sys
from sentencepiece import SentencePieceProcessor

HPARAMS = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]

def parse_args():
    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
    return parser.parse_args()

def read_header(f_in):
    struct_fmt = "i" * (3 + len(HPARAMS))
    struct_size = struct.calcsize(struct_fmt)
    buf = f_in.read(struct_size)
    return struct.unpack(struct_fmt, buf)

def write_header(f_out, header):
    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header

    if magic != 0x67676d6c:
        raise Exception('Invalid file magic. Must be an old style ggml file.')

    values = [
        0x67676d66, # magic: ggmf in hex
        1, # file version
        vocab_size,
        dim,
        multiple_of,
        n_heads,
        n_layers,
        rot,
        ftype
    ]
    f_out.write(struct.pack("i" * len(values), *values))

def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode("utf-8")
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

    # TODO: GPT4All - add extra <pad> token
    text = "<pad>".encode("utf-8")
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", 0.0))

def read_tokens(f_in, tokenizer):
    # skip the old vocabulary block; write_tokens() rewrites it from the tokenizer
    for i in range(tokenizer.vocab_size()):
        len_b = f_in.read(4)
        (length,) = struct.unpack("i", len_b)
        f_in.read(length)

def copy_all_data(f_out, f_in):
    # copy the remaining tensor data unchanged, 1 MiB at a time
    while True:
        buf = f_in.read(1024 * 1024)
        if not buf:
            break
        f_out.write(buf)

def convert_one_file(path_in, tokenizer):
    path_tmp = f"{path_in}.tmp"
    path_orig = f"{path_in}.orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
        write_header(f_out, read_header(f_in))
        read_tokens(f_in, tokenizer)
        write_tokens(f_out, tokenizer)
        copy_all_data(f_out, f_in)
    os.rename(path_in, path_orig)
    os.rename(path_tmp, path_in)

def main():
    args = parse_args()

    tokenizer = SentencePieceProcessor(args.tokenizer_model)

    convert_one_file(args.gpt4all_model, tokenizer)

if __name__ == "__main__":
    main()
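
For reference, the script above is meant to be invoked with the two positional arguments defined in parse_args(): the GPT4All model file and the LLaMA SentencePiece tokenizer.model. It converts the model in place and leaves a .orig backup next to it. A typical invocation might look like the following; the paths are hypothetical and depend on the local directory layout:

    python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/tokenizer.model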

convert-unversioned-ggml-to-ggml.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def write_header(f_out, header):

    if magic != 0x67676d6c:
        raise Exception('Invalid file magic. Must be an old style ggml file.')
-
+
    values = [
        0x67676d66, # magic: ggmf in hex
        1, # file version
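
Not part of this commit, but as an illustrative sanity check of the header that write_header() emits (an int32 magic followed by an int32 file version), one could read back the first eight bytes of a converted file and confirm the new values. A minimal sketch, assuming the same native struct layout used above:

# Sketch only (not in the commit): confirm a converted file starts with the
# new magic (0x67676d66) and file version 1 written by write_header().
import struct

def check_converted(path):
    with open(path, "rb") as f:
        magic, version = struct.unpack("ii", f.read(8))
    if magic != 0x67676d66 or version != 1:
        raise Exception(f"{path} does not look like a converted ggmf file")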
