
Commit 007a8f6

Support all LLaMA models + change Q4_0 quantization storage
1 parent 5f2f970 commit 007a8f6

5 files changed: +399 -200 lines

README.md

Lines changed: 38 additions & 6 deletions
@@ -17,12 +17,11 @@ The main goal is to run the model using 4-bit quantization on a MacBook.
 
 This was hacked in an evening - I have no idea if it works correctly.
 
-So far, I've tested just the 7B model.
-Here is a typical run:
+Here is a typical run using LLaMA-7B:
 
 ```java
-make -j && ./main -m ../LLaMA-4bit/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
-I llama.cpp build info:
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+I llama.cpp build info:
 I UNAME_S: Darwin
 I UNAME_P: arm
 I UNAME_M: arm64
@@ -34,7 +33,7 @@ I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202)
 
 make: Nothing to be done for `default'.
 main: seed = 1678486056
-llama_model_load: loading model from '../LLaMA-4bit/7B/ggml-model-q4_0.bin' - please wait ...
+llama_model_load: loading model from './models/7B/ggml-model-q4_0.bin' - please wait ...
 llama_model_load: n_vocab = 32000
 llama_model_load: n_ctx = 512
 llama_model_load: n_embd = 4096
@@ -110,6 +109,8 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
 
 ## Usage
 
+Here are the steps for the LLaMA-7B model:
+
 ```bash
 # build this repo
 git clone https://github.com/ggerganov/llama.cpp
@@ -133,9 +134,40 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
 ```
 
+For the bigger models, there are a few extra quantization steps. For example, for LLaMA-13B, converting to FP16 format
+will create 2 ggml files, instead of one:
+
+```bash
+ggml-model-f16.bin
+ggml-model-f16.bin.1
+```
+
+You need to quantize each of them separately like this:
+
+```bash
+./quantize ./models/13B/ggml-model-f16.bin ./models/13B/ggml-model-q4_0.bin 2
+./quantize ./models/13B/ggml-model-f16.bin.1 ./models/13B/ggml-model-q4_0.bin.1 2
+```
+
+Everything else is the same. Simply run:
+
+```bash
+./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 128
+```
+
+The number of files generated for each model is as follows:
+
+```
+7B -> 1 file
+13B -> 2 files
+33B -> 4 files
+65B -> 8 files
+```
+
+When running the larger models, make sure you have enough disk space to store all the intermediate files.
+
 ## Limitations
 
-- Currently, only LLaMA-7B is supported since I haven't figured out how to merge the tensors of the bigger models. However, in theory, you should be able to run 65B on a 64GB MacBook
 - Not sure if my tokenizer is correct. There are a few places where we might have a mistake:
 - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87
 - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69
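
The quantization steps added to the README above follow a fixed naming pattern: part 0 keeps the plain `ggml-model-*.bin` name and every further part gets a numeric suffix. As a rough sketch of how the per-part `./quantize` calls could be scripted for any model size (an illustration only, not part of this commit; `model_dir` and `n_parts` are placeholder values):

```python
# Illustrative helper: quantize every FP16 part of a multi-part model.
# Follows the naming used above: part 0 -> ggml-model-f16.bin,
# part N > 0 -> ggml-model-f16.bin.N (same pattern for the q4_0 output).
import subprocess

model_dir = "./models/13B"  # placeholder path
n_parts = 2                 # 7B -> 1, 13B -> 2, 33B -> 4, 65B -> 8

for p in range(n_parts):
    suffix = "" if p == 0 else "." + str(p)
    subprocess.run(
        ["./quantize",
         model_dir + "/ggml-model-f16.bin" + suffix,
         model_dir + "/ggml-model-q4_0.bin" + suffix,
         "2"],
        check=True)
```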

convert-pth-to-ggml.py

Lines changed: 99 additions & 73 deletions
@@ -33,12 +33,23 @@
 
 # output in the same directory as the model
 dir_model = sys.argv[1]
-fname_out = sys.argv[1] + "/ggml-model.bin"
 
 fname_hparams = sys.argv[1] + "/params.json"
-fname_model = sys.argv[1] + "/consolidated.00.pth"
 fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
 
+def get_n_parts(dim):
+    if dim == 4096:
+        return 1
+    elif dim == 5120:
+        return 2
+    elif dim == 6656:
+        return 4
+    elif dim == 8192:
+        return 8
+    else:
+        print("Invalid dim: " + str(dim))
+        sys.exit(1)
+
 # possible data types
 # ftype == 0 -> float32
 # ftype == 1 -> float16
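
The `get_n_parts` helper added above keys off the embedding dimension of the checkpoint, which corresponds to the released model sizes. An equivalent table-driven restatement (my own paraphrase for reference, not code from the commit):

```python
# Embedding dim -> number of consolidated.0N.pth parts (and ggml output files).
# 7B uses dim 4096, 13B uses 5120, 33B uses 6656, 65B uses 8192.
N_PARTS_BY_DIM = {4096: 1, 5120: 2, 6656: 4, 8192: 8}

def get_n_parts_table(dim):
    # Same mapping as get_n_parts() above, but raises KeyError on an
    # unknown dim instead of printing and exiting.
    return N_PARTS_BY_DIM[dim]
```
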
@@ -61,76 +72,91 @@
 
 hparams.update({"vocab_size": tokenizer.vocab_size()})
 
+n_parts = get_n_parts(hparams["dim"])
+
 print(hparams)
+print('n_parts = ', n_parts)
 
-model = torch.load(fname_model, map_location="cpu")
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["dim"]))
-fout.write(struct.pack("i", hparams["multiple_of"]))
-fout.write(struct.pack("i", hparams["n_heads"]))
-fout.write(struct.pack("i", hparams["n_layers"]))
-fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
-fout.write(struct.pack("i", ftype))
-
-# Is this correct??
-for i in range(32000):
-    # TODO: this is probably wrong - not sure how this tokenizer works
-    text = tokenizer.decode([29889, i]).encode('utf-8')
-    # remove the first byte (it's always '.')
-    text = text[1:]
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for k, v in model.items():
-    name = k
-    shape = v.shape
-
-    # skip layers.X.attention.inner_attention.rope.freqs
-    if name[-5:] == "freqs":
-        continue
-
-    print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
-
-    #data = tf.train.load_variable(dir_model, name).squeeze()
-    data = v.numpy().squeeze()
-    n_dims = len(data.shape);
-
-    # for efficiency - transpose some matrices
-    # "model/h.*/attn/c_attn/w"
-    # "model/h.*/attn/c_proj/w"
-    # "model/h.*/mlp/c_fc/w"
-    # "model/h.*/mlp/c_proj/w"
-    #if name[-14:] == "/attn/c_attn/w" or \
-    #   name[-14:] == "/attn/c_proj/w" or \
-    #   name[-11:] == "/mlp/c_fc/w" or \
-    #   name[-13:] == "/mlp/c_proj/w":
-    #    print("  Transposing")
-    #    data = data.transpose()
-
-    dshape = data.shape
-
-    # default type is fp16
-    ftype_cur = 1
-    if ftype == 0 or n_dims == 1:
-        print("  Converting to float32")
-        data = data.astype(np.float32)
-        ftype_cur = 0
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-    fout.write(str);
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fname_out)
-print("")
+for p in range(n_parts):
+    print('Processing part ', p)
+
+    #fname_model = sys.argv[1] + "/consolidated.00.pth"
+    fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
+    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
+    if (p > 0):
+        fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
+
+    model = torch.load(fname_model, map_location="cpu")
+
+    fout = open(fname_out, "wb")
+
+    fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+    fout.write(struct.pack("i", hparams["vocab_size"]))
+    fout.write(struct.pack("i", hparams["dim"]))
+    fout.write(struct.pack("i", hparams["multiple_of"]))
+    fout.write(struct.pack("i", hparams["n_heads"]))
+    fout.write(struct.pack("i", hparams["n_layers"]))
+    fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
+    fout.write(struct.pack("i", ftype))
+
+    # Is this correct??
+    for i in range(32000):
+        # TODO: this is probably wrong - not sure how this tokenizer works
+        text = tokenizer.decode([29889, i]).encode('utf-8')
+        # remove the first byte (it's always '.')
+        text = text[1:]
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+
+    for k, v in model.items():
+        name = k
+        shape = v.shape
+
+        # skip layers.X.attention.inner_attention.rope.freqs
+        if name[-5:] == "freqs":
+            continue
+
+        print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
+
+        #data = tf.train.load_variable(dir_model, name).squeeze()
+        data = v.numpy().squeeze()
+        n_dims = len(data.shape);
+
+        # for efficiency - transpose some matrices
+        # "model/h.*/attn/c_attn/w"
+        # "model/h.*/attn/c_proj/w"
+        # "model/h.*/mlp/c_fc/w"
+        # "model/h.*/mlp/c_proj/w"
+        #if name[-14:] == "/attn/c_attn/w" or \
+        #   name[-14:] == "/attn/c_proj/w" or \
+        #   name[-11:] == "/mlp/c_fc/w" or \
+        #   name[-13:] == "/mlp/c_proj/w":
+        #    print("  Transposing")
+        #    data = data.transpose()
+
+        dshape = data.shape
+
+        # default type is fp16
+        ftype_cur = 1
+        if ftype == 0 or n_dims == 1:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+        # header
+        sname = name.encode('utf-8')
+        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
+        for i in range(n_dims):
+            fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
+        fout.write(sname);
+
+        # data
+        data.tofile(fout)
+
+    # I hope this deallocates the memory ..
+    model = None
+
+    fout.close()
+
+    print("Done. Output file: " + fname_out + ", (part ", p, ")")
+    print("")

0 commit comments
