
Commit 315a95a

Add LoRA support (#820)
1 parent efd0564 commit 315a95a

10 files changed: +753 −41 lines changed


convert-lora-to-ggml.py

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
import json
import os
import re
import struct
import sys
from typing import Any, Dict, Sequence, TextIO

import torch

from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType

# Map HuggingFace sub-layer names to the ggml tensor names used by llama.cpp.
HF_SUBLAYER_TO_GGML = {
    "self_attn.q_proj": "attention.wq",
    "self_attn.k_proj": "attention.wk",
    "self_attn.v_proj": "attention.wv",
    "self_attn.o_proj": "attention.wo",
    "mlp.gate_proj": "feed_forward.w1",
    "mlp.down_proj": "feed_forward.w2",
    "mlp.up_proj": "feed_forward.w3",
    "input_layernorm": "attention_norm",
    "post_attention_layernorm": "ffn_norm",
    # "norm": "norm",
    # "embed_tokens": "tok_embeddings",
    # "lm_head": "output",
}


def translate_tensor_name(t: str) -> str:
    # Translate a PEFT LoRA tensor name into the corresponding ggml tensor name.
    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
    if match:
        nn = match.group(1)
        sub_layer = match.group(2)
        lora_type = match.group(3)

        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
        if sub_layer_renamed is None:
            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
            sys.exit(1)

        output_string = (
            f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
        )
        return output_string
    else:
        print(f"Error: unrecognized tensor {t}")
        sys.exit(1)

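To illustrate the renaming, calling translate_tensor_name on a made-up (but schema-conforming) PEFT key would give:

# Illustrative only: the key below is hypothetical, not taken from a real adapter.
example_key = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
print(translate_tensor_name(example_key))  # -> layers.0.attention.wq.weight.loraA
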
def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
    fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))


def write_tensor_header(
    fout: TextIO, name: str, shape: Sequence[int], data_type: DataType
) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(
            "iii",
            len(shape),
            len(sname),
            DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
        )
    )
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
    fout.write(sname)
    fout.seek((fout.tell() + 31) & -32)  # pad so tensor data starts 32-byte aligned

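The final seek rounds the file offset up so that the tensor data that follows starts on a 32-byte boundary. A minimal sketch of the rounding expression, with arbitrary example offsets:

# (offset + 31) & -32 rounds an offset up to the next multiple of 32.
for offset in (0, 1, 100, 128):
    print(offset, "->", (offset + 31) & -32)  # 0 -> 0, 1 -> 32, 100 -> 128, 128 -> 128
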
if len(sys.argv) != 2:
    print(f"Usage: python {sys.argv[0]} <path>")
    print(
        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
    )
    sys.exit(1)

input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

model = torch.load(input_model, map_location="cpu")

with open(input_json, "r") as f:
    params = json.load(f)

if params["peft_type"] != "LORA":
    print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
    sys.exit(1)

if params["fan_in_fan_out"] == True:
    print("Error: param fan_in_fan_out is not supported")
    sys.exit(1)

if params["bias"] is not None and params["bias"] != "none":
    print("Error: param bias is not supported")
    sys.exit(1)

# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
    print("Error: param modules_to_save is not supported")
    sys.exit(1)

with open(output_path, "wb") as fout:
    fout.truncate()

    write_file_header(fout, params)
    for k, v in model.items():
        if k.endswith("lora_A.weight"):
            # lora_A: keep f16/f32 as-is, convert anything else to f32, then transpose
            if v.dtype != torch.float16 and v.dtype != torch.float32:
                v = v.float()
            v = v.T
        else:
            # lora_B (and anything else): always convert to f32
            v = v.float()

        t = v.numpy()
        tname = translate_tensor_name(k)
        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
        write_tensor_header(fout, tname, t.shape, t.dtype)
        t.tofile(fout)

print(f"Converted {input_json} and {input_model} to {output_path}")

examples/common.cpp

Lines changed: 15 additions & 0 deletions
@@ -139,6 +139,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter = argv[i];
+            params.use_mmap = false;
+        } else if (arg == "--lora-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_base = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--embedding") {
@@ -242,6 +255,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     }
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, "\n");
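
With these options in place, a typical invocation might look like: ./main -m models/7B/ggml-model-q4_0.bin --lora lora/ggml-adapter-model.bin --lora-base models/7B/ggml-model-f16.bin (paths illustrative, not taken from this commit). As the help text notes, --lora implies --no-mmap: the parser sets params.use_mmap = false, presumably because the base weights must be modifiable in memory when the adapter is applied.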

examples/common.h

Lines changed: 4 additions & 3 deletions
@@ -31,11 +31,12 @@ struct gpt_params {
 
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
-    std::string input_prefix = ""; // string to prefix user inputs with
-
-
+    std::string input_prefix = ""; // string to prefix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
+    std::string lora_adapter = ""; // lora adapter path
+    std::string lora_base = "";    // base model path for the lora adapter
+
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs

examples/main/main.cpp

Lines changed: 11 additions & 0 deletions
@@ -114,6 +114,17 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
     // print system information
     {
         fprintf(stderr, "\n");
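
Here llama_apply_lora_from_file is called right after the context is created and before any prompt processing. When no --lora-base was given, NULL is passed and the adapter is applied directly to the weights loaded with -m; otherwise the named base model supplies the original tensors for the layers the adapter modifies, per the --lora-base help text above.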

examples/perplexity/perplexity.cpp

Lines changed: 11 additions & 0 deletions
@@ -134,6 +134,17 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
     // print system information
     {
         fprintf(stderr, "\n");
