
Commit 315a95a

Add LoRA support (#820)
1 parent efd0564 commit 315a95a

10 files changed: +753 −41 lines changed


convert-lora-to-ggml.py

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
import json
import os
import re
import struct
import sys
from typing import Any, Dict, Sequence, TextIO

import torch

from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType

# Map HuggingFace sub-layer names to the ggml tensor names used by llama.cpp.
HF_SUBLAYER_TO_GGML = {
    "self_attn.q_proj": "attention.wq",
    "self_attn.k_proj": "attention.wk",
    "self_attn.v_proj": "attention.wv",
    "self_attn.o_proj": "attention.wo",
    "mlp.gate_proj": "feed_forward.w1",
    "mlp.down_proj": "feed_forward.w2",
    "mlp.up_proj": "feed_forward.w3",
    "input_layernorm": "attention_norm",
    "post_attention_layernorm": "ffn_norm",
    # "norm": "norm",
    # "embed_tokens": "tok_embeddings",
    # "lm_head": "output",
}


def translate_tensor_name(t: str) -> str:
    # Translate a PEFT LoRA tensor name into the corresponding ggml tensor name.
    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
    if match:
        nn = match.group(1)
        sub_layer = match.group(2)
        lora_type = match.group(3)

        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
        if sub_layer_renamed is None:
            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
            sys.exit(1)

        output_string = (
            f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
        )
        return output_string
    else:
        print(f"Error: unrecognized tensor {t}")
        sys.exit(1)

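To illustrate the renaming, calling translate_tensor_name on a made-up (but schema-conforming) PEFT key would give:

# Illustrative only: the key below is hypothetical, not taken from a real adapter.
example_key = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight"
print(translate_tensor_name(example_key))  # -> layers.0.attention.wq.weight.loraA
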
def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
    fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))


def write_tensor_header(
    fout: TextIO, name: str, shape: Sequence[int], data_type: DataType
) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(
            "iii",
            len(shape),
            len(sname),
            DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
        )
    )
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
    fout.write(sname)
    fout.seek((fout.tell() + 31) & -32)  # pad so tensor data starts 32-byte aligned

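The final seek rounds the file offset up so that the tensor data that follows starts on a 32-byte boundary. A minimal sketch of the rounding expression, with arbitrary example offsets:

# (offset + 31) & -32 rounds an offset up to the next multiple of 32.
for offset in (0, 1, 100, 128):
    print(offset, "->", (offset + 31) & -32)  # 0 -> 0, 1 -> 32, 100 -> 128, 128 -> 128
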
if len(sys.argv) != 2:
    print(f"Usage: python {sys.argv[0]} <path>")
    print(
        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
    )
    sys.exit(1)

input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

model = torch.load(input_model, map_location="cpu")

with open(input_json, "r") as f:
    params = json.load(f)

if params["peft_type"] != "LORA":
    print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
    sys.exit(1)

if params["fan_in_fan_out"] == True:
    print("Error: param fan_in_fan_out is not supported")
    sys.exit(1)

if params["bias"] is not None and params["bias"] != "none":
    print("Error: param bias is not supported")
    sys.exit(1)

# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
    print("Error: param modules_to_save is not supported")
    sys.exit(1)

with open(output_path, "wb") as fout:
    fout.truncate()

    write_file_header(fout, params)
    for k, v in model.items():
        if k.endswith("lora_A.weight"):
            # lora_A: keep f16/f32 as-is, convert anything else to f32, then transpose
            if v.dtype != torch.float16 and v.dtype != torch.float32:
                v = v.float()
            v = v.T
        else:
            # lora_B (and anything else): always convert to f32
            v = v.float()

        t = v.numpy()
        tname = translate_tensor_name(k)
        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
        write_tensor_header(fout, tname, t.shape, t.dtype)
        t.tofile(fout)

print(f"Converted {input_json} and {input_model} to {output_path}")

examples/common.cpp

Lines changed: 15 additions & 0 deletions
@@ -139,6 +139,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter = argv[i];
+            params.use_mmap = false;
+        } else if (arg == "--lora-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_base = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--embedding") {
@@ -242,6 +255,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     }
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
     fprintf(stderr, "\n");
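
With these options in place, a typical invocation might look like: ./main -m models/7B/ggml-model-q4_0.bin --lora lora/ggml-adapter-model.bin --lora-base models/7B/ggml-model-f16.bin (paths illustrative, not taken from this commit). As the help text notes, --lora implies --no-mmap: the parser sets params.use_mmap = false, presumably because the base weights must be modifiable in memory when the adapter is applied.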

examples/common.h

Lines changed: 4 additions & 3 deletions
@@ -31,11 +31,12 @@ struct gpt_params {
 
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
-    std::string input_prefix = ""; // string to prefix user inputs with
-
-
+    std::string input_prefix = ""; // string to prefix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
+    std::string lora_adapter = ""; // lora adapter path
+    std::string lora_base = "";    // base model path for the lora adapter
+
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs

examples/main/main.cpp

Lines changed: 11 additions & 0 deletions
@@ -114,6 +114,17 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
     // print system information
     {
         fprintf(stderr, "\n");
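
Here llama_apply_lora_from_file is called right after the context is created and before any prompt processing. When no --lora-base was given, NULL is passed and the adapter is applied directly to the weights loaded with -m; otherwise the named base model supplies the original tensors for the layers the adapter modifies, per the --lora-base help text above.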

examples/perplexity/perplexity.cpp

Lines changed: 11 additions & 0 deletions
@@ -134,6 +134,17 @@ int main(int argc, char ** argv) {
         }
     }
 
+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
     // print system information
     {
         fprintf(stderr, "\n");
