Changes from all commits (26 commits)
61ce487
finish_reason starts from "stop" if stop
olegklimov Oct 9, 2023
f57afa3
another finish_reason fix
olegklimov Oct 9, 2023
f535cdb
Lora upload & Upload enhancements (#136)
valaises Oct 11, 2023
95b2f63
upload bugfix (#181)
valaises Oct 12, 2023
accc82e
Multiple finetune models (#179)
mitya52 Oct 12, 2023
ab1f54a
caps and completions model resolve for lsp server
mitya52 Oct 12, 2023
b080350
available model can be longthink
mitya52 Oct 12, 2023
ee81f2d
change wording
olegklimov Oct 13, 2023
49154b0
remove unrelated message (#167)
anwirs Oct 17, 2023
d15f56f
Best checkpoint (#188)
mitya52 Oct 17, 2023
c77bd36
update readme for v1.1.0 release (#190)
mitya52 Oct 17, 2023
fdb9320
get finetune model from model info
mitya52 Oct 17, 2023
5f49d94
measure_humaneval_continue.py
olegklimov Oct 17, 2023
d8cb21e
Interactive logs (#192)
mitya52 Oct 18, 2023
a101fc6
up version to v1.1.0
mitya52 Oct 18, 2023
99b5df9
upgrade cuda to 12.1 and torch to 2.1
mitya52 Oct 18, 2023
06f4ca4
unfix triton version
mitya52 Oct 18, 2023
e5dbdaf
full info about completion model for ftune tab
mitya52 Oct 18, 2023
c8e8601
fix dockerfile and setup
mitya52 Oct 23, 2023
a6d1ec5
GUI fixes: empty completion model warning, no event listener if run h…
mitya52 Oct 24, 2023
b548a58
build fixes: lib versions and dockerfile
mitya52 Oct 24, 2023
757ac09
fix base docker image version
mitya52 Oct 24, 2023
8c72302
download lora link for every run
mitya52 Oct 24, 2023
68be045
add missed requirement
mitya52 Oct 25, 2023
b8986e4
download best lora checkpoint for run and run with custom checkpoint
mitya52 Oct 25, 2023
788f72d
Merge branch 'main' into v1.1.0
mitya52 Oct 26, 2023
3 changes: 2 additions & 1 deletion Dockerfile
@@ -1,11 +1,12 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

RUN apt-get update
RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y \
curl \
git \
htop \
tmux \
file \
vim \
expect \
mpich \
37 changes: 16 additions & 21 deletions README.md
@@ -113,23 +113,22 @@ Under the hood, it uses Refact models and the best open-source models.

At the moment, you can choose between the following models:

-| Model | Completion | Chat | AI Toolbox | Fine-tuning |
-|--------------------------------------------------------------------------------------|------------|------|------------|-------------|
-| [Refact/1.6B](https://huggingface.co/smallcloudai/Refact-1_6B-fim) | + | + | | + |
-| [starcoder/1b/base](https://huggingface.co/TheBloke/starcoder-GPTQ) | + | | | + |
-| [starcoder/3b/base](https://huggingface.co/TheBloke/starcoder-GPTQ) | + | | | + |
-| [starcoder/7b/base](https://huggingface.co/TheBloke/starcoder-GPTQ) | + | | | + |
-| [starcoder/15b/base](https://huggingface.co/TheBloke/starcoder-GPTQ) | + | | | |
-| [starcoder/15b/plus](https://huggingface.co/TheBloke/starcoderplus-GPTQ) | + | | | |
-| [wizardcoder/15b](https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GPTQ) | + | | | |
-| [codellama/7b](https://huggingface.co/TheBloke/CodeLlama-7B-fp16) | + | | | |
-| [starchat/15b/beta](https://huggingface.co/TheBloke/starchat-beta-GPTQ) | | + | | |
-| [wizardlm/7b](https://huggingface.co/TheBloke/WizardLM-7B-V1.0-Uncensored-GPTQ) | | + | | |
-| [wizardlm/13b](https://huggingface.co/TheBloke/WizardLM-13B-V1.1-GPTQ) | | + | | |
-| [wizardlm/30b](https://huggingface.co/TheBloke/WizardLM-30B-GPTQ) | | + | | |
-| [llama2/7b](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) | | + | | |
-| [llama2/13b](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ) | | + | | |

+| Model | Completion | Chat | AI Toolbox | Fine-tuning |
+|---------------------------------------------------------------------------------|------------|------|------------|-------------|
+| [Refact/1.6B](https://huggingface.co/smallcloudai/Refact-1_6B-fim) | + | + | | + |
+| [starcoder/1b/base](https://huggingface.co/smallcloudai/starcoderbase-1b) | + | | | + |
+| [starcoder/3b/base](https://huggingface.co/smallcloudai/starcoderbase-3b) | + | | | + |
+| [starcoder/7b/base](https://huggingface.co/smallcloudai/starcoderbase-7b) | + | | | + |
+| [starcoder/15b/base](https://huggingface.co/TheBloke/starcoder-GPTQ) | + | | | |
+| [starcoder/15b/plus](https://huggingface.co/TheBloke/starcoderplus-GPTQ) | + | | | |
+| [wizardcoder/15b](https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GPTQ) | + | | | |
+| [codellama/7b](https://huggingface.co/TheBloke/CodeLlama-7B-fp16) | + | | | |
+| [starchat/15b/beta](https://huggingface.co/TheBloke/starchat-beta-GPTQ) | | + | | |
+| [wizardlm/7b](https://huggingface.co/TheBloke/WizardLM-7B-V1.0-Uncensored-GPTQ) | | + | | |
+| [wizardlm/13b](https://huggingface.co/TheBloke/WizardLM-13B-V1.1-GPTQ) | | + | | |
+| [wizardlm/30b](https://huggingface.co/TheBloke/WizardLM-30B-fp16) | | + | | |
+| [llama2/7b](https://huggingface.co/TheBloke/Llama-2-7b-Chat-GPTQ) | | + | | |
+| [llama2/13b](https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ) | | + | | |

## Usage

@@ -141,10 +140,6 @@ Q: Can I run a model on CPU?

A: It doesn't run on CPU yet, but it's certainly possible to implement this.

-Q: Sharding is disabled, why?
-
-A: It's not ready yet, but it's coming soon.

## Community & Support

- Contributing [CONTRIBUTING.md](CONTRIBUTING.md)
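Note on the updated README table: the completion models listed above are served through the same HTTP API that the new HumanEval script in this PR calls. Below is a minimal sketch of such a request; the endpoint and payload fields mirror metrics/measure_humaneval_continue.py further down, while the host, port, model choice and prompt are assumptions for illustration only.

```python
import requests

# Hedged sketch: payload shape follows metrics/measure_humaneval_continue.py below;
# the model name is one of the completion-capable entries from the table above.
resp = requests.post("http://127.0.0.1:8008/v1/completions", json={
    "model": "starcoder/1b/base",      # illustrative choice, not part of the diff
    "prompt": "def fibonacci(n):\n",
    "max_tokens": 64,
    "temperature": 0.2,
    "top_p": 0.95,
    "stream": False,
    "stop": ["\n\n\n"],
})
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```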
9 changes: 3 additions & 6 deletions known_models_db/refact_known_models/huggingface.py
@@ -31,35 +31,32 @@
},
"starcoder/1b/base": {
"backend": "transformers",
"model_path": "bigcode/starcoderbase-1b",
"model_path": "smallcloudai/starcoderbase-1b",
"diff_scratchpad_class": "refact_scratchpads:ScratchpadPSM",
"chat_scratchpad_class": None,
"model_class_kwargs": {},
"required_memory_mb": 6000,
"T": 4096,
"hidden": True,
"filter_caps": ["completion", "finetune"],
},
"starcoder/3b/base": {
"backend": "transformers",
"model_path": "bigcode/starcoderbase-3b",
"model_path": "smallcloudai/starcoderbase-3b",
"diff_scratchpad_class": "refact_scratchpads:ScratchpadPSM",
"chat_scratchpad_class": None,
"model_class_kwargs": {},
"required_memory_mb": 9000,
"T": 4096,
"hidden": True,
"filter_caps": ["completion", "finetune"],
},
"starcoder/7b/base": {
"backend": "transformers",
"model_path": "bigcode/starcoderbase-7b",
"model_path": "smallcloudai/starcoderbase-7b",
"diff_scratchpad_class": "refact_scratchpads:ScratchpadPSM",
"chat_scratchpad_class": None,
"model_class_kwargs": {},
"required_memory_mb": 18000,
"T": 2048,
"hidden": True,
"filter_caps": ["completion", "finetune"],
},
"wizardcoder/15b": {
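For context on what these registry entries feed into: fields such as model_path and filter_caps decide which models the server offers for completion, chat and fine-tuning. A rough, hand-written sketch of that consumption with an invented two-entry registry; only the field names follow the real schema shown in the diff above.

```python
from typing import Any, Dict

# Invented miniature registry; the real one is the dict edited in
# known_models_db/refact_known_models/huggingface.py above.
models_db: Dict[str, Dict[str, Any]] = {
    "starcoder/1b/base": {
        "model_path": "smallcloudai/starcoderbase-1b",
        "filter_caps": ["completion", "finetune"],
    },
    "starchat/15b/beta": {
        "model_path": "TheBloke/starchat-beta-GPTQ",
        "filter_caps": ["chat"],
    },
}

# Fine-tune eligibility is a capability check, the same one that
# refact_data_pipeline/finetune/finetune_utils.py performs below.
finetunable = [name for name, info in models_db.items()
               if "finetune" in info["filter_caps"]]
print(finetunable)  # ['starcoder/1b/base']
```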
89 changes: 89 additions & 0 deletions metrics/measure_humaneval_continue.py
@@ -0,0 +1,89 @@
import sys, termcolor, subprocess, json, time, random
from copy import deepcopy
from mpi4py import MPI
from human_eval.data import write_jsonl, read_problems
from human_eval.data import read_problems
import requests


#MODEL = "smallcloudai/Refact-1_6B-fim"
MODEL = "Refact/1.6B"

TEMPERATURE = 0.2
TOP_P = 0.95
TIMES = 1
MAX_TOKENS = 256


def run_completion_call(src_txt):
    res = requests.post(f"http://127.0.0.1:8008/v1/completions", json={
        "model": MODEL,
        "max_tokens": MAX_TOKENS,
        "stream": False,
        "echo": True,
        "top_p": TOP_P,
        "temperature": TEMPERATURE,
        "prompt": src_txt,
        "stop": ["\n\n\n"],
    })
    res.raise_for_status()
    j = res.json()
    # print(j)
    return j["choices"][0]["text"]


def test_by_continuing(comm, case):
    orig = case["prompt"].rstrip()
    print_me = termcolor.colored(orig[:-1], "yellow")
    if comm.size == 1:
        print(print_me)
    t = run_completion_call(orig)
    uncut = t
    lines = t.split("\n")
    filtered = []
    for x in lines:
        if x.startswith(" ") or x.strip() == "":
            filtered.append(x)
        elif not x.startswith(" "):
            break
    t = "\n".join(filtered)
    assert uncut.startswith(t)
    print_response = termcolor.colored(t, "green") + " " + termcolor.colored(uncut[len(t):], attrs=["dark"])
    if comm.size == 1:
        print(print_response)
    else:
        print(print_me + "\n" + print_response)
    case["completion"] = t


if __name__ == "__main__":
    postfix = ""
    if len(sys.argv) > 1:
        postfix = sys.argv[1]
    t0 = time.time()
    from human_eval.data import write_jsonl, read_problems
    from human_eval.data import read_problems
    problems = list(read_problems().values()) * TIMES
    comm = MPI.COMM_WORLD
    my_problems = problems[comm.rank::comm.size]
    output = []
    for i, case_ in enumerate(my_problems):
        case = deepcopy(case_)
        print("-" * 40, " rank=%i case=%i" % (comm.rank, i), "-" * 40)
        test_by_continuing(comm, case)
        output.append(case)
    comm.barrier()
    t1 = time.time()
    tmp = comm.gather(output, root=0)
    if comm.rank == 0:
        all_output = [x for y in tmp for x in y]
        output_name = "human-%s%s.jsonl" % ("continue", postfix)
        write_jsonl(output_name, all_output)
        res = subprocess.check_output(f"evaluate_functional_correctness {output_name}", shell=True)
        metrics = json.loads(res.decode('utf-8').strip().split('\n')[-1].replace("'", '"'))
        print(termcolor.colored(metrics, "magenta"))
        tmp = "method=%s temperature=%0.2f top_p=%0.2f postfix='%s' world=%i times=%i %s %0.2fs %s\n" % (
            "continue", TEMPERATURE, TOP_P, postfix, comm.size, TIMES, metrics, (t1 - t0), MODEL)
        with open("human-eval-all-results.txt", "a") as f:
            f.write(tmp)
        print(tmp)
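Two notes on the new script. First, it splits the HumanEval problems across MPI ranks (problems[comm.rank::comm.size]), so it can presumably be launched under mpirun with several workers, the optional command-line argument becoming a postfix in the output file name. Second, the core of the "continue" method is the truncation in test_by_continuing: keep the indented function body and stop at the first non-indented line. A standalone restatement of that filter, with an invented completion string for illustration:

```python
# Restatement of the truncation rule from test_by_continuing above;
# the sample completion is invented for the demo.
def cut_at_first_unindented(completion: str) -> str:
    kept = []
    for line in completion.split("\n"):
        if line.startswith(" ") or line.strip() == "":
            kept.append(line)   # still inside the generated function body
        else:
            break               # a new top-level statement ends the answer
    return "\n".join(kept)

sample = "    return a + b\n\ndef unrelated_helper():\n    pass"
print(cut_at_first_unindented(sample))  # keeps only '    return a + b' and the blank line
```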
15 changes: 13 additions & 2 deletions refact_data_pipeline/finetune/finetune_utils.py
@@ -73,13 +73,24 @@ def get_active_loras(models_db: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
active_loras = {
legacy_finetune_model: active_loras,
}

def get_active_lora(model_name: str, model_info: Dict[str, Any]) -> Dict:
finetune_model = model_info.get("finetune_model", model_name)
if finetune_model not in active_loras:
return {}
else:
return {
**active_loras[finetune_model],
"model": model_name
}

return {
model_name: {
"lora_mode": "latest-best",
-**active_loras.get(model_name, {}),
+**get_active_lora(model_name, model_info),
}
for model_name, model_info in models_db.items()
if "finetune" in model_info["filter_caps"]
if "finetune_model" in model_info or "finetune" in model_info["filter_caps"]
}


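The new finetune_model indirection lets a served model reuse the LoRA of another (base) model instead of needing its own fine-tune entry. Below is a self-contained sketch of the resolution rule with invented model names and LoRA records; only the get_active_lora logic itself is taken from the diff above.

```python
from typing import Any, Dict

# Invented inputs for illustration; in the server these come from the
# models database and the user's active-LoRA configuration.
active_loras = {
    "starcoder/1b/base": {"lora_mode": "latest-best", "run_id": "ft-run-42"},
}
models_db = {
    "starcoder/1b/base": {"filter_caps": ["completion", "finetune"]},
    # Hypothetical serving-only entry that borrows the 1b fine-tune:
    "starcoder/1b/serving": {"filter_caps": ["completion"],
                             "finetune_model": "starcoder/1b/base"},
}

def get_active_lora(model_name: str, model_info: Dict[str, Any]) -> Dict:
    # Same rule as the helper added inside get_active_loras(): fall back to
    # the model's own name when it declares no finetune_model.
    finetune_model = model_info.get("finetune_model", model_name)
    if finetune_model not in active_loras:
        return {}
    return {**active_loras[finetune_model], "model": model_name}

print(get_active_lora("starcoder/1b/serving", models_db["starcoder/1b/serving"]))
# -> {'lora_mode': 'latest-best', 'run_id': 'ft-run-42', 'model': 'starcoder/1b/serving'}
```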
12 changes: 6 additions & 6 deletions refact_scratchpads/scratchpad_completion.py
@@ -28,19 +28,19 @@ def after_token_selection(
self.needs_upload = True
self._tokens.append(chosen_token.item())
if chosen_token == self.enc.EOT:
self.finish_reason = "eot"
self.finish_reason = "stop-eot"
if chosen_token in self.stop_tokens:
self.finish_reason = "stoptoken"
self.finish_reason = "stop-token"
if len(self._tokens) > 3:
if self.stop_lf_lf and self._tokens[-1] == self.enc.LF and self._tokens[-2] == self.enc.LF:
self.finish_reason = "ins-stop-lflf"
self.finish_reason = "stop-lflf"
if self.stop_lf_lf_lf:
if self._tokens[-3] == self.enc.LF and self._tokens[-2] == self.enc.LF and self._tokens[-1] == self.enc.LF:
self.finish_reason = "ins-stop-lflflf"
self.finish_reason = "stop-lflflf"
elif self._tokens[-2] == self.enc.LFLF and self._tokens[-1] == self.enc.LF:
self.finish_reason = "ins-stop-lflflf"
self.finish_reason = "stop-lflflf"
elif self._tokens[-2] == self.enc.LFLF and self._tokens[-1] == self.enc.LFLF:
self.finish_reason = "ins-stop-lflflf"
self.finish_reason = "stop-lflflf"
return dict()

def prompt(self, T: int):
6 changes: 3 additions & 3 deletions refact_scratchpads/scratchpad_hf.py
@@ -72,14 +72,14 @@ def after_token_selection(self, m, chosen_token: th.Tensor, **unused) -> Dict[st
t = chosen_token.item()

if t in [self._tokenizer.eos_token_id]:
self.finish_reason = "eot"
self.finish_reason = "stop-eot"
elif t in self._special_tokens:
self.finish_reason = "special-token"
self.finish_reason = "stop-special-token"

if not self.finish_reason:
self._completion.append(t)
if t in self._stop_tokens:
self.finish_reason = "stoptoken"
self.finish_reason = "stop-token"

couple_of_tokens_decoded = self._tokenizer.decode(([self._prev_token] if self._prev_token is not None else []) + [t])
self._prev_token = t
2 changes: 1 addition & 1 deletion self_hosting_machinery/inference/inference_hf.py
@@ -268,7 +268,7 @@ def infer(self, request: Dict[str, Any], upload_proxy: UploadProxy, upload_proxy

self._model.generate(**generation_kwargs)
if not scratchpad.finish_reason:
scratchpad.finish_reason = "maxlen"
scratchpad.finish_reason = "length"
upload_proxy_args["ts_batch_finished"] = time.time()
upload_proxy.upload_result(
**upload_proxy_args,
2 changes: 1 addition & 1 deletion self_hosting_machinery/inference/inference_legacy.py
@@ -233,7 +233,7 @@ def _generate_using_scratchpad(self,
break

if not scratchpad.finish_reason:
scratchpad.finish_reason = "maxlen"
scratchpad.finish_reason = "length"

def infer(self, request: Dict[str, Any], upload_proxy: UploadProxy, upload_proxy_args: Dict):
request_id = request["id"]
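Taken together, the scratchpad and inference changes above rename the finish_reason strings to a consistent "stop-*" scheme, with "length" for hitting the token limit (the usual OpenAI-style value). A summary of the renames visible in these diffs, useful if any client code matched on the old strings:

```python
# Old -> new finish_reason values, collected from the diffs above.
FINISH_REASON_RENAMES = {
    "eot": "stop-eot",
    "stoptoken": "stop-token",
    "special-token": "stop-special-token",
    "ins-stop-lflf": "stop-lflf",
    "ins-stop-lflflf": "stop-lflflf",
    "maxlen": "length",
}
```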
44 changes: 42 additions & 2 deletions self_hosting_machinery/scripts/best_lora.py
@@ -1,12 +1,46 @@
import re
import os
import json
-from typing import Dict

from self_hosting_machinery import env
from refact_data_pipeline.finetune.finetune_utils import get_run_model_name
from refact_data_pipeline.finetune.finetune_utils import default_finetune_model

+from typing import Dict, Optional


def find_best_checkpoint(run_id: str) -> Dict[str, str]:
    run_dir = os.path.join(env.DIR_LORAS, run_id)
    if not os.path.isdir(run_dir):
        raise RuntimeError(f"run_id not found")
    checkpoints_dir = os.path.join(run_dir, "checkpoints")
    if not os.path.isdir(checkpoints_dir):
        raise RuntimeError(f"run_id has no checkpoints")

    def checkpoint_name_to_loss(checkpoint_id: str) -> Optional[float]:
        match = re.match(r"iter(\d+)-testloss(\d+\.\d+)", checkpoint_id)
        if match is None:
            return None
        return float(match.group(2))

    checkpoints = list(filter(lambda x: x[0] is not None and os.path.isdir(x[1]), [
        (
            checkpoint_name_to_loss(checkpoint_id),
            os.path.join(checkpoints_dir, checkpoint_id),
            checkpoint_id,
        )
        for checkpoint_id in os.listdir(checkpoints_dir)
    ]))

    if not checkpoints:
        raise RuntimeError(f"run_id has no valid checkpoints")

    best_checkpoint = min(checkpoints, key=lambda x: x[0])
    return {
        "best_checkpoint_id": best_checkpoint[2],
        "path": best_checkpoint[1],
    }


def find_best_lora(model_name: str) -> Dict[str, str]:
error = "no completed runs found"
@@ -74,4 +108,10 @@ def find_best_lora(model_name: str) -> Dict[str, str]:
parser.add_argument("--model", type=str, default=default_finetune_model)
args = parser.parse_args()

-print(find_best_lora(args.model))
+best_lora = find_best_lora(args.model)
+try:
+    best_checkpoint = find_best_checkpoint(best_lora["latest_run_id"])
+except RuntimeError as e:
+    best_checkpoint = None
+print("Best LoRA", best_lora)
+print("Best checkpoint", best_checkpoint)