From 9abe2e44d1cdfe6d87ad99ed47e91684895a6e81 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 14 Dec 2023 04:03:25 -0500 Subject: [PATCH 01/27] llama : Add ability to cancel model load Updated llama_progress_callback so that if it returns false, the model loading is aborted. --- llama.cpp | 45 ++++++++++++++++++++++++++++++++------------- llama.h | 6 ++++-- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0e5ab044cdf..91cd929d178 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2297,7 +2297,8 @@ struct llama_model_loader { } } - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + // Returns false if cancelled by progress_callback + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t size_data = 0; size_t size_lock = 0; size_t size_pref = 0; // prefetch @@ -2323,7 +2324,9 @@ struct llama_model_loader { GGML_ASSERT(cur); // unused tensors should have been caught by load_data already if (progress_callback) { - progress_callback((float) done_size / size_data, progress_callback_user_data); + if (!progress_callback((float) done_size / size_data, progress_callback_user_data)) { + return false; + } } // allocate temp buffer if not using mmap @@ -2371,6 +2374,7 @@ struct llama_model_loader { done_size += ggml_nbytes(cur); } + return true; } }; @@ -2937,7 +2941,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -static void llm_load_tensors( +// Returns false if cancelled by progress_callback +static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, @@ -2948,6 +2953,8 @@ static void llm_load_tensors( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); + bool ok = true; // if false, model load was cancelled + auto & ctx = model.ctx; auto & hparams = model.hparams; @@ -3678,10 +3685,11 @@ static void llm_load_tensors( } #endif - ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); - + ok = ok && ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. 
+ ok = ok && progress_callback(1.0f, progress_callback_user_data); } model.mapping = std::move(ml.mapping); @@ -3689,9 +3697,11 @@ static void llm_load_tensors( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; + return ok; } -static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +// Returns -1 on error, -2 on cancellation via llama_progress_callback +static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3712,16 +3722,18 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con return true; } - llm_load_tensors( + if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data - ); + )) { + return -2; + } } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return false; + return -1; } - return true; + return 0; } // @@ -9017,11 +9029,18 @@ struct llama_model * llama_load_model_from_file( LLAMA_LOG_INFO("\n"); } } + return true; }; } - if (!llama_model_load(path_model, *model, params)) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + int status = llama_model_load(path_model, *model, params); + GGML_ASSERT(status <= 0); + if (status < 0) { + if (status == -1) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + } else if (status == -2) { + LLAMA_LOG_INFO("%s, cancelled model load\n", __func__); + } delete model; return nullptr; } diff --git a/llama.h b/llama.h index 45a65cacb7b..18c349d7b11 100644 --- a/llama.h +++ b/llama.h @@ -126,7 +126,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef void (*llama_progress_callback)(float progress, void *ctx); + typedef bool (*llama_progress_callback)(float progress, void *ctx); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -179,7 +179,9 @@ extern "C" { int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) - // called with a progress value between 0 and 1, pass NULL to disable + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. + // If the provided progress_callback returns true, model loading continues. + // If it returns false, model loading is immediately aborted. 
llama_progress_callback progress_callback; // context pointer passed to the progress callback From 3425e627450263e873a9490632c5d060571af0c4 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 14 Dec 2023 04:47:54 -0500 Subject: [PATCH 02/27] llama : Add test for model load cancellation --- tests/CMakeLists.txt | 1 + tests/test-model-load-cancel.cpp | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 tests/test-model-load-cancel.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e42237c7a2e..e854d27d952 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -50,6 +50,7 @@ llama_build_and_test_executable(test-grad0.cpp) llama_build_and_test_executable(test-backend-ops.cpp) llama_build_and_test_executable(test-rope.cpp) +llama_build_and_test_executable(test-model-load-cancel.cpp) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp new file mode 100644 index 00000000000..8da21af81ff --- /dev/null +++ b/tests/test-model-load-cancel.cpp @@ -0,0 +1,17 @@ +#include "llama.h" + +#include +#include + +int main(void) { + llama_backend_init(false); + auto params = llama_model_params{}; + params.use_mmap = false; + params.progress_callback = [](float progress, void * ctx){ + std::ignore = ctx; + return progress > 0.50; + }; + auto * model = llama_load_model_from_file("../models/7B/ggml-model-f16.gguf", params); + llama_backend_free(); + return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE; +} From 4b1f70cb03a23fc32cc6cf5492a1c5dc86b419a9 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 14 Dec 2023 16:29:05 -0500 Subject: [PATCH 03/27] Fix bool return in llama_model_load, remove std::ignore use --- llama.cpp | 4 ++-- tests/test-model-load-cancel.cpp | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 91cd929d178..3dbbe0a8059 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3700,7 +3700,7 @@ static bool llm_load_tensors( return ok; } -// Returns -1 on error, -2 on cancellation via llama_progress_callback +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3719,7 +3719,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return true; + return 0; } if (!llm_load_tensors( diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index 8da21af81ff..ff24a595502 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -1,17 +1,16 @@ #include "llama.h" #include -#include int main(void) { llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; params.progress_callback = [](float progress, void * ctx){ - std::ignore = ctx; + (void) ctx; return progress > 0.50; }; - auto * model = llama_load_model_from_file("../models/7B/ggml-model-f16.gguf", params); + auto * model = llama_load_model_from_file("models/7B/ggml-model-f16.gguf", params); llama_backend_free(); return model == nullptr ? 
EXIT_SUCCESS : EXIT_FAILURE; } From 1160de38f6d7f717b2fba61dcb1238ba974f8cc1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Dec 2023 21:25:19 +0200 Subject: [PATCH 04/27] Update llama.cpp Co-authored-by: Jared Van Bortel --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 3dbbe0a8059..e67f5e8fce2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9039,7 +9039,7 @@ struct llama_model * llama_load_model_from_file( if (status == -1) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); } else if (status == -2) { - LLAMA_LOG_INFO("%s, cancelled model load\n", __func__); + LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); } delete model; return nullptr; From 32ebd525bf7e5a87ee8a3dbaab3d92ce79fbf23d Mon Sep 17 00:00:00 2001 From: crasm Date: Sun, 17 Dec 2023 14:31:03 -0500 Subject: [PATCH 05/27] Fail test if model file is missing --- tests/test-model-load-cancel.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index ff24a595502..cb3c012b9d9 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -1,8 +1,20 @@ #include "llama.h" +#include #include int main(void) { + auto model_path = "models/7B/ggml-model-f16.gguf"; + auto file = fopen(model_path, "r"); + + if (file == nullptr) { + fprintf(stderr, "no model at '%s' found\n", model_path); + return EXIT_FAILURE; + } else { + fprintf(stderr, "using '%s'\n", model_path); + fclose(file); + } + llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; @@ -10,7 +22,7 @@ int main(void) { (void) ctx; return progress > 0.50; }; - auto * model = llama_load_model_from_file("models/7B/ggml-model-f16.gguf", params); + auto * model = llama_load_model_from_file(model_path, params); llama_backend_free(); return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE; } From 2796953257ee5383fa7c8fe8fa8fc888c048fb0b Mon Sep 17 00:00:00 2001 From: crasm Date: Sun, 17 Dec 2023 14:37:01 -0500 Subject: [PATCH 06/27] Revert "Fail test if model file is missing" This reverts commit 32ebd525bf7e5a87ee8a3dbaab3d92ce79fbf23d. --- tests/test-model-load-cancel.cpp | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index cb3c012b9d9..ff24a595502 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -1,20 +1,8 @@ #include "llama.h" -#include #include int main(void) { - auto model_path = "models/7B/ggml-model-f16.gguf"; - auto file = fopen(model_path, "r"); - - if (file == nullptr) { - fprintf(stderr, "no model at '%s' found\n", model_path); - return EXIT_FAILURE; - } else { - fprintf(stderr, "using '%s'\n", model_path); - fclose(file); - } - llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; @@ -22,7 +10,7 @@ int main(void) { (void) ctx; return progress > 0.50; }; - auto * model = llama_load_model_from_file(model_path, params); + auto * model = llama_load_model_from_file("models/7B/ggml-model-f16.gguf", params); llama_backend_free(); return model == nullptr ? 
EXIT_SUCCESS : EXIT_FAILURE; } From 068e7c408fa4c4f6df4b88fa85da970ff60d27cc Mon Sep 17 00:00:00 2001 From: crasm Date: Sun, 17 Dec 2023 22:22:42 -0500 Subject: [PATCH 07/27] Add test-model-load-cancel to Makefile --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fb775ae5b68..6c126269bdf 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,8 @@ TEST_TARGETS = \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ - tests/test-backend-ops + tests/test-backend-ops \ + tests/test-model-load-cancel # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -724,3 +725,5 @@ tests/test-c.o: tests/test-c.c llama.h tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o $(OBJS) From fe6a6fb6d185444e6c41d8627efbbb8831dc3c34 Mon Sep 17 00:00:00 2001 From: crasm Date: Sun, 17 Dec 2023 22:24:17 -0500 Subject: [PATCH 08/27] Revert "Revert "Fail test if model file is missing"" This reverts commit 2796953257ee5383fa7c8fe8fa8fc888c048fb0b. --- tests/test-model-load-cancel.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index ff24a595502..cb3c012b9d9 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -1,8 +1,20 @@ #include "llama.h" +#include #include int main(void) { + auto model_path = "models/7B/ggml-model-f16.gguf"; + auto file = fopen(model_path, "r"); + + if (file == nullptr) { + fprintf(stderr, "no model at '%s' found\n", model_path); + return EXIT_FAILURE; + } else { + fprintf(stderr, "using '%s'\n", model_path); + fclose(file); + } + llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; @@ -10,7 +22,7 @@ int main(void) { (void) ctx; return progress > 0.50; }; - auto * model = llama_load_model_from_file("models/7B/ggml-model-f16.gguf", params); + auto * model = llama_load_model_from_file(model_path, params); llama_backend_free(); return model == nullptr ? 
EXIT_SUCCESS : EXIT_FAILURE; } From 6bba3410fac22179ba45f3bd135a30c78538745e Mon Sep 17 00:00:00 2001 From: crasm Date: Sun, 17 Dec 2023 22:33:38 -0500 Subject: [PATCH 09/27] Simplify .gitignore for tests, clang-tidy fixes --- .gitignore | 16 ---------------- tests/.gitignore | 2 ++ tests/test-model-load-cancel.cpp | 10 +++++----- 3 files changed, 7 insertions(+), 21 deletions(-) create mode 100644 tests/.gitignore diff --git a/.gitignore b/.gitignore index 76b3d286182..7b1a9f9e320 100644 --- a/.gitignore +++ b/.gitignore @@ -86,19 +86,3 @@ examples/jeopardy/results.txt poetry.lock poetry.toml - -# Test binaries -/tests/test-grammar-parser -/tests/test-llama-grammar -/tests/test-double-float -/tests/test-grad0 -/tests/test-opt -/tests/test-quantize-fns -/tests/test-quantize-perf -/tests/test-sampling -/tests/test-tokenizer-0-llama -/tests/test-tokenizer-0-falcon -/tests/test-tokenizer-1-llama -/tests/test-tokenizer-1-bpe -/tests/test-rope -/tests/test-backend-ops diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 00000000000..59be43b9994 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,2 @@ +* +!*.* diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index cb3c012b9d9..926a305da86 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -4,17 +4,17 @@ #include int main(void) { - auto model_path = "models/7B/ggml-model-f16.gguf"; - auto file = fopen(model_path, "r"); + const auto * model_path = "models/7B/ggml-model-f16.gguf"; + auto * file = fopen(model_path, "r"); if (file == nullptr) { fprintf(stderr, "no model at '%s' found\n", model_path); return EXIT_FAILURE; - } else { - fprintf(stderr, "using '%s'\n", model_path); - fclose(file); } + fprintf(stderr, "using '%s'\n", model_path); + fclose(file); + llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; From fd9d247dd2ce2bd0d1d10ee394bd84f9e7e55b23 Mon Sep 17 00:00:00 2001 From: crasm Date: Mon, 18 Dec 2023 04:23:20 -0500 Subject: [PATCH 10/27] Label all ctest tests --- tests/CMakeLists.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e854d27d952..81a02dae92d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,14 +8,20 @@ endfunction() function(llama_test_executable name source) get_filename_component(TEST_TARGET ${source} NAME_WE) add_test(NAME ${name} COMMAND $ ${ARGN}) + set_property(TEST ${name} PROPERTY LABELS "main") endfunction() function(llama_build_and_test_executable source) + llama_build_and_test_executable_with_label(${source} "main") +endfunction() + +function(llama_build_and_test_executable_with_label source label) get_filename_component(TEST_TARGET ${source} NAME_WE) add_executable(${TEST_TARGET} ${source}) install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE llama common) add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) + set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label}) endfunction() # llama_build_and_test_executable(test-double-float.cpp) # SLOW @@ -50,7 +56,8 @@ llama_build_and_test_executable(test-grad0.cpp) llama_build_and_test_executable(test-backend-ops.cpp) llama_build_and_test_executable(test-rope.cpp) -llama_build_and_test_executable(test-model-load-cancel.cpp) + +llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model") # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) From 
4b63355f45da7e0e4b0e2396782fc003c8993c66 Mon Sep 17 00:00:00 2001 From: crasm Date: Mon, 18 Dec 2023 04:23:58 -0500 Subject: [PATCH 11/27] ci : ctest uses -L main --- ci/run.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index 2e33438312e..025cb6aa3a6 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -22,9 +22,9 @@ mkdir -p "$2" OUT=$(realpath "$1") MNT=$(realpath "$2") -rm -v $OUT/*.log -rm -v $OUT/*.exit -rm -v $OUT/*.md +rm -fv $OUT/*.log +rm -fv $OUT/*.exit +rm -fv $OUT/*.md sd=`dirname $0` cd $sd/../ @@ -84,7 +84,7 @@ function gg_run_ctest_debug { (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e } @@ -113,9 +113,9 @@ function gg_run_ctest_release { (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log if [ -z ${GG_BUILD_LOW_PERF} ]; then - (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log else - (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log fi set +e From aed3cf838ccf916c96528d1d111782dd2e0ce9a2 Mon Sep 17 00:00:00 2001 From: crasm Date: Mon, 18 Dec 2023 04:45:39 -0500 Subject: [PATCH 12/27] Attempt at writing ctest_with_model --- ci/run.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ci/run.sh b/ci/run.sh index 025cb6aa3a6..2fb21429948 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -131,6 +131,23 @@ function gg_sum_ctest_release { gg_printf '```\n' } +function gg_run_ctest_with_model { + cd ${SRC} + set -e + (time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest_with_model.log + set +e +} + +function gg_sum_ctest_with_model { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest with model files\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest_with_model.log)" + gg_printf '```\n' +} + # open_llama_3b_v2 function gg_run_open_llama_3b_v2 { @@ -508,6 +525,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then else test $ret -eq 0 && gg_run open_llama_7b_v2 fi + test $ret -eq 0 && gg_run ctest_with_model fi fi From f80ff4dc6a545f84e0ea949b0b70c38ffa166c40 Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 19 Dec 2023 01:43:27 -0500 Subject: [PATCH 13/27] ci : get ci/run.sh working with test-model-load-cancel --- ci/run.sh | 245 +++++++++++++++++-------------- requirements.txt | 4 +- tests/test-model-load-cancel.cpp | 36 +++-- 3 files changed, 164 insertions(+), 121 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index 2fb21429948..a1b978a0f9f 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -1,4 +1,4 @@ -#/bin/bash +#!/bin/bash # # sample usage: # @@ -11,6 +11,8 @@ # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +set -u # Fail on unset variables + if [ -z "$2" ]; then echo "usage: $0 " exit 1 @@ -30,8 +32,20 @@ sd=`dirname $0` cd $sd/../ SRC=`pwd` +# Read-only array of quantization types for iteration. +# Use ${quants[@]:1} to skip f16. +declare -ra quants=( f16 q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k ) + ## helpers +# Print an error message to stderr and exit with an error. 
+# usage: die +function die { + local format="$1"; shift + >&2 printf "$format" "$@" + exit 1 +} + # download a file if it does not exist or if it is outdated function gg_wget { local out=$1 @@ -77,7 +91,9 @@ function gg_run { function gg_run_ctest_debug { cd ${SRC} - rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug + rm -rf build-ci-debug + mkdir build-ci-debug + cd build-ci-debug set -e @@ -105,14 +121,16 @@ function gg_sum_ctest_debug { function gg_run_ctest_release { cd ${SRC} - rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + rm -rf build-ci-release + mkdir build-ci-release + cd build-ci-release set -e (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - if [ -z ${GG_BUILD_LOW_PERF} ]; then + if [[ -z ${GG_BUILD_LOW_PERF+x} ]]; then (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log else (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log @@ -133,6 +151,7 @@ function gg_sum_ctest_release { function gg_run_ctest_with_model { cd ${SRC} + cd build-ci-release set -e (time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest_with_model.log set +e @@ -151,81 +170,70 @@ function gg_sum_ctest_with_model { # open_llama_3b_v2 function gg_run_open_llama_3b_v2 { - cd ${SRC} + # We use absolute paths here to not have to track CWD as much + local models_mnt="$(realpath "${SRC}/models-mnt")" + local path_models="${models_mnt}/open-llama/3B-v2" + local path_wiki="${models_mnt}/wikitext" + local path_wiki_raw="${path_wiki}/wikitext-2-raw" - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json + mkdir -p "${path_models}" "${path_wiki}" - gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip - unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ - head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json - path_models="../models-mnt/open-llama/3B-v2" - path_wiki="../models-mnt/wikitext/wikitext-2-raw" + gg_wget "${path_wiki}" 
https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + unzip -o "${path_wiki}/wikitext-2-raw-v1.zip" -d "${path_wiki}" + head -n 60 "${path_wiki_raw}/wiki.test.raw" > "${path_wiki_raw}/wiki.test-60.raw" - rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + rm -rf "${SRC}/build-ci-release" + mkdir "${SRC}/build-ci-release" + cd "${SRC}/build-ci-release" set -e - (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a "${OUT}/${ci}-cmake.log" + (time make -j ) 2>&1 | tee -a "${OUT}/${ci}-make.log" - python3 ../convert.py ${path_models} + python3 "${SRC}/convert.py" "${path_models}" - model_f16="${path_models}/ggml-model-f16.gguf" - model_q8_0="${path_models}/ggml-model-q8_0.gguf" - model_q4_0="${path_models}/ggml-model-q4_0.gguf" - model_q4_1="${path_models}/ggml-model-q4_1.gguf" - model_q5_0="${path_models}/ggml-model-q5_0.gguf" - model_q5_1="${path_models}/ggml-model-q5_1.gguf" - model_q2_k="${path_models}/ggml-model-q2_k.gguf" - model_q3_k="${path_models}/ggml-model-q3_k.gguf" - model_q4_k="${path_models}/ggml-model-q4_k.gguf" - model_q5_k="${path_models}/ggml-model-q5_k.gguf" - model_q6_k="${path_models}/ggml-model-q6_k.gguf" + # Get the model path for a quantization + # usage: model_for + function model_for { + if (( $# != 1 )); then + die 'model_for takes a single quantization, such as q8_0' + fi + echo -n "${path_models}/ggml-model-$1.gguf" + } - wiki_test_60="${path_wiki}/wiki.test-60.raw" + wiki_test_60="${path_wiki_raw}/wiki.test-60.raw" - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/quantize ${model_f16} ${model_q2_k} q2_k - ./bin/quantize ${model_f16} ${model_q3_k} q3_k - ./bin/quantize ${model_f16} ${model_q4_k} q4_k - ./bin/quantize ${model_f16} ${model_q5_k} q5_k - ./bin/quantize ${model_f16} ${model_q6_k} q6_k + # Quantize q8_0 through q6_k + for q in "${quants[@]:1}"; do + ./bin/quantize "$(model_for f16)" "$(model_for "${q}")" "${q}" + done - (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the 
meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + # Run basic inference for all quants + for q in "${quants[@]}"; do + ( time \ + ./bin/main --model "$(model_for "${q}")" -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" + ) 2>&1 | tee -a "${OUT}/${ci}-tg-${q}.log" + done - (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + # Run perplexity with wiki_test_60 + for q in "${quants[@]}"; do + ( time \ + ./bin/perplexity --model "$(model_for $q)" -f "${wiki_test_60}" -c 128 -b 128 --chunks 2 + ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + done + + # Run examples/save-load-state with q4_0 + ( time \ + ./bin/save-load-state --model "$(model_for q4_0)" + ) 2>&1 | tee -a "${OUT}/${ci}-save-load-state.log" function check_ppl { qnt="$1" @@ -240,17 +248,11 @@ function gg_run_open_llama_3b_v2 { return 0 } - check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl 
"q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + # Check perplexity results for all quants + for q in "${quants[@]}"; do + check_ppl "$q" "$(cat "${OUT}/${ci}-tg-f16.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-ppl.log" + done # lora function compare_ppl { @@ -267,32 +269,42 @@ function gg_run_open_llama_3b_v2 { return 0 } - path_lora="../models-mnt/open-llama/3B-v2/lora" - path_shakespeare="../models-mnt/shakespeare" + local path_lora="${path_models}/lora" + local path_shakespeare="${models_mnt}/shakespeare" - shakespeare="${path_shakespeare}/shakespeare.txt" - lora_shakespeare="${path_lora}/ggml-adapter-model.bin" + local shakespeare="${path_shakespeare}/shakespeare.txt" + local lora_shakespeare="${path_lora}/ggml-adapter-model.bin" - gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json - gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin - gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt + gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json + gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin + gg_wget "${path_shakespeare}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt - python3 ../convert-lora-to-ggml.py ${path_lora} + python3 "${SRC}/convert-lora-to-ggml.py" "${path_lora}" # f16 - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log - compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-f16.log" + (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" + compare_ppl "f16 shakespeare" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-f16.log" | grep "^\[1\]")" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-lora-ppl.log" # q8_0 - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log - compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-q8_0.log" + (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-lora-q8_0.log" + compare_ppl 
"q8_0 shakespeare" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-lora-ppl.log" # q8_0 + f16 lora-base - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log - compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - + ( time \ + ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" --lora-base "$(model_for f16)" -c 128 -b 128 --chunks 2 + ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" + compare_ppl "q8_0 / f16 base shakespeare" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-lora-ppl.log" set +e } @@ -502,31 +514,42 @@ function gg_sum_open_llama_7b_v2 { ## main -if [ -z ${GG_BUILD_LOW_PERF} ]; then - rm -rf ${SRC}/models-mnt +ret=0 - mnt_models=${MNT}/models - mkdir -p ${mnt_models} - ln -sfn ${mnt_models} ${SRC}/models-mnt +# This is necessary to test if a variable is set while `set -u` is enabled. +# see: https://stackoverflow.com/a/13864829 +# [[ -z ${var+x} ]] evaluates to false if var is set +# [[ ! -z ${var+x} ]] evaluates to true if var is set +if [[ ! -z ${GG_BUILD_LOW_PERF+x} ]]; then + test "${ret}" -eq 0 && gg_run ctest_debug + test "${ret}" -eq 0 && gg_run ctest_release + exit "${ret}" +fi # Otherwise, do extended testing - python3 -m pip install -r ${SRC}/requirements.txt - python3 -m pip install --editable gguf-py -fi +rm -rf ${SRC}/models-mnt -ret=0 +mnt_models=${MNT}/models +mkdir -p ${mnt_models} +ln -sfn ${mnt_models} ${SRC}/models-mnt + +# Create a fresh python3 venv and enter it +python3 -m venv "${MNT}/venv" +source "${MNT}/venv/bin/activate" + +pip install --disable-pip-version-check -r ${SRC}/requirements.txt +pip install --disable-pip-version-check --editable gguf-py test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_release -if [ -z ${GG_BUILD_LOW_PERF} ]; then - if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then - if [ -z ${GG_BUILD_CUDA} ]; then - test $ret -eq 0 && gg_run open_llama_3b_v2 - else - test $ret -eq 0 && gg_run open_llama_7b_v2 - fi - test $ret -eq 0 && gg_run ctest_with_model +# Run tests with open_llama +if [[ -z ${GG_BUILD_VRAM_GB+x} ]] || (( GG_BUILD_VRAM_GB >= 8 )); then + if [[ ! 
-z ${GG_BUILD_CUDA+x} ]]; then + test $ret -eq 0 && gg_run open_llama_7b_v2 + else + test $ret -eq 0 && gg_run open_llama_3b_v2 fi + test $ret -eq 0 && gg_run ctest_with_model fi exit $ret diff --git a/requirements.txt b/requirements.txt index badfec3be80..35713223f8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ +gguf>=0.1.0 numpy==1.24.4 +protobuf==4.25.1 sentencepiece==0.1.98 +torch==2.0.1 transformers>=4.34.0 -gguf>=0.1.0 diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index 926a305da86..509f3e8e031 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -2,27 +2,45 @@ #include #include +#include int main(void) { - const auto * model_path = "models/7B/ggml-model-f16.gguf"; - auto * file = fopen(model_path, "r"); + const char * models_to_try[] = { + // Same default as example/main for local use + "./models/7B/ggml-model-f16.gguf", + // Models for ./ci/run.sh + "./models-mnt/open-llama/3B-v2/ggml-model-q2_k.gguf", + "./models-mnt/open-llama/7B-v2/ggml-model-q2_k.gguf", + }; - if (file == nullptr) { - fprintf(stderr, "no model at '%s' found\n", model_path); - return EXIT_FAILURE; + const char * chosen_model; + for (size_t i = 0; i < sizeof(models_to_try) / sizeof(models_to_try[0]); i++) { + const auto * model = models_to_try[i]; + + auto * file = fopen(model, "r"); + if (file == nullptr) { + continue; + } + + chosen_model = model; + fprintf(stderr, "using '%s'\n", model); + fclose(file); } - fprintf(stderr, "using '%s'\n", model_path); - fclose(file); + if (chosen_model == nullptr) { + fprintf(stderr, "no model found\n"); + return EXIT_FAILURE; + } llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; params.progress_callback = [](float progress, void * ctx){ (void) ctx; - return progress > 0.50; + return progress > 0.05; }; - auto * model = llama_load_model_from_file(model_path, params); + + auto * model = llama_load_model_from_file(chosen_model, params); llama_backend_free(); return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE; } From 121b04d121ea8e52709226eaa2da16026f55abf4 Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 19 Dec 2023 02:19:11 -0500 Subject: [PATCH 14/27] ci : restrict .github/workflows/build.yml ctest to -L main --- .github/workflows/build.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a5090e398c1..1d87419401f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -72,7 +72,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -107,7 +107,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest @@ -141,7 +141,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest -L main --verbose # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. 
@@ -202,7 +202,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 macOS-latest-cmake-ios: runs-on: macos-latest @@ -394,7 +394,7 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512 run: | cd build - ctest -C Release --verbose --timeout 900 + ctest -L main -C Release --verbose --timeout 900 - name: Test (Intel SDE) id: cmake_test_sde @@ -406,7 +406,7 @@ jobs: 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build - & $sde -future -- ctest -C Release --verbose --timeout 900 + & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - name: Determine tag name id: tag From 1e796259101c9f501f7d1ee70641d0e62f1cd1c6 Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 19 Dec 2023 02:42:07 -0500 Subject: [PATCH 15/27] update requirements.txt --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 35713223f8e..d9b430d52a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -gguf>=0.1.0 numpy==1.24.4 -protobuf==4.25.1 sentencepiece==0.1.98 -torch==2.0.1 +torch>=2.0.0 transformers>=4.34.0 +gguf>=0.1.0 +protobuf>=4.21.0 From 9809314bbf9215f0679238e01d0dfbe2bfee5b54 Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 19 Dec 2023 17:46:36 -0500 Subject: [PATCH 16/27] Disable test-model-load-cancel in make --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 6c126269bdf..9a1c28fe598 100644 --- a/Makefile +++ b/Makefile @@ -9,8 +9,9 @@ TEST_TARGETS = \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ - tests/test-backend-ops \ - tests/test-model-load-cancel + tests/test-backend-ops +# # TODO(crasm): determine how to run tests that depend on openllama model files with make + # tests/test-model-load-cancel # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report From 9a056ed708e06904f909d3550f937863c5ac2248 Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 19 Dec 2023 20:56:22 -0500 Subject: [PATCH 17/27] Remove venv before creation --- ci/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/run.sh b/ci/run.sh index a1b978a0f9f..9c2b4b3cf11 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -533,6 +533,7 @@ mkdir -p ${mnt_models} ln -sfn ${mnt_models} ${SRC}/models-mnt # Create a fresh python3 venv and enter it +rm -rf "${MNT}/venv" python3 -m venv "${MNT}/venv" source "${MNT}/venv/bin/activate" From 293d16fd40666c7afd1d86089e6c9980708c409e Mon Sep 17 00:00:00 2001 From: crasm Date: Wed, 20 Dec 2023 00:00:08 -0500 Subject: [PATCH 18/27] Restructure requirements.txt Top-level now imports the specific additional requirements for each python file. Using `pip install -r requirements.txt` will fail if versions become mismatched in the per-file requirements. 
--- ...hf-to-gguf.txt => convert-hf-to-gguf_requirements.txt | 1 - convert-lora-to-ggml_requirements.txt | 1 + convert_requirements.txt | 5 +++++ requirements.txt | 9 +++------ 4 files changed, 9 insertions(+), 7 deletions(-) rename requirements-hf-to-gguf.txt => convert-hf-to-gguf_requirements.txt (62%) create mode 100644 convert-lora-to-ggml_requirements.txt create mode 100644 convert_requirements.txt diff --git a/requirements-hf-to-gguf.txt b/convert-hf-to-gguf_requirements.txt similarity index 62% rename from requirements-hf-to-gguf.txt rename to convert-hf-to-gguf_requirements.txt index f4600539e27..a54ca806752 100644 --- a/requirements-hf-to-gguf.txt +++ b/convert-hf-to-gguf_requirements.txt @@ -1,3 +1,2 @@ --r requirements.txt torch==2.1.1 transformers==4.35.2 diff --git a/convert-lora-to-ggml_requirements.txt b/convert-lora-to-ggml_requirements.txt new file mode 100644 index 00000000000..8cf89d1906e --- /dev/null +++ b/convert-lora-to-ggml_requirements.txt @@ -0,0 +1 @@ +torch==2.1.1 diff --git a/convert_requirements.txt b/convert_requirements.txt new file mode 100644 index 00000000000..1a116256671 --- /dev/null +++ b/convert_requirements.txt @@ -0,0 +1,5 @@ +numpy==1.24.4 +sentencepiece==0.1.98 +transformers>=4.34.0 +gguf>=0.1.0 +protobuf>=4.21.0 diff --git a/requirements.txt b/requirements.txt index d9b430d52a3..778a20cef6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,3 @@ -numpy==1.24.4 -sentencepiece==0.1.98 -torch>=2.0.0 -transformers>=4.34.0 -gguf>=0.1.0 -protobuf>=4.21.0 +-r convert_requirements.txt +-r convert-hf-to-gguf_requirements.txt +-r convert-lora-to-ggml_requirements.txt From a0eab1ea190692801fa2fe93d1795e480112757a Mon Sep 17 00:00:00 2001 From: crasm Date: Wed, 20 Dec 2023 00:10:31 -0500 Subject: [PATCH 19/27] Make per-python-script requirements work alone This doesn't break the main requirements.txt. --- convert-hf-to-gguf_requirements.txt | 1 + convert-lora-to-ggml_requirements.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/convert-hf-to-gguf_requirements.txt b/convert-hf-to-gguf_requirements.txt index a54ca806752..d295025c978 100644 --- a/convert-hf-to-gguf_requirements.txt +++ b/convert-hf-to-gguf_requirements.txt @@ -1,2 +1,3 @@ +-r convert_requirements.txt torch==2.1.1 transformers==4.35.2 diff --git a/convert-lora-to-ggml_requirements.txt b/convert-lora-to-ggml_requirements.txt index 8cf89d1906e..f9481c12801 100644 --- a/convert-lora-to-ggml_requirements.txt +++ b/convert-lora-to-ggml_requirements.txt @@ -1 +1,2 @@ +-r convert_requirements.txt torch==2.1.1 From ca122dc9e007e34c6ba5a8d4d89bb5f9d50b6d52 Mon Sep 17 00:00:00 2001 From: crasm Date: Wed, 20 Dec 2023 00:14:56 -0500 Subject: [PATCH 20/27] Add comment --- requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/requirements.txt b/requirements.txt index 778a20cef6a..0f0147cd850 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,8 @@ +# These requirements include all dependencies for all top-level python scripts +# for llama.cpp. Avoid adding packages here directly. +# +# Package versions must stay compatible across all top-level python scripts. 
+# -r convert_requirements.txt -r convert-hf-to-gguf_requirements.txt -r convert-lora-to-ggml_requirements.txt From b853df4207bf763d672b0768ff83414971dca90a Mon Sep 17 00:00:00 2001 From: crasm Date: Wed, 20 Dec 2023 03:32:22 -0500 Subject: [PATCH 21/27] Add convert-persimmon-to-gguf.py to new requirements.txt scheme --- convert-persimmon-to-gguf.py | 1 + convert-persimmon-to-gguf_requirements.txt | 2 ++ requirements.txt | 3 +++ 3 files changed, 6 insertions(+) mode change 100644 => 100755 convert-persimmon-to-gguf.py create mode 100644 convert-persimmon-to-gguf_requirements.txt diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py old mode 100644 new mode 100755 index 206b7d5ff9e..1ba5864dc25 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import os from pprint import pprint diff --git a/convert-persimmon-to-gguf_requirements.txt b/convert-persimmon-to-gguf_requirements.txt new file mode 100644 index 00000000000..f9481c12801 --- /dev/null +++ b/convert-persimmon-to-gguf_requirements.txt @@ -0,0 +1,2 @@ +-r convert_requirements.txt +torch==2.1.1 diff --git a/requirements.txt b/requirements.txt index 0f0147cd850..c946b5e4c6c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,9 @@ # # Package versions must stay compatible across all top-level python scripts. # + -r convert_requirements.txt + -r convert-hf-to-gguf_requirements.txt -r convert-lora-to-ggml_requirements.txt +-r convert-persimmon-to-gguf_requirements.txt From c9a6de8f8aed0b96b6369f45e24d7920ed66807b Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 21 Dec 2023 04:16:41 -0500 Subject: [PATCH 22/27] Add check-requirements.sh script and GitHub workflow --- .../workflows/python-check-requirements.yml | 31 ++++ check-requirements.sh | 139 ++++++++++++++++++ convert-lora-to-ggml_requirements.txt | 2 - convert-persimmon-to-gguf_requirements.txt | 2 - ...txt => requirements-convert-hf-to-gguf.txt | 2 +- requirements-convert-llama-ggml-to-gguf.txt | 1 + requirements-convert-lora-to-ggml.txt | 2 + requirements-convert-persimmon-to-gguf.txt | 2 + ...quirements.txt => requirements-convert.txt | 0 requirements.txt | 8 +- 10 files changed, 180 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/python-check-requirements.yml create mode 100755 check-requirements.sh delete mode 100644 convert-lora-to-ggml_requirements.txt delete mode 100644 convert-persimmon-to-gguf_requirements.txt rename convert-hf-to-gguf_requirements.txt => requirements-convert-hf-to-gguf.txt (54%) create mode 100644 requirements-convert-llama-ggml-to-gguf.txt create mode 100644 requirements-convert-lora-to-ggml.txt create mode 100644 requirements-convert-persimmon-to-gguf.txt rename convert_requirements.txt => requirements-convert.txt (100%) diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml new file mode 100644 index 00000000000..c7929a20cce --- /dev/null +++ b/.github/workflows/python-check-requirements.yml @@ -0,0 +1,31 @@ +name: Python check requirements.txt + +on: + push: + paths: + - 'check-requirements.sh' + - 'convert*.py' + - 'requirements*.txt' + pull_request: + paths: + - 'check-requirements.sh' + - 'convert*.py' + - 'requirements*.txt' + +jobs: + python-check-requirements: + runs-on: ubuntu-latest + name: check-requirements + steps: + - name: Install shellcheck + run: | + sudo apt-get update + sudo apt-get install shellcheck + - name: Check out source repository + uses: 
actions/checkout@v3 + - name: Set up Python environment + uses: actions/setup-python@v4 + with: + python-version: "3.11" + - name: Run check-requirements.sh script + run: bash check-requirements.sh diff --git a/check-requirements.sh b/check-requirements.sh new file mode 100755 index 00000000000..881b8f190fd --- /dev/null +++ b/check-requirements.sh @@ -0,0 +1,139 @@ +#!/bin/bash +# +# check-requirements.sh checks all requirements files for each top-level +# convert*.py script. +# +# WARNING: This is quite IO intensive, because a fresh venv is set up for every +# python script. +# +# requires: +# * bash >= 3.2.57 +# * shellcheck +# +# For each script, it creates a fresh venv, `pip install -r` the +# requirements, and finally executes the python script with no arguments to +# check for a `ModuleNotFoundError`. +# + +log() { + local level="$1"; shift + local format="$1"; shift + # shellcheck disable=SC2059 + >&2 printf "$level: $format\n" "$@" +} + +info() { + log 'INFO' "$@" +} + +fatal() { + log 'FATAL' "$@" + exit 1 +} + +cleanup() { + if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then + info "Removing $workdir" + ( + count=0 + rm -rfv "$workdir" | while read -r; do + if (( count++ > 750 )); then + printf '.' + count=0 + fi + done + printf '\n' + )& + wait $! + info "Removed '$workdir'" + fi +} + +abort() { + cleanup + exit 1 +} + +trap abort SIGINT SIGTERM SIGQUIT SIGABRT +trap cleanup EXIT + +set -eu -o pipefail +this="$(realpath "$0")" +readonly this +cd "$(dirname "$this")" + +shellcheck "$this" + +workdir= +if [[ -n ${1+x} ]]; then + arg_dir="$(realpath "$1")" + if [[ ! ( -d $arg_dir && -w $arg_dir ) ]]; then + fatal "$arg_dir is not a valid directory" + fi + workdir="$(mktemp -d "$arg_dir/check-requirements.XXXX")" +else + workdir="$(mktemp -d "/tmp/check-requirements.XXXX")" +fi +readonly workdir + +info "Working directory: $workdir" + +assert_arg_count() { + local argcount="$1"; shift + if (( $# != argcount )); then + fatal "${FUNCNAME[1]}: incorrect number of args" + fi +} + +check_requirements() { + assert_arg_count 2 "$@" + local venv="$1" + local reqs="$2" + + info "$reqs: beginning check" + ( + # shellcheck source=/dev/null + source "$venv/bin/activate" + pip --disable-pip-version-check install -q -r "$reqs" + ) + info "$reqs: OK" +} + +check_convert_script() { + assert_arg_count 1 "$@" + local py="$1" + local pyname="${py%.py}" + + info "$py: beginning check" + + local reqs="requirements-$pyname.txt" + local venv="$workdir/$pyname-venv" + python3 -m venv "$venv" + + check_requirements "$venv" "$reqs" + set +e + ( + # shellcheck source=/dev/null + source "$venv/bin/activate" + py_err="$workdir/$pyname.out" + python "$py" 2> "$py_err" + >&2 cat "$py_err" + grep -e 'ModuleNotFoundError' "$py_err" + ) + set -e + # shellcheck disable=SC2181 + (( $? )) && fatal "$py: some imports not declared in $reqs" + info "$py: imports OK" +} + +# Check requirements.txt +all_venv="$workdir/all-venv" +python3 -m venv "$all_venv" +check_requirements "$all_venv" 'requirements.txt' + +check_convert_script 'convert.py' +for py in convert-*.py; do + check_convert_script "$py" +done + +info "Done! No issues found." 
diff --git a/convert-lora-to-ggml_requirements.txt b/convert-lora-to-ggml_requirements.txt deleted file mode 100644 index f9481c12801..00000000000 --- a/convert-lora-to-ggml_requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ --r convert_requirements.txt -torch==2.1.1 diff --git a/convert-persimmon-to-gguf_requirements.txt b/convert-persimmon-to-gguf_requirements.txt deleted file mode 100644 index f9481c12801..00000000000 --- a/convert-persimmon-to-gguf_requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ --r convert_requirements.txt -torch==2.1.1 diff --git a/convert-hf-to-gguf_requirements.txt b/requirements-convert-hf-to-gguf.txt similarity index 54% rename from convert-hf-to-gguf_requirements.txt rename to requirements-convert-hf-to-gguf.txt index d295025c978..4d00b196661 100644 --- a/convert-hf-to-gguf_requirements.txt +++ b/requirements-convert-hf-to-gguf.txt @@ -1,3 +1,3 @@ --r convert_requirements.txt +-r requirements-convert.txt torch==2.1.1 transformers==4.35.2 diff --git a/requirements-convert-llama-ggml-to-gguf.txt b/requirements-convert-llama-ggml-to-gguf.txt new file mode 100644 index 00000000000..8a5377762c1 --- /dev/null +++ b/requirements-convert-llama-ggml-to-gguf.txt @@ -0,0 +1 @@ +-r requirements-convert.txt diff --git a/requirements-convert-lora-to-ggml.txt b/requirements-convert-lora-to-ggml.txt new file mode 100644 index 00000000000..30827c8964d --- /dev/null +++ b/requirements-convert-lora-to-ggml.txt @@ -0,0 +1,2 @@ +-r requirements-convert.txt +torch==2.1.1 diff --git a/requirements-convert-persimmon-to-gguf.txt b/requirements-convert-persimmon-to-gguf.txt new file mode 100644 index 00000000000..30827c8964d --- /dev/null +++ b/requirements-convert-persimmon-to-gguf.txt @@ -0,0 +1,2 @@ +-r requirements-convert.txt +torch==2.1.1 diff --git a/convert_requirements.txt b/requirements-convert.txt similarity index 100% rename from convert_requirements.txt rename to requirements-convert.txt diff --git a/requirements.txt b/requirements.txt index c946b5e4c6c..da4f3f9a874 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,8 +4,8 @@ # Package versions must stay compatible across all top-level python scripts. 
# --r convert_requirements.txt +-r requirements-convert.txt --r convert-hf-to-gguf_requirements.txt --r convert-lora-to-ggml_requirements.txt --r convert-persimmon-to-gguf_requirements.txt +-r requirements-convert-hf-to-gguf.txt +-r requirements-convert-lora-to-ggml.txt +-r requirements-convert-persimmon-to-gguf.txt From e86b8cd93a5a979de12f18ad5fc73dbacf229448 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 21 Dec 2023 04:28:58 -0500 Subject: [PATCH 23/27] Remove shellcheck installation step from workflow --- .github/workflows/python-check-requirements.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index c7929a20cce..49c992b4ad0 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -17,10 +17,6 @@ jobs: runs-on: ubuntu-latest name: check-requirements steps: - - name: Install shellcheck - run: | - sudo apt-get update - sudo apt-get install shellcheck - name: Check out source repository uses: actions/checkout@v3 - name: Set up Python environment From bdfe4ba85c72df7fb521f7222a9dca017177d734 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 21 Dec 2023 04:55:28 -0500 Subject: [PATCH 24/27] Add nocleanup special arg --- .../workflows/python-check-requirements.yml | 2 +- check-requirements.sh | 25 ++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 49c992b4ad0..cc97ee81005 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -24,4 +24,4 @@ jobs: with: python-version: "3.11" - name: Run check-requirements.sh script - run: bash check-requirements.sh + run: bash check-requirements.sh nocleanup diff --git a/check-requirements.sh b/check-requirements.sh index 881b8f190fd..ac6beb60492 100755 --- a/check-requirements.sh +++ b/check-requirements.sh @@ -6,9 +6,18 @@ # WARNING: This is quite IO intensive, because a fresh venv is set up for every # python script. # +# usage: ./check-requirements.sh [] +# ./check-requirements.sh 'nocleanup' [] +# +# where: +# - is a directory that can be used as the base for +# setting up the venvs. Defaults to `/tmp`. +# - 'nocleanup' as the first argument will disable automatic cleanup +# of the files created by this script. +# # requires: -# * bash >= 3.2.57 -# * shellcheck +# - bash >= 3.2.57 +# - shellcheck # # For each script, it creates a fresh venv, `pip install -r` the # requirements, and finally executes the python script with no arguments to @@ -54,8 +63,12 @@ abort() { exit 1 } -trap abort SIGINT SIGTERM SIGQUIT SIGABRT -trap cleanup EXIT +if [[ $1 == nocleanup ]]; then + shift # discard nocleanup arg +else + trap abort SIGINT SIGTERM SIGQUIT SIGABRT + trap cleanup EXIT +fi set -eu -o pipefail this="$(realpath "$0")" @@ -107,6 +120,10 @@ check_convert_script() { info "$py: beginning check" local reqs="requirements-$pyname.txt" + if [[ ! -r "$reqs" ]]; then + fatal "$py missing requirements. 
Expected: $reqs" + fi + local venv="$workdir/$pyname-venv" python3 -m venv "$venv" From e4382571ca3ca4a0ef97f7b3f26bd19d579891c8 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 21 Dec 2023 18:54:27 -0500 Subject: [PATCH 25/27] Fix merge see: https://github.com/ggerganov/llama.cpp/pull/4462#discussion_r1434593573 --- llama.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index c64a1fa0b6f..cb0546c952d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2469,7 +2469,9 @@ struct llama_model_loader { } if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. + return progress_callback(1.0f, progress_callback_user_data); } return true; } @@ -3060,8 +3062,6 @@ static bool llm_load_tensors( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); - bool ok = true; // if false, model load was cancelled - auto & ctx = model.ctx; auto & hparams = model.hparams; @@ -3729,11 +3729,8 @@ static bool llm_load_tensors( model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } - ok = ok && ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL); - if (progress_callback) { - // Even though the model is done loading, we still honor - // cancellation since we need to free allocations. - ok = ok && progress_callback(1.0f, progress_callback_user_data); + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) { + return false; } model.mapping = std::move(ml.mapping); @@ -3741,7 +3738,7 @@ static bool llm_load_tensors( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; - return ok; + return true; } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback From f607e5325214a2c10f8db772061f521f4e7ac7ee Mon Sep 17 00:00:00 2001 From: crasm Date: Fri, 22 Dec 2023 00:58:32 -0500 Subject: [PATCH 26/27] reset to upstream/master --- .github/workflows/build.yml | 12 +- .../workflows/python-check-requirements.yml | 27 -- .gitignore | 16 + Makefile | 4 - check-requirements.sh | 156 ---------- ci/run.sh | 274 ++++++++---------- convert-persimmon-to-gguf.py | 1 - llama.cpp | 46 +-- llama.h | 6 +- requirements-convert-llama-ggml-to-gguf.txt | 1 - requirements-convert-lora-to-ggml.txt | 2 - requirements-convert-persimmon-to-gguf.txt | 2 - requirements-convert.txt | 5 - ...to-gguf.txt => requirements-hf-to-gguf.txt | 2 +- requirements.txt | 16 +- tests/.gitignore | 2 - tests/CMakeLists.txt | 8 - tests/test-model-load-cancel.cpp | 46 --- 18 files changed, 159 insertions(+), 467 deletions(-) delete mode 100644 .github/workflows/python-check-requirements.yml delete mode 100755 check-requirements.sh mode change 100755 => 100644 convert-persimmon-to-gguf.py delete mode 100644 requirements-convert-llama-ggml-to-gguf.txt delete mode 100644 requirements-convert-lora-to-ggml.txt delete mode 100644 requirements-convert-persimmon-to-gguf.txt delete mode 100644 requirements-convert.txt rename requirements-convert-hf-to-gguf.txt => requirements-hf-to-gguf.txt (54%) delete mode 100644 tests/.gitignore delete mode 100644 tests/test-model-load-cancel.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1d87419401f..a5090e398c1 
100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -72,7 +72,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -107,7 +107,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest @@ -141,7 +141,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose + ctest --verbose # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. @@ -202,7 +202,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest --verbose --timeout 900 macOS-latest-cmake-ios: runs-on: macos-latest @@ -394,7 +394,7 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512 run: | cd build - ctest -L main -C Release --verbose --timeout 900 + ctest -C Release --verbose --timeout 900 - name: Test (Intel SDE) id: cmake_test_sde @@ -406,7 +406,7 @@ jobs: 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build - & $sde -future -- ctest -L main -C Release --verbose --timeout 900 + & $sde -future -- ctest -C Release --verbose --timeout 900 - name: Determine tag name id: tag diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml deleted file mode 100644 index cc97ee81005..00000000000 --- a/.github/workflows/python-check-requirements.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Python check requirements.txt - -on: - push: - paths: - - 'check-requirements.sh' - - 'convert*.py' - - 'requirements*.txt' - pull_request: - paths: - - 'check-requirements.sh' - - 'convert*.py' - - 'requirements*.txt' - -jobs: - python-check-requirements: - runs-on: ubuntu-latest - name: check-requirements - steps: - - name: Check out source repository - uses: actions/checkout@v3 - - name: Set up Python environment - uses: actions/setup-python@v4 - with: - python-version: "3.11" - - name: Run check-requirements.sh script - run: bash check-requirements.sh nocleanup diff --git a/.gitignore b/.gitignore index 7b1a9f9e320..76b3d286182 100644 --- a/.gitignore +++ b/.gitignore @@ -86,3 +86,19 @@ examples/jeopardy/results.txt poetry.lock poetry.toml + +# Test binaries +/tests/test-grammar-parser +/tests/test-llama-grammar +/tests/test-double-float +/tests/test-grad0 +/tests/test-opt +/tests/test-quantize-fns +/tests/test-quantize-perf +/tests/test-sampling +/tests/test-tokenizer-0-llama +/tests/test-tokenizer-0-falcon +/tests/test-tokenizer-1-llama +/tests/test-tokenizer-1-bpe +/tests/test-rope +/tests/test-backend-ops diff --git a/Makefile b/Makefile index b5ce2e2dae6..68df7702aa9 100644 --- a/Makefile +++ b/Makefile @@ -10,8 +10,6 @@ TEST_TARGETS = \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ tests/test-backend-ops -# # TODO(crasm): determine how to run tests that depend on openllama model files with make - # tests/test-model-load-cancel # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -732,5 +730,3 @@ tests/test-c.o: tests/test-c.c llama.h 
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) - -tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o $(OBJS) diff --git a/check-requirements.sh b/check-requirements.sh deleted file mode 100755 index ac6beb60492..00000000000 --- a/check-requirements.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash -# -# check-requirements.sh checks all requirements files for each top-level -# convert*.py script. -# -# WARNING: This is quite IO intensive, because a fresh venv is set up for every -# python script. -# -# usage: ./check-requirements.sh [] -# ./check-requirements.sh 'nocleanup' [] -# -# where: -# - is a directory that can be used as the base for -# setting up the venvs. Defaults to `/tmp`. -# - 'nocleanup' as the first argument will disable automatic cleanup -# of the files created by this script. -# -# requires: -# - bash >= 3.2.57 -# - shellcheck -# -# For each script, it creates a fresh venv, `pip install -r` the -# requirements, and finally executes the python script with no arguments to -# check for a `ModuleNotFoundError`. -# - -log() { - local level="$1"; shift - local format="$1"; shift - # shellcheck disable=SC2059 - >&2 printf "$level: $format\n" "$@" -} - -info() { - log 'INFO' "$@" -} - -fatal() { - log 'FATAL' "$@" - exit 1 -} - -cleanup() { - if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then - info "Removing $workdir" - ( - count=0 - rm -rfv "$workdir" | while read -r; do - if (( count++ > 750 )); then - printf '.' - count=0 - fi - done - printf '\n' - )& - wait $! - info "Removed '$workdir'" - fi -} - -abort() { - cleanup - exit 1 -} - -if [[ $1 == nocleanup ]]; then - shift # discard nocleanup arg -else - trap abort SIGINT SIGTERM SIGQUIT SIGABRT - trap cleanup EXIT -fi - -set -eu -o pipefail -this="$(realpath "$0")" -readonly this -cd "$(dirname "$this")" - -shellcheck "$this" - -workdir= -if [[ -n ${1+x} ]]; then - arg_dir="$(realpath "$1")" - if [[ ! ( -d $arg_dir && -w $arg_dir ) ]]; then - fatal "$arg_dir is not a valid directory" - fi - workdir="$(mktemp -d "$arg_dir/check-requirements.XXXX")" -else - workdir="$(mktemp -d "/tmp/check-requirements.XXXX")" -fi -readonly workdir - -info "Working directory: $workdir" - -assert_arg_count() { - local argcount="$1"; shift - if (( $# != argcount )); then - fatal "${FUNCNAME[1]}: incorrect number of args" - fi -} - -check_requirements() { - assert_arg_count 2 "$@" - local venv="$1" - local reqs="$2" - - info "$reqs: beginning check" - ( - # shellcheck source=/dev/null - source "$venv/bin/activate" - pip --disable-pip-version-check install -q -r "$reqs" - ) - info "$reqs: OK" -} - -check_convert_script() { - assert_arg_count 1 "$@" - local py="$1" - local pyname="${py%.py}" - - info "$py: beginning check" - - local reqs="requirements-$pyname.txt" - if [[ ! -r "$reqs" ]]; then - fatal "$py missing requirements. Expected: $reqs" - fi - - local venv="$workdir/$pyname-venv" - python3 -m venv "$venv" - - check_requirements "$venv" "$reqs" - set +e - ( - # shellcheck source=/dev/null - source "$venv/bin/activate" - py_err="$workdir/$pyname.out" - python "$py" 2> "$py_err" - >&2 cat "$py_err" - grep -e 'ModuleNotFoundError' "$py_err" - ) - set -e - # shellcheck disable=SC2181 - (( $? 
)) && fatal "$py: some imports not declared in $reqs" - info "$py: imports OK" -} - -# Check requirements.txt -all_venv="$workdir/all-venv" -python3 -m venv "$all_venv" -check_requirements "$all_venv" 'requirements.txt' - -check_convert_script 'convert.py' -for py in convert-*.py; do - check_convert_script "$py" -done - -info "Done! No issues found." diff --git a/ci/run.sh b/ci/run.sh index 9c2b4b3cf11..2e33438312e 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#/bin/bash # # sample usage: # @@ -11,8 +11,6 @@ # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # -set -u # Fail on unset variables - if [ -z "$2" ]; then echo "usage: $0 " exit 1 @@ -24,28 +22,16 @@ mkdir -p "$2" OUT=$(realpath "$1") MNT=$(realpath "$2") -rm -fv $OUT/*.log -rm -fv $OUT/*.exit -rm -fv $OUT/*.md +rm -v $OUT/*.log +rm -v $OUT/*.exit +rm -v $OUT/*.md sd=`dirname $0` cd $sd/../ SRC=`pwd` -# Read-only array of quantization types for iteration. -# Use ${quants[@]:1} to skip f16. -declare -ra quants=( f16 q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k ) - ## helpers -# Print an error message to stderr and exit with an error. -# usage: die -function die { - local format="$1"; shift - >&2 printf "$format" "$@" - exit 1 -} - # download a file if it does not exist or if it is outdated function gg_wget { local out=$1 @@ -91,16 +77,14 @@ function gg_run { function gg_run_ctest_debug { cd ${SRC} - rm -rf build-ci-debug - mkdir build-ci-debug - cd build-ci-debug + rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug set -e (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e } @@ -121,19 +105,17 @@ function gg_sum_ctest_debug { function gg_run_ctest_release { cd ${SRC} - rm -rf build-ci-release - mkdir build-ci-release - cd build-ci-release + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release set -e (time cmake -DCMAKE_BUILD_TYPE=Release .. 
) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - if [[ -z ${GG_BUILD_LOW_PERF+x} ]]; then - (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log + if [ -z ${GG_BUILD_LOW_PERF} ]; then + (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log else - (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log fi set +e @@ -149,91 +131,84 @@ function gg_sum_ctest_release { gg_printf '```\n' } -function gg_run_ctest_with_model { - cd ${SRC} - cd build-ci-release - set -e - (time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest_with_model.log - set +e -} - -function gg_sum_ctest_with_model { - gg_printf '### %s\n\n' "${ci}" - - gg_printf 'Runs ctest with model files\n' - gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" - gg_printf '```\n' - gg_printf '%s\n' "$(cat $OUT/${ci}-ctest_with_model.log)" - gg_printf '```\n' -} - # open_llama_3b_v2 function gg_run_open_llama_3b_v2 { - # We use absolute paths here to not have to track CWD as much - local models_mnt="$(realpath "${SRC}/models-mnt")" - local path_models="${models_mnt}/open-llama/3B-v2" - local path_wiki="${models_mnt}/wikitext" - local path_wiki_raw="${path_wiki}/wikitext-2-raw" + cd ${SRC} - mkdir -p "${path_models}" "${path_wiki}" + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json + gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ + head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw - gg_wget "${path_wiki}" https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip - unzip -o "${path_wiki}/wikitext-2-raw-v1.zip" -d "${path_wiki}" - head -n 60 "${path_wiki_raw}/wiki.test.raw" > "${path_wiki_raw}/wiki.test-60.raw" + path_models="../models-mnt/open-llama/3B-v2" + path_wiki="../models-mnt/wikitext/wikitext-2-raw" - rm -rf "${SRC}/build-ci-release" - mkdir 
"${SRC}/build-ci-release" - cd "${SRC}/build-ci-release" + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release set -e - (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a "${OUT}/${ci}-cmake.log" - (time make -j ) 2>&1 | tee -a "${OUT}/${ci}-make.log" - - python3 "${SRC}/convert.py" "${path_models}" + (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - # Get the model path for a quantization - # usage: model_for - function model_for { - if (( $# != 1 )); then - die 'model_for takes a single quantization, such as q8_0' - fi - echo -n "${path_models}/ggml-model-$1.gguf" - } + python3 ../convert.py ${path_models} - wiki_test_60="${path_wiki_raw}/wiki.test-60.raw" + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + model_q4_0="${path_models}/ggml-model-q4_0.gguf" + model_q4_1="${path_models}/ggml-model-q4_1.gguf" + model_q5_0="${path_models}/ggml-model-q5_0.gguf" + model_q5_1="${path_models}/ggml-model-q5_1.gguf" + model_q2_k="${path_models}/ggml-model-q2_k.gguf" + model_q3_k="${path_models}/ggml-model-q3_k.gguf" + model_q4_k="${path_models}/ggml-model-q4_k.gguf" + model_q5_k="${path_models}/ggml-model-q5_k.gguf" + model_q6_k="${path_models}/ggml-model-q6_k.gguf" - # Quantize q8_0 through q6_k - for q in "${quants[@]:1}"; do - ./bin/quantize "$(model_for f16)" "$(model_for "${q}")" "${q}" - done + wiki_test_60="${path_wiki}/wiki.test-60.raw" - # Run basic inference for all quants - for q in "${quants[@]}"; do - ( time \ - ./bin/main --model "$(model_for "${q}")" -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" - ) 2>&1 | tee -a "${OUT}/${ci}-tg-${q}.log" - done + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/quantize ${model_f16} ${model_q2_k} q2_k + ./bin/quantize ${model_f16} ${model_q3_k} q3_k + ./bin/quantize ${model_f16} ${model_q4_k} q4_k + ./bin/quantize ${model_f16} ${model_q5_k} q5_k + ./bin/quantize ${model_f16} ${model_q6_k} q6_k - # Run perplexity with wiki_test_60 - for q in "${quants[@]}"; do - ( time \ - ./bin/perplexity --model "$(model_for $q)" -f "${wiki_test_60}" -c 128 -b 128 --chunks 2 - ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - done + (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/main --model ${model_q3_k} 
-s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - # Run examples/save-load-state with q4_0 - ( time \ - ./bin/save-load-state --model "$(model_for q4_0)" - ) 2>&1 | tee -a "${OUT}/${ci}-save-load-state.log" + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -248,11 +223,17 @@ function gg_run_open_llama_3b_v2 { return 0 } - # Check perplexity results for all quants - for q in "${quants[@]}"; do - check_ppl "$q" "$(cat "${OUT}/${ci}-tg-f16.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-ppl.log" - done + check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # 
lora function compare_ppl { @@ -269,42 +250,32 @@ function gg_run_open_llama_3b_v2 { return 0 } - local path_lora="${path_models}/lora" - local path_shakespeare="${models_mnt}/shakespeare" + path_lora="../models-mnt/open-llama/3B-v2/lora" + path_shakespeare="../models-mnt/shakespeare" - local shakespeare="${path_shakespeare}/shakespeare.txt" - local lora_shakespeare="${path_lora}/ggml-adapter-model.bin" + shakespeare="${path_shakespeare}/shakespeare.txt" + lora_shakespeare="${path_lora}/ggml-adapter-model.bin" - gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json - gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin - gg_wget "${path_shakespeare}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin + gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt - python3 "${SRC}/convert-lora-to-ggml.py" "${path_lora}" + python3 ../convert-lora-to-ggml.py ${path_lora} # f16 - (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-f16.log" - (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" - compare_ppl "f16 shakespeare" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-f16.log" | grep "^\[1\]")" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-lora-ppl.log" + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log + compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log # q8_0 - (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-q8_0.log" - (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-lora-q8_0.log" - compare_ppl "q8_0 shakespeare" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-lora-ppl.log" + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log + compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log # q8_0 + f16 lora-base - ( time \ - ./bin/perplexity --model 
"$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" --lora-base "$(model_for f16)" -c 128 -b 128 --chunks 2 - ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" - compare_ppl "q8_0 / f16 base shakespeare" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-lora-ppl.log" + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + set +e } @@ -514,43 +485,30 @@ function gg_sum_open_llama_7b_v2 { ## main -ret=0 - -# This is necessary to test if a variable is set while `set -u` is enabled. -# see: https://stackoverflow.com/a/13864829 -# [[ -z ${var+x} ]] evaluates to false if var is set -# [[ ! -z ${var+x} ]] evaluates to true if var is set -if [[ ! -z ${GG_BUILD_LOW_PERF+x} ]]; then - test "${ret}" -eq 0 && gg_run ctest_debug - test "${ret}" -eq 0 && gg_run ctest_release - exit "${ret}" -fi # Otherwise, do extended testing - -rm -rf ${SRC}/models-mnt +if [ -z ${GG_BUILD_LOW_PERF} ]; then + rm -rf ${SRC}/models-mnt -mnt_models=${MNT}/models -mkdir -p ${mnt_models} -ln -sfn ${mnt_models} ${SRC}/models-mnt + mnt_models=${MNT}/models + mkdir -p ${mnt_models} + ln -sfn ${mnt_models} ${SRC}/models-mnt -# Create a fresh python3 venv and enter it -rm -rf "${MNT}/venv" -python3 -m venv "${MNT}/venv" -source "${MNT}/venv/bin/activate" + python3 -m pip install -r ${SRC}/requirements.txt + python3 -m pip install --editable gguf-py +fi -pip install --disable-pip-version-check -r ${SRC}/requirements.txt -pip install --disable-pip-version-check --editable gguf-py +ret=0 test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_release -# Run tests with open_llama -if [[ -z ${GG_BUILD_VRAM_GB+x} ]] || (( GG_BUILD_VRAM_GB >= 8 )); then - if [[ ! 
-z ${GG_BUILD_CUDA+x} ]]; then - test $ret -eq 0 && gg_run open_llama_7b_v2 - else - test $ret -eq 0 && gg_run open_llama_3b_v2 +if [ -z ${GG_BUILD_LOW_PERF} ]; then + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then + if [ -z ${GG_BUILD_CUDA} ]; then + test $ret -eq 0 && gg_run open_llama_3b_v2 + else + test $ret -eq 0 && gg_run open_llama_7b_v2 + fi fi - test $ret -eq 0 && gg_run ctest_with_model fi exit $ret diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py old mode 100755 new mode 100644 index 1ba5864dc25..206b7d5ff9e --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 import torch import os from pprint import pprint diff --git a/llama.cpp b/llama.cpp index cb0546c952d..d6c192441fb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2372,8 +2372,7 @@ struct llama_model_loader { } } - // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { size_t size_data = 0; for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { @@ -2405,9 +2404,7 @@ struct llama_model_loader { GGML_ASSERT(cur); // unused tensors should have been caught by load_data already if (progress_callback) { - if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { - return false; - } + progress_callback((float) size_done / size_data, progress_callback_user_data); } const size_t offs = file_offset(ggml_get_name(cur)); @@ -2469,11 +2466,8 @@ struct llama_model_loader { } if (progress_callback) { - // Even though the model is done loading, we still honor - // cancellation since we need to free allocations. - return progress_callback(1.0f, progress_callback_user_data); + progress_callback(1.0f, progress_callback_user_data); } - return true; } }; @@ -3050,8 +3044,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -// Returns false if cancelled by progress_callback -static bool llm_load_tensors( +static void llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, @@ -3729,20 +3722,16 @@ static bool llm_load_tensors( model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) { - return false; - } + ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? 
&model.mlock_mmap : NULL); model.mapping = std::move(ml.mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; - return true; } -// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3760,21 +3749,19 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return 0; + return true; } - if (!llm_load_tensors( + llm_load_tensors( ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data - )) { - return -2; - } + ); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return -1; + return false; } - return 0; + return true; } // @@ -9154,18 +9141,11 @@ struct llama_model * llama_load_model_from_file( LLAMA_LOG_INFO("\n"); } } - return true; }; } - int status = llama_model_load(path_model, *model, params); - GGML_ASSERT(status <= 0); - if (status < 0) { - if (status == -1) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); - } else if (status == -2) { - LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); - } + if (!llama_model_load(path_model, *model, params)) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; return nullptr; } diff --git a/llama.h b/llama.h index af76bae2d2a..0be4b1337b9 100644 --- a/llama.h +++ b/llama.h @@ -127,7 +127,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef bool (*llama_progress_callback)(float progress, void *ctx); + typedef void (*llama_progress_callback)(float progress, void *ctx); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -180,9 +180,7 @@ extern "C" { int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) - // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. - // If the provided progress_callback returns true, model loading continues. - // If it returns false, model loading is immediately aborted. 
+ // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; // context pointer passed to the progress callback diff --git a/requirements-convert-llama-ggml-to-gguf.txt b/requirements-convert-llama-ggml-to-gguf.txt deleted file mode 100644 index 8a5377762c1..00000000000 --- a/requirements-convert-llama-ggml-to-gguf.txt +++ /dev/null @@ -1 +0,0 @@ --r requirements-convert.txt diff --git a/requirements-convert-lora-to-ggml.txt b/requirements-convert-lora-to-ggml.txt deleted file mode 100644 index 30827c8964d..00000000000 --- a/requirements-convert-lora-to-ggml.txt +++ /dev/null @@ -1,2 +0,0 @@ --r requirements-convert.txt -torch==2.1.1 diff --git a/requirements-convert-persimmon-to-gguf.txt b/requirements-convert-persimmon-to-gguf.txt deleted file mode 100644 index 30827c8964d..00000000000 --- a/requirements-convert-persimmon-to-gguf.txt +++ /dev/null @@ -1,2 +0,0 @@ --r requirements-convert.txt -torch==2.1.1 diff --git a/requirements-convert.txt b/requirements-convert.txt deleted file mode 100644 index 1a116256671..00000000000 --- a/requirements-convert.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy==1.24.4 -sentencepiece==0.1.98 -transformers>=4.34.0 -gguf>=0.1.0 -protobuf>=4.21.0 diff --git a/requirements-convert-hf-to-gguf.txt b/requirements-hf-to-gguf.txt similarity index 54% rename from requirements-convert-hf-to-gguf.txt rename to requirements-hf-to-gguf.txt index 4d00b196661..f4600539e27 100644 --- a/requirements-convert-hf-to-gguf.txt +++ b/requirements-hf-to-gguf.txt @@ -1,3 +1,3 @@ --r requirements-convert.txt +-r requirements.txt torch==2.1.1 transformers==4.35.2 diff --git a/requirements.txt b/requirements.txt index da4f3f9a874..1a116256671 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,5 @@ -# These requirements include all dependencies for all top-level python scripts -# for llama.cpp. Avoid adding packages here directly. -# -# Package versions must stay compatible across all top-level python scripts. 
-# - --r requirements-convert.txt - --r requirements-convert-hf-to-gguf.txt --r requirements-convert-lora-to-ggml.txt --r requirements-convert-persimmon-to-gguf.txt +numpy==1.24.4 +sentencepiece==0.1.98 +transformers>=4.34.0 +gguf>=0.1.0 +protobuf>=4.21.0 diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index 59be43b9994..00000000000 --- a/tests/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!*.* diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 81a02dae92d..e42237c7a2e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,20 +8,14 @@ endfunction() function(llama_test_executable name source) get_filename_component(TEST_TARGET ${source} NAME_WE) add_test(NAME ${name} COMMAND $ ${ARGN}) - set_property(TEST ${name} PROPERTY LABELS "main") endfunction() function(llama_build_and_test_executable source) - llama_build_and_test_executable_with_label(${source} "main") -endfunction() - -function(llama_build_and_test_executable_with_label source label) get_filename_component(TEST_TARGET ${source} NAME_WE) add_executable(${TEST_TARGET} ${source}) install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE llama common) add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) - set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label}) endfunction() # llama_build_and_test_executable(test-double-float.cpp) # SLOW @@ -57,8 +51,6 @@ llama_build_and_test_executable(test-backend-ops.cpp) llama_build_and_test_executable(test-rope.cpp) -llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model") - # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp deleted file mode 100644 index 509f3e8e031..00000000000 --- a/tests/test-model-load-cancel.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "llama.h" - -#include -#include -#include - -int main(void) { - const char * models_to_try[] = { - // Same default as example/main for local use - "./models/7B/ggml-model-f16.gguf", - // Models for ./ci/run.sh - "./models-mnt/open-llama/3B-v2/ggml-model-q2_k.gguf", - "./models-mnt/open-llama/7B-v2/ggml-model-q2_k.gguf", - }; - - const char * chosen_model; - for (size_t i = 0; i < sizeof(models_to_try) / sizeof(models_to_try[0]); i++) { - const auto * model = models_to_try[i]; - - auto * file = fopen(model, "r"); - if (file == nullptr) { - continue; - } - - chosen_model = model; - fprintf(stderr, "using '%s'\n", model); - fclose(file); - } - - if (chosen_model == nullptr) { - fprintf(stderr, "no model found\n"); - return EXIT_FAILURE; - } - - llama_backend_init(false); - auto params = llama_model_params{}; - params.use_mmap = false; - params.progress_callback = [](float progress, void * ctx){ - (void) ctx; - return progress > 0.05; - }; - - auto * model = llama_load_model_from_file(chosen_model, params); - llama_backend_free(); - return model == nullptr ? 
EXIT_SUCCESS : EXIT_FAILURE; -} From 5f2ee1c938d121f78b3b5dc8230511118efaeb4e Mon Sep 17 00:00:00 2001 From: crasm Date: Fri, 22 Dec 2023 01:00:11 -0500 Subject: [PATCH 27/27] Redo changes for cancelling model load --- llama.cpp | 46 +++++++++++++++++++++++++++++++++------------- llama.h | 6 ++++-- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/llama.cpp b/llama.cpp index d6c192441fb..cb0546c952d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2372,7 +2372,8 @@ struct llama_model_loader { } } - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { + // Returns false if cancelled by progress_callback + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { size_t size_data = 0; for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { @@ -2404,7 +2405,9 @@ struct llama_model_loader { GGML_ASSERT(cur); // unused tensors should have been caught by load_data already if (progress_callback) { - progress_callback((float) size_done / size_data, progress_callback_user_data); + if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { + return false; + } } const size_t offs = file_offset(ggml_get_name(cur)); @@ -2466,8 +2469,11 @@ struct llama_model_loader { } if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. + return progress_callback(1.0f, progress_callback_user_data); } + return true; } }; @@ -3044,7 +3050,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -static void llm_load_tensors( +// Returns false if cancelled by progress_callback +static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, @@ -3722,16 +3729,20 @@ static void llm_load_tensors( model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } - ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL); + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? 
&model.mlock_mmap : NULL)) { + return false; + } model.mapping = std::move(ml.mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; + return true; } -static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback +static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3749,19 +3760,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return true; + return 0; } - llm_load_tensors( + if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data - ); + )) { + return -2; + } } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return false; + return -1; } - return true; + return 0; } // @@ -9141,11 +9154,18 @@ struct llama_model * llama_load_model_from_file( LLAMA_LOG_INFO("\n"); } } + return true; }; } - if (!llama_model_load(path_model, *model, params)) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + int status = llama_model_load(path_model, *model, params); + GGML_ASSERT(status <= 0); + if (status < 0) { + if (status == -1) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + } else if (status == -2) { + LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); + } delete model; return nullptr; } diff --git a/llama.h b/llama.h index 0be4b1337b9..af76bae2d2a 100644 --- a/llama.h +++ b/llama.h @@ -127,7 +127,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef void (*llama_progress_callback)(float progress, void *ctx); + typedef bool (*llama_progress_callback)(float progress, void *ctx); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -180,7 +180,9 @@ extern "C" { int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) - // called with a progress value between 0 and 1, pass NULL to disable + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. + // If the provided progress_callback returns true, model loading continues. + // If it returns false, model loading is immediately aborted. llama_progress_callback progress_callback; // context pointer passed to the progress callback
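---

Usage sketch (not part of the patches): after PATCH 27 the public API exposes the cancellable progress callback, but the dedicated test removed in PATCH 26 is not restored, so the series itself contains no example of driving it. Below is a minimal sketch of how an application might cancel a load partway through, assuming only the llama.h declarations shown above; llama_model_default_params() and llama_free_model() are taken from the existing public API, and the model path, the 50% cutoff, and the logging are illustrative placeholders rather than anything prescribed by this series.

    #include "llama.h"

    #include <cstdio>
    #include <cstdlib>

    int main(void) {
        llama_backend_init(false);

        llama_model_params params = llama_model_default_params();
        params.use_mmap = false;

        // A capture-less lambda converts to the plain function pointer that
        // llama_progress_callback expects. Returning false aborts the load.
        params.progress_callback = [](float progress, void * ctx) {
            (void) ctx;
            fprintf(stderr, "load progress: %.2f\n", progress);
            return progress < 0.50f; // keep loading until ~50%, then cancel
        };
        params.progress_callback_user_data = nullptr;

        // Placeholder path; any local gguf model works for this sketch.
        llama_model * model = llama_load_model_from_file("./models/7B/ggml-model-f16.gguf", params);

        // Cancellation and hard failure both surface as nullptr here; only the
        // log message differs (-2 vs -1 inside llama_model_load).
        if (model == nullptr) {
            fprintf(stderr, "model load was cancelled (or failed)\n");
        } else {
            llama_free_model(model);
        }

        llama_backend_free();
        return EXIT_SUCCESS;
    }

Design note: llama_model_load distinguishes cancellation (-2) from error (-1) so that llama_load_model_from_file can log the two cases differently, but the public return value stays a plain nullptr in both cases, leaving the existing ABI and caller error handling unchanged.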