From 9abe2e44d1cdfe6d87ad99ed47e91684895a6e81 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 14 Dec 2023 04:03:25 -0500 Subject: [PATCH 01/27] llama : Add ability to cancel model load Updated llama_progress_callback so that if it returns false, the model loading is aborted. --- llama.cpp | 45 ++++++++++++++++++++++++++++++++------------- llama.h | 6 ++++-- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0e5ab044cdf..91cd929d178 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2297,7 +2297,8 @@ struct llama_model_loader { } } - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { + // Returns false if cancelled by progress_callback + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { size_t size_data = 0; size_t size_lock = 0; size_t size_pref = 0; // prefetch @@ -2323,7 +2324,9 @@ struct llama_model_loader { GGML_ASSERT(cur); // unused tensors should have been caught by load_data already if (progress_callback) { - progress_callback((float) done_size / size_data, progress_callback_user_data); + if (!progress_callback((float) done_size / size_data, progress_callback_user_data)) { + return false; + } } // allocate temp buffer if not using mmap @@ -2371,6 +2374,7 @@ struct llama_model_loader { done_size += ggml_nbytes(cur); } + return true; } }; @@ -2937,7 +2941,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -static void llm_load_tensors( +// Returns false if cancelled by progress_callback +static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, @@ -2948,6 +2953,8 @@ static void llm_load_tensors( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); + bool ok = true; // if false, model load was cancelled + auto & ctx = model.ctx; auto & hparams = model.hparams; @@ -3678,10 +3685,11 @@ static void llm_load_tensors( } #endif - ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); - + ok = ok && ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL); if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. 
+ ok = ok && progress_callback(1.0f, progress_callback_user_data); } model.mapping = std::move(ml.mapping); @@ -3689,9 +3697,11 @@ static void llm_load_tensors( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; + return ok; } -static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +// Returns -1 on error, -2 on cancellation via llama_progress_callback +static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3712,16 +3722,18 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con return true; } - llm_load_tensors( + if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data - ); + )) { + return -2; + } } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return false; + return -1; } - return true; + return 0; } // @@ -9017,11 +9029,18 @@ struct llama_model * llama_load_model_from_file( LLAMA_LOG_INFO("\n"); } } + return true; }; } - if (!llama_model_load(path_model, *model, params)) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + int status = llama_model_load(path_model, *model, params); + GGML_ASSERT(status <= 0); + if (status < 0) { + if (status == -1) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + } else if (status == -2) { + LLAMA_LOG_INFO("%s, cancelled model load\n", __func__); + } delete model; return nullptr; } diff --git a/llama.h b/llama.h index 45a65cacb7b..18c349d7b11 100644 --- a/llama.h +++ b/llama.h @@ -126,7 +126,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef void (*llama_progress_callback)(float progress, void *ctx); + typedef bool (*llama_progress_callback)(float progress, void *ctx); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -179,7 +179,9 @@ extern "C" { int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) - // called with a progress value between 0 and 1, pass NULL to disable + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. + // If the provided progress_callback returns true, model loading continues. + // If it returns false, model loading is immediately aborted. 
llama_progress_callback progress_callback; // context pointer passed to the progress callback From 3425e627450263e873a9490632c5d060571af0c4 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 14 Dec 2023 04:47:54 -0500 Subject: [PATCH 02/27] llama : Add test for model load cancellation --- tests/CMakeLists.txt | 1 + tests/test-model-load-cancel.cpp | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 tests/test-model-load-cancel.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e42237c7a2e..e854d27d952 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -50,6 +50,7 @@ llama_build_and_test_executable(test-grad0.cpp) llama_build_and_test_executable(test-backend-ops.cpp) llama_build_and_test_executable(test-rope.cpp) +llama_build_and_test_executable(test-model-load-cancel.cpp) # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp new file mode 100644 index 00000000000..8da21af81ff --- /dev/null +++ b/tests/test-model-load-cancel.cpp @@ -0,0 +1,17 @@ +#include "llama.h" + +#include +#include + +int main(void) { + llama_backend_init(false); + auto params = llama_model_params{}; + params.use_mmap = false; + params.progress_callback = [](float progress, void * ctx){ + std::ignore = ctx; + return progress > 0.50; + }; + auto * model = llama_load_model_from_file("../models/7B/ggml-model-f16.gguf", params); + llama_backend_free(); + return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE; +} From 4b1f70cb03a23fc32cc6cf5492a1c5dc86b419a9 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 14 Dec 2023 16:29:05 -0500 Subject: [PATCH 03/27] Fix bool return in llama_model_load, remove std::ignore use --- llama.cpp | 4 ++-- tests/test-model-load-cancel.cpp | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 91cd929d178..3dbbe0a8059 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3700,7 +3700,7 @@ static bool llm_load_tensors( return ok; } -// Returns -1 on error, -2 on cancellation via llama_progress_callback +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3719,7 +3719,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return true; + return 0; } if (!llm_load_tensors( diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index 8da21af81ff..ff24a595502 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -1,17 +1,16 @@ #include "llama.h" #include -#include int main(void) { llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; params.progress_callback = [](float progress, void * ctx){ - std::ignore = ctx; + (void) ctx; return progress > 0.50; }; - auto * model = llama_load_model_from_file("../models/7B/ggml-model-f16.gguf", params); + auto * model = llama_load_model_from_file("models/7B/ggml-model-f16.gguf", params); llama_backend_free(); return model == nullptr ? 
EXIT_SUCCESS : EXIT_FAILURE; } From 1160de38f6d7f717b2fba61dcb1238ba974f8cc1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 17 Dec 2023 21:25:19 +0200 Subject: [PATCH 04/27] Update llama.cpp Co-authored-by: Jared Van Bortel --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 3dbbe0a8059..e67f5e8fce2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9039,7 +9039,7 @@ struct llama_model * llama_load_model_from_file( if (status == -1) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); } else if (status == -2) { - LLAMA_LOG_INFO("%s, cancelled model load\n", __func__); + LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); } delete model; return nullptr; From 32ebd525bf7e5a87ee8a3dbaab3d92ce79fbf23d Mon Sep 17 00:00:00 2001 From: crasm Date: Sun, 17 Dec 2023 14:31:03 -0500 Subject: [PATCH 05/27] Fail test if model file is missing --- tests/test-model-load-cancel.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index ff24a595502..cb3c012b9d9 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -1,8 +1,20 @@ #include "llama.h" +#include #include int main(void) { + auto model_path = "models/7B/ggml-model-f16.gguf"; + auto file = fopen(model_path, "r"); + + if (file == nullptr) { + fprintf(stderr, "no model at '%s' found\n", model_path); + return EXIT_FAILURE; + } else { + fprintf(stderr, "using '%s'\n", model_path); + fclose(file); + } + llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; @@ -10,7 +22,7 @@ int main(void) { (void) ctx; return progress > 0.50; }; - auto * model = llama_load_model_from_file("models/7B/ggml-model-f16.gguf", params); + auto * model = llama_load_model_from_file(model_path, params); llama_backend_free(); return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE; } From 2796953257ee5383fa7c8fe8fa8fc888c048fb0b Mon Sep 17 00:00:00 2001 From: crasm Date: Sun, 17 Dec 2023 14:37:01 -0500 Subject: [PATCH 06/27] Revert "Fail test if model file is missing" This reverts commit 32ebd525bf7e5a87ee8a3dbaab3d92ce79fbf23d. --- tests/test-model-load-cancel.cpp | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index cb3c012b9d9..ff24a595502 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -1,20 +1,8 @@ #include "llama.h" -#include #include int main(void) { - auto model_path = "models/7B/ggml-model-f16.gguf"; - auto file = fopen(model_path, "r"); - - if (file == nullptr) { - fprintf(stderr, "no model at '%s' found\n", model_path); - return EXIT_FAILURE; - } else { - fprintf(stderr, "using '%s'\n", model_path); - fclose(file); - } - llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; @@ -22,7 +10,7 @@ int main(void) { (void) ctx; return progress > 0.50; }; - auto * model = llama_load_model_from_file(model_path, params); + auto * model = llama_load_model_from_file("models/7B/ggml-model-f16.gguf", params); llama_backend_free(); return model == nullptr ? 
EXIT_SUCCESS : EXIT_FAILURE; } From 068e7c408fa4c4f6df4b88fa85da970ff60d27cc Mon Sep 17 00:00:00 2001 From: crasm Date: Sun, 17 Dec 2023 22:22:42 -0500 Subject: [PATCH 07/27] Add test-model-load-cancel to Makefile --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fb775ae5b68..6c126269bdf 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,8 @@ TEST_TARGETS = \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ - tests/test-backend-ops + tests/test-backend-ops \ + tests/test-model-load-cancel # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -724,3 +725,5 @@ tests/test-c.o: tests/test-c.c llama.h tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + +tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o $(OBJS) From fe6a6fb6d185444e6c41d8627efbbb8831dc3c34 Mon Sep 17 00:00:00 2001 From: crasm Date: Sun, 17 Dec 2023 22:24:17 -0500 Subject: [PATCH 08/27] Revert "Revert "Fail test if model file is missing"" This reverts commit 2796953257ee5383fa7c8fe8fa8fc888c048fb0b. --- tests/test-model-load-cancel.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index ff24a595502..cb3c012b9d9 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -1,8 +1,20 @@ #include "llama.h" +#include #include int main(void) { + auto model_path = "models/7B/ggml-model-f16.gguf"; + auto file = fopen(model_path, "r"); + + if (file == nullptr) { + fprintf(stderr, "no model at '%s' found\n", model_path); + return EXIT_FAILURE; + } else { + fprintf(stderr, "using '%s'\n", model_path); + fclose(file); + } + llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; @@ -10,7 +22,7 @@ int main(void) { (void) ctx; return progress > 0.50; }; - auto * model = llama_load_model_from_file("models/7B/ggml-model-f16.gguf", params); + auto * model = llama_load_model_from_file(model_path, params); llama_backend_free(); return model == nullptr ? 
EXIT_SUCCESS : EXIT_FAILURE; } From 6bba3410fac22179ba45f3bd135a30c78538745e Mon Sep 17 00:00:00 2001 From: crasm Date: Sun, 17 Dec 2023 22:33:38 -0500 Subject: [PATCH 09/27] Simplify .gitignore for tests, clang-tidy fixes --- .gitignore | 16 ---------------- tests/.gitignore | 2 ++ tests/test-model-load-cancel.cpp | 10 +++++----- 3 files changed, 7 insertions(+), 21 deletions(-) create mode 100644 tests/.gitignore diff --git a/.gitignore b/.gitignore index 76b3d286182..7b1a9f9e320 100644 --- a/.gitignore +++ b/.gitignore @@ -86,19 +86,3 @@ examples/jeopardy/results.txt poetry.lock poetry.toml - -# Test binaries -/tests/test-grammar-parser -/tests/test-llama-grammar -/tests/test-double-float -/tests/test-grad0 -/tests/test-opt -/tests/test-quantize-fns -/tests/test-quantize-perf -/tests/test-sampling -/tests/test-tokenizer-0-llama -/tests/test-tokenizer-0-falcon -/tests/test-tokenizer-1-llama -/tests/test-tokenizer-1-bpe -/tests/test-rope -/tests/test-backend-ops diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 00000000000..59be43b9994 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,2 @@ +* +!*.* diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index cb3c012b9d9..926a305da86 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -4,17 +4,17 @@ #include int main(void) { - auto model_path = "models/7B/ggml-model-f16.gguf"; - auto file = fopen(model_path, "r"); + const auto * model_path = "models/7B/ggml-model-f16.gguf"; + auto * file = fopen(model_path, "r"); if (file == nullptr) { fprintf(stderr, "no model at '%s' found\n", model_path); return EXIT_FAILURE; - } else { - fprintf(stderr, "using '%s'\n", model_path); - fclose(file); } + fprintf(stderr, "using '%s'\n", model_path); + fclose(file); + llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; From fd9d247dd2ce2bd0d1d10ee394bd84f9e7e55b23 Mon Sep 17 00:00:00 2001 From: crasm Date: Mon, 18 Dec 2023 04:23:20 -0500 Subject: [PATCH 10/27] Label all ctest tests --- tests/CMakeLists.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e854d27d952..81a02dae92d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,14 +8,20 @@ endfunction() function(llama_test_executable name source) get_filename_component(TEST_TARGET ${source} NAME_WE) add_test(NAME ${name} COMMAND $ ${ARGN}) + set_property(TEST ${name} PROPERTY LABELS "main") endfunction() function(llama_build_and_test_executable source) + llama_build_and_test_executable_with_label(${source} "main") +endfunction() + +function(llama_build_and_test_executable_with_label source label) get_filename_component(TEST_TARGET ${source} NAME_WE) add_executable(${TEST_TARGET} ${source}) install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE llama common) add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) + set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label}) endfunction() # llama_build_and_test_executable(test-double-float.cpp) # SLOW @@ -50,7 +56,8 @@ llama_build_and_test_executable(test-grad0.cpp) llama_build_and_test_executable(test-backend-ops.cpp) llama_build_and_test_executable(test-rope.cpp) -llama_build_and_test_executable(test-model-load-cancel.cpp) + +llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model") # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) From 
4b63355f45da7e0e4b0e2396782fc003c8993c66 Mon Sep 17 00:00:00 2001 From: crasm Date: Mon, 18 Dec 2023 04:23:58 -0500 Subject: [PATCH 11/27] ci : ctest uses -L main --- ci/run.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index 2e33438312e..025cb6aa3a6 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -22,9 +22,9 @@ mkdir -p "$2" OUT=$(realpath "$1") MNT=$(realpath "$2") -rm -v $OUT/*.log -rm -v $OUT/*.exit -rm -v $OUT/*.md +rm -fv $OUT/*.log +rm -fv $OUT/*.exit +rm -fv $OUT/*.md sd=`dirname $0` cd $sd/../ @@ -84,7 +84,7 @@ function gg_run_ctest_debug { (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e } @@ -113,9 +113,9 @@ function gg_run_ctest_release { (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log if [ -z ${GG_BUILD_LOW_PERF} ]; then - (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log else - (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log fi set +e From aed3cf838ccf916c96528d1d111782dd2e0ce9a2 Mon Sep 17 00:00:00 2001 From: crasm Date: Mon, 18 Dec 2023 04:45:39 -0500 Subject: [PATCH 12/27] Attempt at writing ctest_with_model --- ci/run.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ci/run.sh b/ci/run.sh index 025cb6aa3a6..2fb21429948 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -131,6 +131,23 @@ function gg_sum_ctest_release { gg_printf '```\n' } +function gg_run_ctest_with_model { + cd ${SRC} + set -e + (time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest_with_model.log + set +e +} + +function gg_sum_ctest_with_model { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs ctest with model files\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-ctest_with_model.log)" + gg_printf '```\n' +} + # open_llama_3b_v2 function gg_run_open_llama_3b_v2 { @@ -508,6 +525,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then else test $ret -eq 0 && gg_run open_llama_7b_v2 fi + test $ret -eq 0 && gg_run ctest_with_model fi fi From f80ff4dc6a545f84e0ea949b0b70c38ffa166c40 Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 19 Dec 2023 01:43:27 -0500 Subject: [PATCH 13/27] ci : get ci/run.sh working with test-model-load-cancel --- ci/run.sh | 245 +++++++++++++++++-------------- requirements.txt | 4 +- tests/test-model-load-cancel.cpp | 36 +++-- 3 files changed, 164 insertions(+), 121 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index 2fb21429948..a1b978a0f9f 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -1,4 +1,4 @@ -#/bin/bash +#!/bin/bash # # sample usage: # @@ -11,6 +11,8 @@ # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +set -u # Fail on unset variables + if [ -z "$2" ]; then echo "usage: $0 " exit 1 @@ -30,8 +32,20 @@ sd=`dirname $0` cd $sd/../ SRC=`pwd` +# Read-only array of quantization types for iteration. +# Use ${quants[@]:1} to skip f16. +declare -ra quants=( f16 q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k ) + ## helpers +# Print an error message to stderr and exit with an error. 
+# usage: die +function die { + local format="$1"; shift + >&2 printf "$format" "$@" + exit 1 +} + # download a file if it does not exist or if it is outdated function gg_wget { local out=$1 @@ -77,7 +91,9 @@ function gg_run { function gg_run_ctest_debug { cd ${SRC} - rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug + rm -rf build-ci-debug + mkdir build-ci-debug + cd build-ci-debug set -e @@ -105,14 +121,16 @@ function gg_sum_ctest_debug { function gg_run_ctest_release { cd ${SRC} - rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + rm -rf build-ci-release + mkdir build-ci-release + cd build-ci-release set -e (time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - if [ -z ${GG_BUILD_LOW_PERF} ]; then + if [[ -z ${GG_BUILD_LOW_PERF+x} ]]; then (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log else (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log @@ -133,6 +151,7 @@ function gg_sum_ctest_release { function gg_run_ctest_with_model { cd ${SRC} + cd build-ci-release set -e (time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest_with_model.log set +e @@ -151,81 +170,70 @@ function gg_sum_ctest_with_model { # open_llama_3b_v2 function gg_run_open_llama_3b_v2 { - cd ${SRC} + # We use absolute paths here to not have to track CWD as much + local models_mnt="$(realpath "${SRC}/models-mnt")" + local path_models="${models_mnt}/open-llama/3B-v2" + local path_wiki="${models_mnt}/wikitext" + local path_wiki_raw="${path_wiki}/wikitext-2-raw" - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin - gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json + mkdir -p "${path_models}" "${path_wiki}" - gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip - unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ - head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin + gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json - path_models="../models-mnt/open-llama/3B-v2" - path_wiki="../models-mnt/wikitext/wikitext-2-raw" + gg_wget "${path_wiki}" 
https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + unzip -o "${path_wiki}/wikitext-2-raw-v1.zip" -d "${path_wiki}" + head -n 60 "${path_wiki_raw}/wiki.test.raw" > "${path_wiki_raw}/wiki.test-60.raw" - rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release + rm -rf "${SRC}/build-ci-release" + mkdir "${SRC}/build-ci-release" + cd "${SRC}/build-ci-release" set -e - (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a "${OUT}/${ci}-cmake.log" + (time make -j ) 2>&1 | tee -a "${OUT}/${ci}-make.log" - python3 ../convert.py ${path_models} + python3 "${SRC}/convert.py" "${path_models}" - model_f16="${path_models}/ggml-model-f16.gguf" - model_q8_0="${path_models}/ggml-model-q8_0.gguf" - model_q4_0="${path_models}/ggml-model-q4_0.gguf" - model_q4_1="${path_models}/ggml-model-q4_1.gguf" - model_q5_0="${path_models}/ggml-model-q5_0.gguf" - model_q5_1="${path_models}/ggml-model-q5_1.gguf" - model_q2_k="${path_models}/ggml-model-q2_k.gguf" - model_q3_k="${path_models}/ggml-model-q3_k.gguf" - model_q4_k="${path_models}/ggml-model-q4_k.gguf" - model_q5_k="${path_models}/ggml-model-q5_k.gguf" - model_q6_k="${path_models}/ggml-model-q6_k.gguf" + # Get the model path for a quantization + # usage: model_for + function model_for { + if (( $# != 1 )); then + die 'model_for takes a single quantization, such as q8_0' + fi + echo -n "${path_models}/ggml-model-$1.gguf" + } - wiki_test_60="${path_wiki}/wiki.test-60.raw" + wiki_test_60="${path_wiki_raw}/wiki.test-60.raw" - ./bin/quantize ${model_f16} ${model_q8_0} q8_0 - ./bin/quantize ${model_f16} ${model_q4_0} q4_0 - ./bin/quantize ${model_f16} ${model_q4_1} q4_1 - ./bin/quantize ${model_f16} ${model_q5_0} q5_0 - ./bin/quantize ${model_f16} ${model_q5_1} q5_1 - ./bin/quantize ${model_f16} ${model_q2_k} q2_k - ./bin/quantize ${model_f16} ${model_q3_k} q3_k - ./bin/quantize ${model_f16} ${model_q4_k} q4_k - ./bin/quantize ${model_f16} ${model_q5_k} q5_k - ./bin/quantize ${model_f16} ${model_q6_k} q6_k + # Quantize q8_0 through q6_k + for q in "${quants[@]:1}"; do + ./bin/quantize "$(model_for f16)" "$(model_for "${q}")" "${q}" + done - (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the 
meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - - (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log - (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log - (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log - (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log - (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log - (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log - (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log - (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log - (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log - (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + # Run basic inference for all quants + for q in "${quants[@]}"; do + ( time \ + ./bin/main --model "$(model_for "${q}")" -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" + ) 2>&1 | tee -a "${OUT}/${ci}-tg-${q}.log" + done - (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + # Run perplexity with wiki_test_60 + for q in "${quants[@]}"; do + ( time \ + ./bin/perplexity --model "$(model_for $q)" -f "${wiki_test_60}" -c 128 -b 128 --chunks 2 + ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + done + + # Run examples/save-load-state with q4_0 + ( time \ + ./bin/save-load-state --model "$(model_for q4_0)" + ) 2>&1 | tee -a "${OUT}/${ci}-save-load-state.log" function check_ppl { qnt="$1" @@ -240,17 +248,11 @@ function gg_run_open_llama_3b_v2 { return 0 } - check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log - check_ppl 
"q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + # Check perplexity results for all quants + for q in "${quants[@]}"; do + check_ppl "$q" "$(cat "${OUT}/${ci}-tg-f16.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-ppl.log" + done # lora function compare_ppl { @@ -267,32 +269,42 @@ function gg_run_open_llama_3b_v2 { return 0 } - path_lora="../models-mnt/open-llama/3B-v2/lora" - path_shakespeare="../models-mnt/shakespeare" + local path_lora="${path_models}/lora" + local path_shakespeare="${models_mnt}/shakespeare" - shakespeare="${path_shakespeare}/shakespeare.txt" - lora_shakespeare="${path_lora}/ggml-adapter-model.bin" + local shakespeare="${path_shakespeare}/shakespeare.txt" + local lora_shakespeare="${path_lora}/ggml-adapter-model.bin" - gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json - gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin - gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt + gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json + gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin + gg_wget "${path_shakespeare}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt - python3 ../convert-lora-to-ggml.py ${path_lora} + python3 "${SRC}/convert-lora-to-ggml.py" "${path_lora}" # f16 - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log - (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log - compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-f16.log" + (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" + compare_ppl "f16 shakespeare" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-f16.log" | grep "^\[1\]")" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-lora-ppl.log" # q8_0 - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log - compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-q8_0.log" + (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-lora-q8_0.log" + compare_ppl 
"q8_0 shakespeare" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-lora-ppl.log" # q8_0 + f16 lora-base - (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log - compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log - + ( time \ + ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" --lora-base "$(model_for f16)" -c 128 -b 128 --chunks 2 + ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" + compare_ppl "q8_0 / f16 base shakespeare" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ + "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" | grep "^\[1\]")" \ + | tee -a "${OUT}/${ci}-lora-ppl.log" set +e } @@ -502,31 +514,42 @@ function gg_sum_open_llama_7b_v2 { ## main -if [ -z ${GG_BUILD_LOW_PERF} ]; then - rm -rf ${SRC}/models-mnt +ret=0 - mnt_models=${MNT}/models - mkdir -p ${mnt_models} - ln -sfn ${mnt_models} ${SRC}/models-mnt +# This is necessary to test if a variable is set while `set -u` is enabled. +# see: https://stackoverflow.com/a/13864829 +# [[ -z ${var+x} ]] evaluates to false if var is set +# [[ ! -z ${var+x} ]] evaluates to true if var is set +if [[ ! -z ${GG_BUILD_LOW_PERF+x} ]]; then + test "${ret}" -eq 0 && gg_run ctest_debug + test "${ret}" -eq 0 && gg_run ctest_release + exit "${ret}" +fi # Otherwise, do extended testing - python3 -m pip install -r ${SRC}/requirements.txt - python3 -m pip install --editable gguf-py -fi +rm -rf ${SRC}/models-mnt -ret=0 +mnt_models=${MNT}/models +mkdir -p ${mnt_models} +ln -sfn ${mnt_models} ${SRC}/models-mnt + +# Create a fresh python3 venv and enter it +python3 -m venv "${MNT}/venv" +source "${MNT}/venv/bin/activate" + +pip install --disable-pip-version-check -r ${SRC}/requirements.txt +pip install --disable-pip-version-check --editable gguf-py test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_release -if [ -z ${GG_BUILD_LOW_PERF} ]; then - if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then - if [ -z ${GG_BUILD_CUDA} ]; then - test $ret -eq 0 && gg_run open_llama_3b_v2 - else - test $ret -eq 0 && gg_run open_llama_7b_v2 - fi - test $ret -eq 0 && gg_run ctest_with_model +# Run tests with open_llama +if [[ -z ${GG_BUILD_VRAM_GB+x} ]] || (( GG_BUILD_VRAM_GB >= 8 )); then + if [[ ! 
-z ${GG_BUILD_CUDA+x} ]]; then + test $ret -eq 0 && gg_run open_llama_7b_v2 + else + test $ret -eq 0 && gg_run open_llama_3b_v2 fi + test $ret -eq 0 && gg_run ctest_with_model fi exit $ret diff --git a/requirements.txt b/requirements.txt index badfec3be80..35713223f8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ +gguf>=0.1.0 numpy==1.24.4 +protobuf==4.25.1 sentencepiece==0.1.98 +torch==2.0.1 transformers>=4.34.0 -gguf>=0.1.0 diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index 926a305da86..509f3e8e031 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -2,27 +2,45 @@ #include #include +#include int main(void) { - const auto * model_path = "models/7B/ggml-model-f16.gguf"; - auto * file = fopen(model_path, "r"); + const char * models_to_try[] = { + // Same default as example/main for local use + "./models/7B/ggml-model-f16.gguf", + // Models for ./ci/run.sh + "./models-mnt/open-llama/3B-v2/ggml-model-q2_k.gguf", + "./models-mnt/open-llama/7B-v2/ggml-model-q2_k.gguf", + }; - if (file == nullptr) { - fprintf(stderr, "no model at '%s' found\n", model_path); - return EXIT_FAILURE; + const char * chosen_model; + for (size_t i = 0; i < sizeof(models_to_try) / sizeof(models_to_try[0]); i++) { + const auto * model = models_to_try[i]; + + auto * file = fopen(model, "r"); + if (file == nullptr) { + continue; + } + + chosen_model = model; + fprintf(stderr, "using '%s'\n", model); + fclose(file); } - fprintf(stderr, "using '%s'\n", model_path); - fclose(file); + if (chosen_model == nullptr) { + fprintf(stderr, "no model found\n"); + return EXIT_FAILURE; + } llama_backend_init(false); auto params = llama_model_params{}; params.use_mmap = false; params.progress_callback = [](float progress, void * ctx){ (void) ctx; - return progress > 0.50; + return progress > 0.05; }; - auto * model = llama_load_model_from_file(model_path, params); + + auto * model = llama_load_model_from_file(chosen_model, params); llama_backend_free(); return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE; } From 121b04d121ea8e52709226eaa2da16026f55abf4 Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 19 Dec 2023 02:19:11 -0500 Subject: [PATCH 14/27] ci : restrict .github/workflows/build.yml ctest to -L main --- .github/workflows/build.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a5090e398c1..1d87419401f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -72,7 +72,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -107,7 +107,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest @@ -141,7 +141,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest -L main --verbose # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. 
@@ -202,7 +202,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose --timeout 900 + ctest -L main --verbose --timeout 900 macOS-latest-cmake-ios: runs-on: macos-latest @@ -394,7 +394,7 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512 run: | cd build - ctest -C Release --verbose --timeout 900 + ctest -L main -C Release --verbose --timeout 900 - name: Test (Intel SDE) id: cmake_test_sde @@ -406,7 +406,7 @@ jobs: 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build - & $sde -future -- ctest -C Release --verbose --timeout 900 + & $sde -future -- ctest -L main -C Release --verbose --timeout 900 - name: Determine tag name id: tag From 1e796259101c9f501f7d1ee70641d0e62f1cd1c6 Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 19 Dec 2023 02:42:07 -0500 Subject: [PATCH 15/27] update requirements.txt --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 35713223f8e..d9b430d52a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -gguf>=0.1.0 numpy==1.24.4 -protobuf==4.25.1 sentencepiece==0.1.98 -torch==2.0.1 +torch>=2.0.0 transformers>=4.34.0 +gguf>=0.1.0 +protobuf>=4.21.0 From 9809314bbf9215f0679238e01d0dfbe2bfee5b54 Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 19 Dec 2023 17:46:36 -0500 Subject: [PATCH 16/27] Disable test-model-load-cancel in make --- Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 6c126269bdf..9a1c28fe598 100644 --- a/Makefile +++ b/Makefile @@ -9,8 +9,9 @@ TEST_TARGETS = \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ - tests/test-backend-ops \ - tests/test-model-load-cancel + tests/test-backend-ops +# # TODO(crasm): determine how to run tests that depend on openllama model files with make + # tests/test-model-load-cancel # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report From 9a056ed708e06904f909d3550f937863c5ac2248 Mon Sep 17 00:00:00 2001 From: crasm Date: Tue, 19 Dec 2023 20:56:22 -0500 Subject: [PATCH 17/27] Remove venv before creation --- ci/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/run.sh b/ci/run.sh index a1b978a0f9f..9c2b4b3cf11 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -533,6 +533,7 @@ mkdir -p ${mnt_models} ln -sfn ${mnt_models} ${SRC}/models-mnt # Create a fresh python3 venv and enter it +rm -rf "${MNT}/venv" python3 -m venv "${MNT}/venv" source "${MNT}/venv/bin/activate" From 293d16fd40666c7afd1d86089e6c9980708c409e Mon Sep 17 00:00:00 2001 From: crasm Date: Wed, 20 Dec 2023 00:00:08 -0500 Subject: [PATCH 18/27] Restructure requirements.txt Top-level now imports the specific additional requirements for each python file. Using `pip install -r requirements.txt` will fail if versions become mismatched in the per-file requirements. 
--- ...hf-to-gguf.txt => convert-hf-to-gguf_requirements.txt | 1 - convert-lora-to-ggml_requirements.txt | 1 + convert_requirements.txt | 5 +++++ requirements.txt | 9 +++------ 4 files changed, 9 insertions(+), 7 deletions(-) rename requirements-hf-to-gguf.txt => convert-hf-to-gguf_requirements.txt (62%) create mode 100644 convert-lora-to-ggml_requirements.txt create mode 100644 convert_requirements.txt diff --git a/requirements-hf-to-gguf.txt b/convert-hf-to-gguf_requirements.txt similarity index 62% rename from requirements-hf-to-gguf.txt rename to convert-hf-to-gguf_requirements.txt index f4600539e27..a54ca806752 100644 --- a/requirements-hf-to-gguf.txt +++ b/convert-hf-to-gguf_requirements.txt @@ -1,3 +1,2 @@ --r requirements.txt torch==2.1.1 transformers==4.35.2 diff --git a/convert-lora-to-ggml_requirements.txt b/convert-lora-to-ggml_requirements.txt new file mode 100644 index 00000000000..8cf89d1906e --- /dev/null +++ b/convert-lora-to-ggml_requirements.txt @@ -0,0 +1 @@ +torch==2.1.1 diff --git a/convert_requirements.txt b/convert_requirements.txt new file mode 100644 index 00000000000..1a116256671 --- /dev/null +++ b/convert_requirements.txt @@ -0,0 +1,5 @@ +numpy==1.24.4 +sentencepiece==0.1.98 +transformers>=4.34.0 +gguf>=0.1.0 +protobuf>=4.21.0 diff --git a/requirements.txt b/requirements.txt index d9b430d52a3..778a20cef6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,3 @@ -numpy==1.24.4 -sentencepiece==0.1.98 -torch>=2.0.0 -transformers>=4.34.0 -gguf>=0.1.0 -protobuf>=4.21.0 +-r convert_requirements.txt +-r convert-hf-to-gguf_requirements.txt +-r convert-lora-to-ggml_requirements.txt From a0eab1ea190692801fa2fe93d1795e480112757a Mon Sep 17 00:00:00 2001 From: crasm Date: Wed, 20 Dec 2023 00:10:31 -0500 Subject: [PATCH 19/27] Make per-python-script requirements work alone This doesn't break the main requirements.txt. --- convert-hf-to-gguf_requirements.txt | 1 + convert-lora-to-ggml_requirements.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/convert-hf-to-gguf_requirements.txt b/convert-hf-to-gguf_requirements.txt index a54ca806752..d295025c978 100644 --- a/convert-hf-to-gguf_requirements.txt +++ b/convert-hf-to-gguf_requirements.txt @@ -1,2 +1,3 @@ +-r convert_requirements.txt torch==2.1.1 transformers==4.35.2 diff --git a/convert-lora-to-ggml_requirements.txt b/convert-lora-to-ggml_requirements.txt index 8cf89d1906e..f9481c12801 100644 --- a/convert-lora-to-ggml_requirements.txt +++ b/convert-lora-to-ggml_requirements.txt @@ -1 +1,2 @@ +-r convert_requirements.txt torch==2.1.1 From ca122dc9e007e34c6ba5a8d4d89bb5f9d50b6d52 Mon Sep 17 00:00:00 2001 From: crasm Date: Wed, 20 Dec 2023 00:14:56 -0500 Subject: [PATCH 20/27] Add comment --- requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/requirements.txt b/requirements.txt index 778a20cef6a..0f0147cd850 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,8 @@ +# These requirements include all dependencies for all top-level python scripts +# for llama.cpp. Avoid adding packages here directly. +# +# Package versions must stay compatible across all top-level python scripts. 
+# -r convert_requirements.txt -r convert-hf-to-gguf_requirements.txt -r convert-lora-to-ggml_requirements.txt From b853df4207bf763d672b0768ff83414971dca90a Mon Sep 17 00:00:00 2001 From: crasm Date: Wed, 20 Dec 2023 03:32:22 -0500 Subject: [PATCH 21/27] Add convert-persimmon-to-gguf.py to new requirements.txt scheme --- convert-persimmon-to-gguf.py | 1 + convert-persimmon-to-gguf_requirements.txt | 2 ++ requirements.txt | 3 +++ 3 files changed, 6 insertions(+) mode change 100644 => 100755 convert-persimmon-to-gguf.py create mode 100644 convert-persimmon-to-gguf_requirements.txt diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py old mode 100644 new mode 100755 index 206b7d5ff9e..1ba5864dc25 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import torch import os from pprint import pprint diff --git a/convert-persimmon-to-gguf_requirements.txt b/convert-persimmon-to-gguf_requirements.txt new file mode 100644 index 00000000000..f9481c12801 --- /dev/null +++ b/convert-persimmon-to-gguf_requirements.txt @@ -0,0 +1,2 @@ +-r convert_requirements.txt +torch==2.1.1 diff --git a/requirements.txt b/requirements.txt index 0f0147cd850..c946b5e4c6c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,9 @@ # # Package versions must stay compatible across all top-level python scripts. # + -r convert_requirements.txt + -r convert-hf-to-gguf_requirements.txt -r convert-lora-to-ggml_requirements.txt +-r convert-persimmon-to-gguf_requirements.txt From c9a6de8f8aed0b96b6369f45e24d7920ed66807b Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 21 Dec 2023 04:16:41 -0500 Subject: [PATCH 22/27] Add check-requirements.sh script and GitHub workflow --- .../workflows/python-check-requirements.yml | 31 ++++ check-requirements.sh | 139 ++++++++++++++++++ convert-lora-to-ggml_requirements.txt | 2 - convert-persimmon-to-gguf_requirements.txt | 2 - ...txt => requirements-convert-hf-to-gguf.txt | 2 +- requirements-convert-llama-ggml-to-gguf.txt | 1 + requirements-convert-lora-to-ggml.txt | 2 + requirements-convert-persimmon-to-gguf.txt | 2 + ...quirements.txt => requirements-convert.txt | 0 requirements.txt | 8 +- 10 files changed, 180 insertions(+), 9 deletions(-) create mode 100644 .github/workflows/python-check-requirements.yml create mode 100755 check-requirements.sh delete mode 100644 convert-lora-to-ggml_requirements.txt delete mode 100644 convert-persimmon-to-gguf_requirements.txt rename convert-hf-to-gguf_requirements.txt => requirements-convert-hf-to-gguf.txt (54%) create mode 100644 requirements-convert-llama-ggml-to-gguf.txt create mode 100644 requirements-convert-lora-to-ggml.txt create mode 100644 requirements-convert-persimmon-to-gguf.txt rename convert_requirements.txt => requirements-convert.txt (100%) diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml new file mode 100644 index 00000000000..c7929a20cce --- /dev/null +++ b/.github/workflows/python-check-requirements.yml @@ -0,0 +1,31 @@ +name: Python check requirements.txt + +on: + push: + paths: + - 'check-requirements.sh' + - 'convert*.py' + - 'requirements*.txt' + pull_request: + paths: + - 'check-requirements.sh' + - 'convert*.py' + - 'requirements*.txt' + +jobs: + python-check-requirements: + runs-on: ubuntu-latest + name: check-requirements + steps: + - name: Install shellcheck + run: | + sudo apt-get update + sudo apt-get install shellcheck + - name: Check out source repository + uses: 
actions/checkout@v3 + - name: Set up Python environment + uses: actions/setup-python@v4 + with: + python-version: "3.11" + - name: Run check-requirements.sh script + run: bash check-requirements.sh diff --git a/check-requirements.sh b/check-requirements.sh new file mode 100755 index 00000000000..881b8f190fd --- /dev/null +++ b/check-requirements.sh @@ -0,0 +1,139 @@ +#!/bin/bash +# +# check-requirements.sh checks all requirements files for each top-level +# convert*.py script. +# +# WARNING: This is quite IO intensive, because a fresh venv is set up for every +# python script. +# +# requires: +# * bash >= 3.2.57 +# * shellcheck +# +# For each script, it creates a fresh venv, `pip install -r` the +# requirements, and finally executes the python script with no arguments to +# check for a `ModuleNotFoundError`. +# + +log() { + local level="$1"; shift + local format="$1"; shift + # shellcheck disable=SC2059 + >&2 printf "$level: $format\n" "$@" +} + +info() { + log 'INFO' "$@" +} + +fatal() { + log 'FATAL' "$@" + exit 1 +} + +cleanup() { + if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then + info "Removing $workdir" + ( + count=0 + rm -rfv "$workdir" | while read -r; do + if (( count++ > 750 )); then + printf '.' + count=0 + fi + done + printf '\n' + )& + wait $! + info "Removed '$workdir'" + fi +} + +abort() { + cleanup + exit 1 +} + +trap abort SIGINT SIGTERM SIGQUIT SIGABRT +trap cleanup EXIT + +set -eu -o pipefail +this="$(realpath "$0")" +readonly this +cd "$(dirname "$this")" + +shellcheck "$this" + +workdir= +if [[ -n ${1+x} ]]; then + arg_dir="$(realpath "$1")" + if [[ ! ( -d $arg_dir && -w $arg_dir ) ]]; then + fatal "$arg_dir is not a valid directory" + fi + workdir="$(mktemp -d "$arg_dir/check-requirements.XXXX")" +else + workdir="$(mktemp -d "/tmp/check-requirements.XXXX")" +fi +readonly workdir + +info "Working directory: $workdir" + +assert_arg_count() { + local argcount="$1"; shift + if (( $# != argcount )); then + fatal "${FUNCNAME[1]}: incorrect number of args" + fi +} + +check_requirements() { + assert_arg_count 2 "$@" + local venv="$1" + local reqs="$2" + + info "$reqs: beginning check" + ( + # shellcheck source=/dev/null + source "$venv/bin/activate" + pip --disable-pip-version-check install -q -r "$reqs" + ) + info "$reqs: OK" +} + +check_convert_script() { + assert_arg_count 1 "$@" + local py="$1" + local pyname="${py%.py}" + + info "$py: beginning check" + + local reqs="requirements-$pyname.txt" + local venv="$workdir/$pyname-venv" + python3 -m venv "$venv" + + check_requirements "$venv" "$reqs" + set +e + ( + # shellcheck source=/dev/null + source "$venv/bin/activate" + py_err="$workdir/$pyname.out" + python "$py" 2> "$py_err" + >&2 cat "$py_err" + grep -e 'ModuleNotFoundError' "$py_err" + ) + set -e + # shellcheck disable=SC2181 + (( $? )) && fatal "$py: some imports not declared in $reqs" + info "$py: imports OK" +} + +# Check requirements.txt +all_venv="$workdir/all-venv" +python3 -m venv "$all_venv" +check_requirements "$all_venv" 'requirements.txt' + +check_convert_script 'convert.py' +for py in convert-*.py; do + check_convert_script "$py" +done + +info "Done! No issues found." 
diff --git a/convert-lora-to-ggml_requirements.txt b/convert-lora-to-ggml_requirements.txt deleted file mode 100644 index f9481c12801..00000000000 --- a/convert-lora-to-ggml_requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ --r convert_requirements.txt -torch==2.1.1 diff --git a/convert-persimmon-to-gguf_requirements.txt b/convert-persimmon-to-gguf_requirements.txt deleted file mode 100644 index f9481c12801..00000000000 --- a/convert-persimmon-to-gguf_requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ --r convert_requirements.txt -torch==2.1.1 diff --git a/convert-hf-to-gguf_requirements.txt b/requirements-convert-hf-to-gguf.txt similarity index 54% rename from convert-hf-to-gguf_requirements.txt rename to requirements-convert-hf-to-gguf.txt index d295025c978..4d00b196661 100644 --- a/convert-hf-to-gguf_requirements.txt +++ b/requirements-convert-hf-to-gguf.txt @@ -1,3 +1,3 @@ --r convert_requirements.txt +-r requirements-convert.txt torch==2.1.1 transformers==4.35.2 diff --git a/requirements-convert-llama-ggml-to-gguf.txt b/requirements-convert-llama-ggml-to-gguf.txt new file mode 100644 index 00000000000..8a5377762c1 --- /dev/null +++ b/requirements-convert-llama-ggml-to-gguf.txt @@ -0,0 +1 @@ +-r requirements-convert.txt diff --git a/requirements-convert-lora-to-ggml.txt b/requirements-convert-lora-to-ggml.txt new file mode 100644 index 00000000000..30827c8964d --- /dev/null +++ b/requirements-convert-lora-to-ggml.txt @@ -0,0 +1,2 @@ +-r requirements-convert.txt +torch==2.1.1 diff --git a/requirements-convert-persimmon-to-gguf.txt b/requirements-convert-persimmon-to-gguf.txt new file mode 100644 index 00000000000..30827c8964d --- /dev/null +++ b/requirements-convert-persimmon-to-gguf.txt @@ -0,0 +1,2 @@ +-r requirements-convert.txt +torch==2.1.1 diff --git a/convert_requirements.txt b/requirements-convert.txt similarity index 100% rename from convert_requirements.txt rename to requirements-convert.txt diff --git a/requirements.txt b/requirements.txt index c946b5e4c6c..da4f3f9a874 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,8 +4,8 @@ # Package versions must stay compatible across all top-level python scripts. 
# --r convert_requirements.txt +-r requirements-convert.txt --r convert-hf-to-gguf_requirements.txt --r convert-lora-to-ggml_requirements.txt --r convert-persimmon-to-gguf_requirements.txt +-r requirements-convert-hf-to-gguf.txt +-r requirements-convert-lora-to-ggml.txt +-r requirements-convert-persimmon-to-gguf.txt From e86b8cd93a5a979de12f18ad5fc73dbacf229448 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 21 Dec 2023 04:28:58 -0500 Subject: [PATCH 23/27] Remove shellcheck installation step from workflow --- .github/workflows/python-check-requirements.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index c7929a20cce..49c992b4ad0 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -17,10 +17,6 @@ jobs: runs-on: ubuntu-latest name: check-requirements steps: - - name: Install shellcheck - run: | - sudo apt-get update - sudo apt-get install shellcheck - name: Check out source repository uses: actions/checkout@v3 - name: Set up Python environment From bdfe4ba85c72df7fb521f7222a9dca017177d734 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 21 Dec 2023 04:55:28 -0500 Subject: [PATCH 24/27] Add nocleanup special arg --- .../workflows/python-check-requirements.yml | 2 +- check-requirements.sh | 25 ++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 49c992b4ad0..cc97ee81005 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -24,4 +24,4 @@ jobs: with: python-version: "3.11" - name: Run check-requirements.sh script - run: bash check-requirements.sh + run: bash check-requirements.sh nocleanup diff --git a/check-requirements.sh b/check-requirements.sh index 881b8f190fd..ac6beb60492 100755 --- a/check-requirements.sh +++ b/check-requirements.sh @@ -6,9 +6,18 @@ # WARNING: This is quite IO intensive, because a fresh venv is set up for every # python script. # +# usage: ./check-requirements.sh [] +# ./check-requirements.sh 'nocleanup' [] +# +# where: +# - is a directory that can be used as the base for +# setting up the venvs. Defaults to `/tmp`. +# - 'nocleanup' as the first argument will disable automatic cleanup +# of the files created by this script. +# # requires: -# * bash >= 3.2.57 -# * shellcheck +# - bash >= 3.2.57 +# - shellcheck # # For each script, it creates a fresh venv, `pip install -r` the # requirements, and finally executes the python script with no arguments to @@ -54,8 +63,12 @@ abort() { exit 1 } -trap abort SIGINT SIGTERM SIGQUIT SIGABRT -trap cleanup EXIT +if [[ $1 == nocleanup ]]; then + shift # discard nocleanup arg +else + trap abort SIGINT SIGTERM SIGQUIT SIGABRT + trap cleanup EXIT +fi set -eu -o pipefail this="$(realpath "$0")" @@ -107,6 +120,10 @@ check_convert_script() { info "$py: beginning check" local reqs="requirements-$pyname.txt" + if [[ ! -r "$reqs" ]]; then + fatal "$py missing requirements. 
Expected: $reqs" + fi + local venv="$workdir/$pyname-venv" python3 -m venv "$venv" From e4382571ca3ca4a0ef97f7b3f26bd19d579891c8 Mon Sep 17 00:00:00 2001 From: crasm Date: Thu, 21 Dec 2023 18:54:27 -0500 Subject: [PATCH 25/27] Fix merge see: https://github.com/ggerganov/llama.cpp/pull/4462#discussion_r1434593573 --- llama.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index c64a1fa0b6f..cb0546c952d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2469,7 +2469,9 @@ struct llama_model_loader { } if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. + return progress_callback(1.0f, progress_callback_user_data); } return true; } @@ -3060,8 +3062,6 @@ static bool llm_load_tensors( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); - bool ok = true; // if false, model load was cancelled - auto & ctx = model.ctx; auto & hparams = model.hparams; @@ -3729,11 +3729,8 @@ static bool llm_load_tensors( model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } - ok = ok && ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL); - if (progress_callback) { - // Even though the model is done loading, we still honor - // cancellation since we need to free allocations. - ok = ok && progress_callback(1.0f, progress_callback_user_data); + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) { + return false; } model.mapping = std::move(ml.mapping); @@ -3741,7 +3738,7 @@ static bool llm_load_tensors( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; - return ok; + return true; } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback From f607e5325214a2c10f8db772061f521f4e7ac7ee Mon Sep 17 00:00:00 2001 From: crasm Date: Fri, 22 Dec 2023 00:58:32 -0500 Subject: [PATCH 26/27] reset to upstream/master --- .github/workflows/build.yml | 12 +- .../workflows/python-check-requirements.yml | 27 -- .gitignore | 16 + Makefile | 4 - check-requirements.sh | 156 ---------- ci/run.sh | 274 ++++++++---------- convert-persimmon-to-gguf.py | 1 - llama.cpp | 46 +-- llama.h | 6 +- requirements-convert-llama-ggml-to-gguf.txt | 1 - requirements-convert-lora-to-ggml.txt | 2 - requirements-convert-persimmon-to-gguf.txt | 2 - requirements-convert.txt | 5 - ...to-gguf.txt => requirements-hf-to-gguf.txt | 2 +- requirements.txt | 16 +- tests/.gitignore | 2 - tests/CMakeLists.txt | 8 - tests/test-model-load-cancel.cpp | 46 --- 18 files changed, 159 insertions(+), 467 deletions(-) delete mode 100644 .github/workflows/python-check-requirements.yml delete mode 100755 check-requirements.sh mode change 100755 => 100644 convert-persimmon-to-gguf.py delete mode 100644 requirements-convert-llama-ggml-to-gguf.txt delete mode 100644 requirements-convert-lora-to-ggml.txt delete mode 100644 requirements-convert-persimmon-to-gguf.txt delete mode 100644 requirements-convert.txt rename requirements-convert-hf-to-gguf.txt => requirements-hf-to-gguf.txt (54%) delete mode 100644 tests/.gitignore delete mode 100644 tests/test-model-load-cancel.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1d87419401f..a5090e398c1 
100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -72,7 +72,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -107,7 +107,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest @@ -141,7 +141,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose + ctest --verbose # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know # how to debug it. @@ -202,7 +202,7 @@ jobs: id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest --verbose --timeout 900 macOS-latest-cmake-ios: runs-on: macos-latest @@ -394,7 +394,7 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512 run: | cd build - ctest -L main -C Release --verbose --timeout 900 + ctest -C Release --verbose --timeout 900 - name: Test (Intel SDE) id: cmake_test_sde @@ -406,7 +406,7 @@ jobs: 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe) cd build - & $sde -future -- ctest -L main -C Release --verbose --timeout 900 + & $sde -future -- ctest -C Release --verbose --timeout 900 - name: Determine tag name id: tag diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml deleted file mode 100644 index cc97ee81005..00000000000 --- a/.github/workflows/python-check-requirements.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Python check requirements.txt - -on: - push: - paths: - - 'check-requirements.sh' - - 'convert*.py' - - 'requirements*.txt' - pull_request: - paths: - - 'check-requirements.sh' - - 'convert*.py' - - 'requirements*.txt' - -jobs: - python-check-requirements: - runs-on: ubuntu-latest - name: check-requirements - steps: - - name: Check out source repository - uses: actions/checkout@v3 - - name: Set up Python environment - uses: actions/setup-python@v4 - with: - python-version: "3.11" - - name: Run check-requirements.sh script - run: bash check-requirements.sh nocleanup diff --git a/.gitignore b/.gitignore index 7b1a9f9e320..76b3d286182 100644 --- a/.gitignore +++ b/.gitignore @@ -86,3 +86,19 @@ examples/jeopardy/results.txt poetry.lock poetry.toml + +# Test binaries +/tests/test-grammar-parser +/tests/test-llama-grammar +/tests/test-double-float +/tests/test-grad0 +/tests/test-opt +/tests/test-quantize-fns +/tests/test-quantize-perf +/tests/test-sampling +/tests/test-tokenizer-0-llama +/tests/test-tokenizer-0-falcon +/tests/test-tokenizer-1-llama +/tests/test-tokenizer-1-bpe +/tests/test-rope +/tests/test-backend-ops diff --git a/Makefile b/Makefile index b5ce2e2dae6..68df7702aa9 100644 --- a/Makefile +++ b/Makefile @@ -10,8 +10,6 @@ TEST_TARGETS = \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ tests/test-backend-ops -# # TODO(crasm): determine how to run tests that depend on openllama model files with make - # tests/test-model-load-cancel # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -732,5 +730,3 @@ tests/test-c.o: tests/test-c.c llama.h 
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) - -tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o $(OBJS) diff --git a/check-requirements.sh b/check-requirements.sh deleted file mode 100755 index ac6beb60492..00000000000 --- a/check-requirements.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash -# -# check-requirements.sh checks all requirements files for each top-level -# convert*.py script. -# -# WARNING: This is quite IO intensive, because a fresh venv is set up for every -# python script. -# -# usage: ./check-requirements.sh [] -# ./check-requirements.sh 'nocleanup' [] -# -# where: -# - is a directory that can be used as the base for -# setting up the venvs. Defaults to `/tmp`. -# - 'nocleanup' as the first argument will disable automatic cleanup -# of the files created by this script. -# -# requires: -# - bash >= 3.2.57 -# - shellcheck -# -# For each script, it creates a fresh venv, `pip install -r` the -# requirements, and finally executes the python script with no arguments to -# check for a `ModuleNotFoundError`. -# - -log() { - local level="$1"; shift - local format="$1"; shift - # shellcheck disable=SC2059 - >&2 printf "$level: $format\n" "$@" -} - -info() { - log 'INFO' "$@" -} - -fatal() { - log 'FATAL' "$@" - exit 1 -} - -cleanup() { - if [[ -n ${workdir+x} && -d $workdir && -w $workdir ]]; then - info "Removing $workdir" - ( - count=0 - rm -rfv "$workdir" | while read -r; do - if (( count++ > 750 )); then - printf '.' - count=0 - fi - done - printf '\n' - )& - wait $! - info "Removed '$workdir'" - fi -} - -abort() { - cleanup - exit 1 -} - -if [[ $1 == nocleanup ]]; then - shift # discard nocleanup arg -else - trap abort SIGINT SIGTERM SIGQUIT SIGABRT - trap cleanup EXIT -fi - -set -eu -o pipefail -this="$(realpath "$0")" -readonly this -cd "$(dirname "$this")" - -shellcheck "$this" - -workdir= -if [[ -n ${1+x} ]]; then - arg_dir="$(realpath "$1")" - if [[ ! ( -d $arg_dir && -w $arg_dir ) ]]; then - fatal "$arg_dir is not a valid directory" - fi - workdir="$(mktemp -d "$arg_dir/check-requirements.XXXX")" -else - workdir="$(mktemp -d "/tmp/check-requirements.XXXX")" -fi -readonly workdir - -info "Working directory: $workdir" - -assert_arg_count() { - local argcount="$1"; shift - if (( $# != argcount )); then - fatal "${FUNCNAME[1]}: incorrect number of args" - fi -} - -check_requirements() { - assert_arg_count 2 "$@" - local venv="$1" - local reqs="$2" - - info "$reqs: beginning check" - ( - # shellcheck source=/dev/null - source "$venv/bin/activate" - pip --disable-pip-version-check install -q -r "$reqs" - ) - info "$reqs: OK" -} - -check_convert_script() { - assert_arg_count 1 "$@" - local py="$1" - local pyname="${py%.py}" - - info "$py: beginning check" - - local reqs="requirements-$pyname.txt" - if [[ ! -r "$reqs" ]]; then - fatal "$py missing requirements. Expected: $reqs" - fi - - local venv="$workdir/$pyname-venv" - python3 -m venv "$venv" - - check_requirements "$venv" "$reqs" - set +e - ( - # shellcheck source=/dev/null - source "$venv/bin/activate" - py_err="$workdir/$pyname.out" - python "$py" 2> "$py_err" - >&2 cat "$py_err" - grep -e 'ModuleNotFoundError' "$py_err" - ) - set -e - # shellcheck disable=SC2181 - (( $? 
)) && fatal "$py: some imports not declared in $reqs" - info "$py: imports OK" -} - -# Check requirements.txt -all_venv="$workdir/all-venv" -python3 -m venv "$all_venv" -check_requirements "$all_venv" 'requirements.txt' - -check_convert_script 'convert.py' -for py in convert-*.py; do - check_convert_script "$py" -done - -info "Done! No issues found." diff --git a/ci/run.sh b/ci/run.sh index 9c2b4b3cf11..2e33438312e 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#/bin/bash # # sample usage: # @@ -11,8 +11,6 @@ # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # -set -u # Fail on unset variables - if [ -z "$2" ]; then echo "usage: $0 " exit 1 @@ -24,28 +22,16 @@ mkdir -p "$2" OUT=$(realpath "$1") MNT=$(realpath "$2") -rm -fv $OUT/*.log -rm -fv $OUT/*.exit -rm -fv $OUT/*.md +rm -v $OUT/*.log +rm -v $OUT/*.exit +rm -v $OUT/*.md sd=`dirname $0` cd $sd/../ SRC=`pwd` -# Read-only array of quantization types for iteration. -# Use ${quants[@]:1} to skip f16. -declare -ra quants=( f16 q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k ) - ## helpers -# Print an error message to stderr and exit with an error. -# usage: die -function die { - local format="$1"; shift - >&2 printf "$format" "$@" - exit 1 -} - # download a file if it does not exist or if it is outdated function gg_wget { local out=$1 @@ -91,16 +77,14 @@ function gg_run { function gg_run_ctest_debug { cd ${SRC} - rm -rf build-ci-debug - mkdir build-ci-debug - cd build-ci-debug + rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug set -e (time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log set +e } @@ -121,19 +105,17 @@ function gg_sum_ctest_debug { function gg_run_ctest_release { cd ${SRC} - rm -rf build-ci-release - mkdir build-ci-release - cd build-ci-release + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release set -e (time cmake -DCMAKE_BUILD_TYPE=Release .. 
) 2>&1 | tee -a $OUT/${ci}-cmake.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - if [[ -z ${GG_BUILD_LOW_PERF+x} ]]; then - (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log + if [ -z ${GG_BUILD_LOW_PERF} ]; then + (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log else - (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log fi set +e @@ -149,91 +131,84 @@ function gg_sum_ctest_release { gg_printf '```\n' } -function gg_run_ctest_with_model { - cd ${SRC} - cd build-ci-release - set -e - (time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest_with_model.log - set +e -} - -function gg_sum_ctest_with_model { - gg_printf '### %s\n\n' "${ci}" - - gg_printf 'Runs ctest with model files\n' - gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" - gg_printf '```\n' - gg_printf '%s\n' "$(cat $OUT/${ci}-ctest_with_model.log)" - gg_printf '```\n' -} - # open_llama_3b_v2 function gg_run_open_llama_3b_v2 { - # We use absolute paths here to not have to track CWD as much - local models_mnt="$(realpath "${SRC}/models-mnt")" - local path_models="${models_mnt}/open-llama/3B-v2" - local path_wiki="${models_mnt}/wikitext" - local path_wiki_raw="${path_wiki}/wikitext-2-raw" + cd ${SRC} - mkdir -p "${path_models}" "${path_wiki}" + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin + gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin - gg_wget "${path_models}" https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json + gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip + unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/ + head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw - gg_wget "${path_wiki}" https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip - unzip -o "${path_wiki}/wikitext-2-raw-v1.zip" -d "${path_wiki}" - head -n 60 "${path_wiki_raw}/wiki.test.raw" > "${path_wiki_raw}/wiki.test-60.raw" + path_models="../models-mnt/open-llama/3B-v2" + path_wiki="../models-mnt/wikitext/wikitext-2-raw" - rm -rf "${SRC}/build-ci-release" - mkdir 
"${SRC}/build-ci-release" - cd "${SRC}/build-ci-release" + rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release set -e - (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a "${OUT}/${ci}-cmake.log" - (time make -j ) 2>&1 | tee -a "${OUT}/${ci}-make.log" - - python3 "${SRC}/convert.py" "${path_models}" + (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log - # Get the model path for a quantization - # usage: model_for - function model_for { - if (( $# != 1 )); then - die 'model_for takes a single quantization, such as q8_0' - fi - echo -n "${path_models}/ggml-model-$1.gguf" - } + python3 ../convert.py ${path_models} - wiki_test_60="${path_wiki_raw}/wiki.test-60.raw" + model_f16="${path_models}/ggml-model-f16.gguf" + model_q8_0="${path_models}/ggml-model-q8_0.gguf" + model_q4_0="${path_models}/ggml-model-q4_0.gguf" + model_q4_1="${path_models}/ggml-model-q4_1.gguf" + model_q5_0="${path_models}/ggml-model-q5_0.gguf" + model_q5_1="${path_models}/ggml-model-q5_1.gguf" + model_q2_k="${path_models}/ggml-model-q2_k.gguf" + model_q3_k="${path_models}/ggml-model-q3_k.gguf" + model_q4_k="${path_models}/ggml-model-q4_k.gguf" + model_q5_k="${path_models}/ggml-model-q5_k.gguf" + model_q6_k="${path_models}/ggml-model-q6_k.gguf" - # Quantize q8_0 through q6_k - for q in "${quants[@]:1}"; do - ./bin/quantize "$(model_for f16)" "$(model_for "${q}")" "${q}" - done + wiki_test_60="${path_wiki}/wiki.test-60.raw" - # Run basic inference for all quants - for q in "${quants[@]}"; do - ( time \ - ./bin/main --model "$(model_for "${q}")" -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" - ) 2>&1 | tee -a "${OUT}/${ci}-tg-${q}.log" - done + ./bin/quantize ${model_f16} ${model_q8_0} q8_0 + ./bin/quantize ${model_f16} ${model_q4_0} q4_0 + ./bin/quantize ${model_f16} ${model_q4_1} q4_1 + ./bin/quantize ${model_f16} ${model_q5_0} q5_0 + ./bin/quantize ${model_f16} ${model_q5_1} q5_1 + ./bin/quantize ${model_f16} ${model_q2_k} q2_k + ./bin/quantize ${model_f16} ${model_q3_k} q3_k + ./bin/quantize ${model_f16} ${model_q4_k} q4_k + ./bin/quantize ${model_f16} ${model_q5_k} q5_k + ./bin/quantize ${model_f16} ${model_q6_k} q6_k - # Run perplexity with wiki_test_60 - for q in "${quants[@]}"; do - ( time \ - ./bin/perplexity --model "$(model_for $q)" -f "${wiki_test_60}" -c 128 -b 128 --chunks 2 - ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log - done + (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/main --model ${model_q3_k} 
-s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log + + (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log + (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log + (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log + (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log + (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log + (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log + (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log + (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log + (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log + (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log + (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log - # Run examples/save-load-state with q4_0 - ( time \ - ./bin/save-load-state --model "$(model_for q4_0)" - ) 2>&1 | tee -a "${OUT}/${ci}-save-load-state.log" + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -248,11 +223,17 @@ function gg_run_open_llama_3b_v2 { return 0 } - # Check perplexity results for all quants - for q in "${quants[@]}"; do - check_ppl "$q" "$(cat "${OUT}/${ci}-tg-f16.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-ppl.log" - done + check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log + check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # 
lora function compare_ppl { @@ -269,42 +250,32 @@ function gg_run_open_llama_3b_v2 { return 0 } - local path_lora="${path_models}/lora" - local path_shakespeare="${models_mnt}/shakespeare" + path_lora="../models-mnt/open-llama/3B-v2/lora" + path_shakespeare="../models-mnt/shakespeare" - local shakespeare="${path_shakespeare}/shakespeare.txt" - local lora_shakespeare="${path_lora}/ggml-adapter-model.bin" + shakespeare="${path_shakespeare}/shakespeare.txt" + lora_shakespeare="${path_lora}/ggml-adapter-model.bin" - gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json - gg_wget "${path_lora}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin - gg_wget "${path_shakespeare}" https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json + gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin + gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt - python3 "${SRC}/convert-lora-to-ggml.py" "${path_lora}" + python3 ../convert-lora-to-ggml.py ${path_lora} # f16 - (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-f16.log" - (time ./bin/perplexity --model "$(model_for f16)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" - compare_ppl "f16 shakespeare" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-f16.log" | grep "^\[1\]")" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-f16.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-lora-ppl.log" + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log + (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log + compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log # q8_0 - (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-q8_0.log" - (time ./bin/perplexity --model "$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a "$OUT/${ci}-ppl-shakespeare-lora-q8_0.log" - compare_ppl "q8_0 shakespeare" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-lora-ppl.log" + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log + compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log # q8_0 + f16 lora-base - ( time \ - ./bin/perplexity --model 
"$(model_for q8_0)" -f "${shakespeare}" --lora "${lora_shakespeare}" --lora-base "$(model_for f16)" -c 128 -b 128 --chunks 2 - ) 2>&1 | tee -a "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" - compare_ppl "q8_0 / f16 base shakespeare" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-q8_0.log" | grep "^\[1\]")" \ - "$(cat "${OUT}/${ci}-ppl-shakespeare-lora-q8_0-f16.log" | grep "^\[1\]")" \ - | tee -a "${OUT}/${ci}-lora-ppl.log" + (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log + compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log + set +e } @@ -514,43 +485,30 @@ function gg_sum_open_llama_7b_v2 { ## main -ret=0 - -# This is necessary to test if a variable is set while `set -u` is enabled. -# see: https://stackoverflow.com/a/13864829 -# [[ -z ${var+x} ]] evaluates to false if var is set -# [[ ! -z ${var+x} ]] evaluates to true if var is set -if [[ ! -z ${GG_BUILD_LOW_PERF+x} ]]; then - test "${ret}" -eq 0 && gg_run ctest_debug - test "${ret}" -eq 0 && gg_run ctest_release - exit "${ret}" -fi # Otherwise, do extended testing - -rm -rf ${SRC}/models-mnt +if [ -z ${GG_BUILD_LOW_PERF} ]; then + rm -rf ${SRC}/models-mnt -mnt_models=${MNT}/models -mkdir -p ${mnt_models} -ln -sfn ${mnt_models} ${SRC}/models-mnt + mnt_models=${MNT}/models + mkdir -p ${mnt_models} + ln -sfn ${mnt_models} ${SRC}/models-mnt -# Create a fresh python3 venv and enter it -rm -rf "${MNT}/venv" -python3 -m venv "${MNT}/venv" -source "${MNT}/venv/bin/activate" + python3 -m pip install -r ${SRC}/requirements.txt + python3 -m pip install --editable gguf-py +fi -pip install --disable-pip-version-check -r ${SRC}/requirements.txt -pip install --disable-pip-version-check --editable gguf-py +ret=0 test $ret -eq 0 && gg_run ctest_debug test $ret -eq 0 && gg_run ctest_release -# Run tests with open_llama -if [[ -z ${GG_BUILD_VRAM_GB+x} ]] || (( GG_BUILD_VRAM_GB >= 8 )); then - if [[ ! 
-z ${GG_BUILD_CUDA+x} ]]; then - test $ret -eq 0 && gg_run open_llama_7b_v2 - else - test $ret -eq 0 && gg_run open_llama_3b_v2 +if [ -z ${GG_BUILD_LOW_PERF} ]; then + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then + if [ -z ${GG_BUILD_CUDA} ]; then + test $ret -eq 0 && gg_run open_llama_3b_v2 + else + test $ret -eq 0 && gg_run open_llama_7b_v2 + fi fi - test $ret -eq 0 && gg_run ctest_with_model fi exit $ret diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py old mode 100755 new mode 100644 index 1ba5864dc25..206b7d5ff9e --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 import torch import os from pprint import pprint diff --git a/llama.cpp b/llama.cpp index cb0546c952d..d6c192441fb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2372,8 +2372,7 @@ struct llama_model_loader { } } - // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { + void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { size_t size_data = 0; for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { @@ -2405,9 +2404,7 @@ struct llama_model_loader { GGML_ASSERT(cur); // unused tensors should have been caught by load_data already if (progress_callback) { - if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { - return false; - } + progress_callback((float) size_done / size_data, progress_callback_user_data); } const size_t offs = file_offset(ggml_get_name(cur)); @@ -2469,11 +2466,8 @@ struct llama_model_loader { } if (progress_callback) { - // Even though the model is done loading, we still honor - // cancellation since we need to free allocations. - return progress_callback(1.0f, progress_callback_user_data); + progress_callback(1.0f, progress_callback_user_data); } - return true; } }; @@ -3050,8 +3044,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -// Returns false if cancelled by progress_callback -static bool llm_load_tensors( +static void llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, @@ -3729,20 +3722,16 @@ static bool llm_load_tensors( model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) { - return false; - } + ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? 
&model.mlock_mmap : NULL); model.mapping = std::move(ml.mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; - return true; } -// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3760,21 +3749,19 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return 0; + return true; } - if (!llm_load_tensors( + llm_load_tensors( ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data - )) { - return -2; - } + ); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return -1; + return false; } - return 0; + return true; } // @@ -9154,18 +9141,11 @@ struct llama_model * llama_load_model_from_file( LLAMA_LOG_INFO("\n"); } } - return true; }; } - int status = llama_model_load(path_model, *model, params); - GGML_ASSERT(status <= 0); - if (status < 0) { - if (status == -1) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); - } else if (status == -2) { - LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); - } + if (!llama_model_load(path_model, *model, params)) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; return nullptr; } diff --git a/llama.h b/llama.h index af76bae2d2a..0be4b1337b9 100644 --- a/llama.h +++ b/llama.h @@ -127,7 +127,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef bool (*llama_progress_callback)(float progress, void *ctx); + typedef void (*llama_progress_callback)(float progress, void *ctx); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -180,9 +180,7 @@ extern "C" { int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) - // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. - // If the provided progress_callback returns true, model loading continues. - // If it returns false, model loading is immediately aborted. 
+ // called with a progress value between 0 and 1, pass NULL to disable llama_progress_callback progress_callback; // context pointer passed to the progress callback diff --git a/requirements-convert-llama-ggml-to-gguf.txt b/requirements-convert-llama-ggml-to-gguf.txt deleted file mode 100644 index 8a5377762c1..00000000000 --- a/requirements-convert-llama-ggml-to-gguf.txt +++ /dev/null @@ -1 +0,0 @@ --r requirements-convert.txt diff --git a/requirements-convert-lora-to-ggml.txt b/requirements-convert-lora-to-ggml.txt deleted file mode 100644 index 30827c8964d..00000000000 --- a/requirements-convert-lora-to-ggml.txt +++ /dev/null @@ -1,2 +0,0 @@ --r requirements-convert.txt -torch==2.1.1 diff --git a/requirements-convert-persimmon-to-gguf.txt b/requirements-convert-persimmon-to-gguf.txt deleted file mode 100644 index 30827c8964d..00000000000 --- a/requirements-convert-persimmon-to-gguf.txt +++ /dev/null @@ -1,2 +0,0 @@ --r requirements-convert.txt -torch==2.1.1 diff --git a/requirements-convert.txt b/requirements-convert.txt deleted file mode 100644 index 1a116256671..00000000000 --- a/requirements-convert.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy==1.24.4 -sentencepiece==0.1.98 -transformers>=4.34.0 -gguf>=0.1.0 -protobuf>=4.21.0 diff --git a/requirements-convert-hf-to-gguf.txt b/requirements-hf-to-gguf.txt similarity index 54% rename from requirements-convert-hf-to-gguf.txt rename to requirements-hf-to-gguf.txt index 4d00b196661..f4600539e27 100644 --- a/requirements-convert-hf-to-gguf.txt +++ b/requirements-hf-to-gguf.txt @@ -1,3 +1,3 @@ --r requirements-convert.txt +-r requirements.txt torch==2.1.1 transformers==4.35.2 diff --git a/requirements.txt b/requirements.txt index da4f3f9a874..1a116256671 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,5 @@ -# These requirements include all dependencies for all top-level python scripts -# for llama.cpp. Avoid adding packages here directly. -# -# Package versions must stay compatible across all top-level python scripts. 
-# - --r requirements-convert.txt - --r requirements-convert-hf-to-gguf.txt --r requirements-convert-lora-to-ggml.txt --r requirements-convert-persimmon-to-gguf.txt +numpy==1.24.4 +sentencepiece==0.1.98 +transformers>=4.34.0 +gguf>=0.1.0 +protobuf>=4.21.0 diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index 59be43b9994..00000000000 --- a/tests/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!*.* diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 81a02dae92d..e42237c7a2e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,20 +8,14 @@ endfunction() function(llama_test_executable name source) get_filename_component(TEST_TARGET ${source} NAME_WE) add_test(NAME ${name} COMMAND $ ${ARGN}) - set_property(TEST ${name} PROPERTY LABELS "main") endfunction() function(llama_build_and_test_executable source) - llama_build_and_test_executable_with_label(${source} "main") -endfunction() - -function(llama_build_and_test_executable_with_label source label) get_filename_component(TEST_TARGET ${source} NAME_WE) add_executable(${TEST_TARGET} ${source}) install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE llama common) add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) - set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label}) endfunction() # llama_build_and_test_executable(test-double-float.cpp) # SLOW @@ -57,8 +51,6 @@ llama_build_and_test_executable(test-backend-ops.cpp) llama_build_and_test_executable(test-rope.cpp) -llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model") - # dummy executable - not installed get_filename_component(TEST_TARGET test-c.c NAME_WE) add_executable(${TEST_TARGET} test-c.c) diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp deleted file mode 100644 index 509f3e8e031..00000000000 --- a/tests/test-model-load-cancel.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "llama.h" - -#include -#include -#include - -int main(void) { - const char * models_to_try[] = { - // Same default as example/main for local use - "./models/7B/ggml-model-f16.gguf", - // Models for ./ci/run.sh - "./models-mnt/open-llama/3B-v2/ggml-model-q2_k.gguf", - "./models-mnt/open-llama/7B-v2/ggml-model-q2_k.gguf", - }; - - const char * chosen_model; - for (size_t i = 0; i < sizeof(models_to_try) / sizeof(models_to_try[0]); i++) { - const auto * model = models_to_try[i]; - - auto * file = fopen(model, "r"); - if (file == nullptr) { - continue; - } - - chosen_model = model; - fprintf(stderr, "using '%s'\n", model); - fclose(file); - } - - if (chosen_model == nullptr) { - fprintf(stderr, "no model found\n"); - return EXIT_FAILURE; - } - - llama_backend_init(false); - auto params = llama_model_params{}; - params.use_mmap = false; - params.progress_callback = [](float progress, void * ctx){ - (void) ctx; - return progress > 0.05; - }; - - auto * model = llama_load_model_from_file(chosen_model, params); - llama_backend_free(); - return model == nullptr ? 
EXIT_SUCCESS : EXIT_FAILURE; -} From 5f2ee1c938d121f78b3b5dc8230511118efaeb4e Mon Sep 17 00:00:00 2001 From: crasm Date: Fri, 22 Dec 2023 01:00:11 -0500 Subject: [PATCH 27/27] Redo changes for cancelling model load --- llama.cpp | 46 +++++++++++++++++++++++++++++++++------------- llama.h | 6 ++++-- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/llama.cpp b/llama.cpp index d6c192441fb..cb0546c952d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2372,7 +2372,8 @@ struct llama_model_loader { } } - void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { + // Returns false if cancelled by progress_callback + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { size_t size_data = 0; for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { @@ -2404,7 +2405,9 @@ struct llama_model_loader { GGML_ASSERT(cur); // unused tensors should have been caught by load_data already if (progress_callback) { - progress_callback((float) size_done / size_data, progress_callback_user_data); + if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { + return false; + } } const size_t offs = file_offset(ggml_get_name(cur)); @@ -2466,8 +2469,11 @@ struct llama_model_loader { } if (progress_callback) { - progress_callback(1.0f, progress_callback_user_data); + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. + return progress_callback(1.0f, progress_callback_user_data); } + return true; } }; @@ -3044,7 +3050,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } } -static void llm_load_tensors( +// Returns false if cancelled by progress_callback +static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, @@ -3722,16 +3729,20 @@ static void llm_load_tensors( model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); } - ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL); + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? 
&model.mlock_mmap : NULL)) { + return false; + } model.mapping = std::move(ml.mapping); // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = ggml_time_us() - model.t_start_us; + return true; } -static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { +// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback +static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) { try { llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); @@ -3749,19 +3760,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con if (params.vocab_only) { LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__); - return true; + return 0; } - llm_load_tensors( + if (!llm_load_tensors( ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data - ); + )) { + return -2; + } } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); - return false; + return -1; } - return true; + return 0; } // @@ -9141,11 +9154,18 @@ struct llama_model * llama_load_model_from_file( LLAMA_LOG_INFO("\n"); } } + return true; }; } - if (!llama_model_load(path_model, *model, params)) { - LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + int status = llama_model_load(path_model, *model, params); + GGML_ASSERT(status <= 0); + if (status < 0) { + if (status == -1) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + } else if (status == -2) { + LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); + } delete model; return nullptr; } diff --git a/llama.h b/llama.h index 0be4b1337b9..af76bae2d2a 100644 --- a/llama.h +++ b/llama.h @@ -127,7 +127,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef void (*llama_progress_callback)(float progress, void *ctx); + typedef bool (*llama_progress_callback)(float progress, void *ctx); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -180,7 +180,9 @@ extern "C" { int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) - // called with a progress value between 0 and 1, pass NULL to disable + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. + // If the provided progress_callback returns true, model loading continues. + // If it returns false, model loading is immediately aborted. llama_progress_callback progress_callback; // context pointer passed to the progress callback
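---

Usage sketch (not part of the patches): after PATCH 27 the public API exposes the cancellable progress callback, but the dedicated test removed in PATCH 26 is not restored, so the series itself contains no example of driving it. Below is a minimal sketch of how an application might cancel a load partway through, assuming only the llama.h declarations shown above; llama_model_default_params() and llama_free_model() are taken from the existing public API, and the model path, the 50% cutoff, and the logging are illustrative placeholders rather than anything prescribed by this series.

    #include "llama.h"

    #include <cstdio>
    #include <cstdlib>

    int main(void) {
        llama_backend_init(false);

        llama_model_params params = llama_model_default_params();
        params.use_mmap = false;

        // A capture-less lambda converts to the plain function pointer that
        // llama_progress_callback expects. Returning false aborts the load.
        params.progress_callback = [](float progress, void * ctx) {
            (void) ctx;
            fprintf(stderr, "load progress: %.2f\n", progress);
            return progress < 0.50f; // keep loading until ~50%, then cancel
        };
        params.progress_callback_user_data = nullptr;

        // Placeholder path; any local gguf model works for this sketch.
        llama_model * model = llama_load_model_from_file("./models/7B/ggml-model-f16.gguf", params);

        // Cancellation and hard failure both surface as nullptr here; only the
        // log message differs (-2 vs -1 inside llama_model_load).
        if (model == nullptr) {
            fprintf(stderr, "model load was cancelled (or failed)\n");
        } else {
            llama_free_model(model);
        }

        llama_backend_free();
        return EXIT_SUCCESS;
    }

Design note: llama_model_load distinguishes cancellation (-2) from error (-1) so that llama_load_model_from_file can log the two cases differently, but the public return value stays a plain nullptr in both cases, leaving the existing ABI and caller error handling unchanged.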