
Commit 35ce3f6

Merge branch 'master' into Nexes_CQ_10
2 parents 6480054 + 8c475b9 commit 35ce3f6

File tree: 16 files changed (+78, -22 lines)


.github/workflows/build.yml

Lines changed: 5 additions & 0 deletions

@@ -19,6 +19,11 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true

+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  contents: write # for creating release
+
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   GGML_NLOOP: 3

.github/workflows/close-issue.yml

Lines changed: 5 additions & 0 deletions

@@ -3,6 +3,11 @@ on:
   schedule:
     - cron: "42 0 * * *"

+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  issues: write
+
 jobs:
   close-issues:
     runs-on: ubuntu-latest

.github/workflows/nix-ci-aarch64.yml

Lines changed: 7 additions & 0 deletions

@@ -21,6 +21,13 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true

+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
+  id-token: write
+  contents: read
+
 jobs:
   nix-build-aarch64:
     runs-on: ubuntu-latest

.github/workflows/nix-ci.yml

Lines changed: 7 additions & 0 deletions

@@ -12,6 +12,13 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
   cancel-in-progress: true

+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+  # https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
+  id-token: write
+  contents: read
+
 jobs:
   nix-eval:
     strategy:

README.md

Lines changed: 1 addition & 0 deletions

@@ -169,6 +169,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
 - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
+- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

ci/run.sh

Lines changed: 4 additions & 3 deletions

@@ -1,4 +1,4 @@
-#/bin/bash
+#!/bin/bash
 #
 # sample usage:
 #
@@ -751,7 +751,8 @@ function gg_run_rerank_tiny {

     model_f16="${path_models}/ggml-model-f16.gguf"

-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s><s>hi\nwhat is panda?</s><s>it's a bear\nwhat is panda?</s><s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    # for this model, the SEP token is "</s>"
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

     # sample output
     # rerank score 0: 0.029
@@ -774,7 +775,7 @@

     check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
     check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.15" | tee -a $OUT/${ci}-rk-f16.log
+    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log

     set +e
 }

common/arg.cpp

Lines changed: 1 addition & 1 deletion

@@ -911,7 +911,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
     ).set_sparam());
     add_opt(llama_arg(
         {"-s", "--seed"}, "SEED",
-        format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED),
+        format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
         [](gpt_params & params, const std::string & value) {
             params.sparams.seed = std::stoul(value);
         }
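
Note: LLAMA_DEFAULT_SEED is the all-ones 32-bit value, so switching the format specifier from %u to %d makes the help text print -1 instead of 4294967295 (see the server README change below). A minimal stand-alone sketch, assuming only that the default seed is UINT32_MAX:

    // sketch, not llama.cpp code: why %d renders the default seed as -1
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t seed = 0xFFFFFFFF;                        // assumption: matches LLAMA_DEFAULT_SEED
        printf("old help text: default %u\n", seed);             // prints 4294967295
        printf("new help text: default %d\n", (int32_t) seed);   // prints -1
        return 0;
    }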

common/common.cpp

Lines changed: 29 additions & 1 deletion

@@ -838,6 +838,31 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         return iparams;
     }

+    if (params.reranking) {
+        bool ok = true;
+
+        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free_model(model);
+
+            return iparams;
+        }
+    }
+
     auto cparams = llama_context_params_from_gpt_params(params);

     llama_context * lctx = llama_new_context_with_model(model, cparams);
@@ -855,6 +880,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
+
             return iparams;
         }

@@ -867,6 +893,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
+
             return iparams;
         }
     }
@@ -889,7 +916,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }

-    if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
+    if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sparams.ignore_eos = false;
     }
@@ -930,6 +957,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

     iparams.model = model;
     iparams.context = lctx;
+
     return iparams;
 }
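Note: when the new reranking check fails, llama_init_from_gpt_params() frees the model and returns an empty llama_init_result, so callers see it like any other load failure. A caller-side sketch, assuming only the fields visible in this diff (iparams.model, iparams.context):

    // sketch: detecting the reranking bail-out added above
    llama_init_result iparams = llama_init_from_gpt_params(params);
    if (iparams.model == nullptr || iparams.context == nullptr) {
        fprintf(stderr, "model init failed (with --reranking, check that the model has BOS/EOS/SEP tokens)\n");
        return 1;
    }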

examples/server/README.md

Lines changed: 1 addition & 1 deletion

@@ -100,7 +100,7 @@ The project is under active development, and we are [looking for feedback and co
 | Argument | Explanation |
 | -------- | ----------- |
 | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
-| `-s, --seed SEED` | RNG seed (default: 4294967295, use random seed for 4294967295) |
+| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
 | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--penalize-nl` | penalize newline tokens (default: false) |

examples/server/server.cpp

Lines changed: 2 additions & 2 deletions

@@ -2027,15 +2027,15 @@ struct server_context {
                         continue;
                     }

-                    // prompt: <s>query</s><s>doc</s>
+                    // prompt: [BOS]query[EOS][SEP]doc[EOS]
                     prompt_tokens.clear();
                     prompt_tokens.push_back(llama_token_bos(model));
                     {
                         const auto part = tokenize(slot.prompt[0], false);
                         prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
                     }
                     prompt_tokens.push_back(llama_token_eos(model));
-                    prompt_tokens.push_back(llama_token_bos(model));
+                    prompt_tokens.push_back(llama_token_sep(model));
                     {
                         const auto part = tokenize(slot.prompt[1], false);
                         prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
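
Note: the rerank prompt is now framed with the model's own special tokens instead of a hard-coded second BOS. The same layout as a stand-alone helper; build_rerank_prompt is a hypothetical name (not part of server.cpp), and only the llama_token_bos/eos/sep accessors from llama.h are assumed:

    #include <vector>
    #include "llama.h"

    // hypothetical helper: builds [BOS]query[EOS][SEP]doc[EOS] from pre-tokenized parts
    static std::vector<llama_token> build_rerank_prompt(const llama_model * model,
                                                        const std::vector<llama_token> & query,
                                                        const std::vector<llama_token> & doc) {
        std::vector<llama_token> out;
        out.push_back(llama_token_bos(model));
        out.insert(out.end(), query.begin(), query.end());
        out.push_back(llama_token_eos(model));
        out.push_back(llama_token_sep(model));
        out.insert(out.end(), doc.begin(), doc.end());
        out.push_back(llama_token_eos(model));
        return out;
    }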

ggml/include/ggml-alloc.h

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
 // Graph allocator
 /*
   Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

     // optional: create a worst-case graph and reserve the buffers to avoid reallocations
     ggml_gallocr_reserve(galloc, build_graph(max_batch));

ggml/src/ggml-cuda.cu

Lines changed: 2 additions & 0 deletions

@@ -2448,6 +2448,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     GGML_UNUSED(backend);
 }

+#ifdef USE_CUDA_GRAPH
 static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
     graph_node_properties->node_address = node->data;
     graph_node_properties->node_op = node->op;
@@ -2491,6 +2492,7 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
     }
     return true;
 }
+#endif

 static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

ggml/src/ggml-metal.m

Lines changed: 2 additions & 2 deletions

@@ -3258,7 +3258,7 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
 }

 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+    struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));

     const size_t size_page = sysconf(_SC_PAGESIZE);

@@ -3340,7 +3340,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 // buffer from ptr

 ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
-    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+    struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));

     ctx->all_data = data;
     ctx->all_size = size;
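
Note: the malloc-to-calloc change zero-initializes the buffer context, so early-exit and cleanup paths see null pointers and zero sizes rather than indeterminate values. A generic illustration (not the Metal backend itself):

    #include <cstdlib>

    struct buffer_ctx {
        void * all_data;
        size_t all_size;
    };

    int main() {
        // calloc(1, size) zeroes every member; malloc() would leave them indeterminate
        buffer_ctx * ctx = (buffer_ctx *) calloc(1, sizeof(buffer_ctx));
        if (ctx != nullptr && ctx->all_data == nullptr && ctx->all_size == 0) {
            // fields are safe to inspect or free even if later initialization fails
        }
        free(ctx);
        return 0;
    }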

scripts/sync-ggml.last

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-e5c233e5edbfcfa1d808b9293de9065035c40751
+0d7ecbbe536dc84240f646e0ec0a712251377f34

src/llama-vocab.h

Lines changed: 9 additions & 9 deletions

@@ -40,17 +40,17 @@ struct llama_vocab {
     id special_bos_id = 1;
     id special_eos_id = 2;
     id special_unk_id = 0;
-    id special_sep_id = -1;
-    id special_pad_id = -1;
-    id special_cls_id = -1;
-    id special_mask_id = -1;
+    id special_sep_id = LLAMA_TOKEN_NULL;
+    id special_pad_id = LLAMA_TOKEN_NULL;
+    id special_cls_id = LLAMA_TOKEN_NULL;
+    id special_mask_id = LLAMA_TOKEN_NULL;

     id linefeed_id = 13;
-    id special_prefix_id = -1;
-    id special_suffix_id = -1;
-    id special_middle_id = -1;
-    id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
-    id special_eom_id = -1;
+    id special_prefix_id = LLAMA_TOKEN_NULL;
+    id special_suffix_id = LLAMA_TOKEN_NULL;
+    id special_middle_id = LLAMA_TOKEN_NULL;
+    id special_eot_id = LLAMA_TOKEN_NULL; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id = LLAMA_TOKEN_NULL;

     // set of all tokens that cause "end of generation"
     std::set<id> special_eog_ids;
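
Note: replacing the magic value -1 with the named LLAMA_TOKEN_NULL sentinel makes call sites self-describing. A small usage sketch, assuming LLAMA_TOKEN_NULL from llama.h as used elsewhere in this commit:

    // sketch: probe an optional special token via the named sentinel instead of -1
    const llama_token sep_id = llama_token_sep(model);
    if (sep_id == LLAMA_TOKEN_NULL) {
        // the model defines no SEP token; features that need one (e.g. reranking) should be disabled
    }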

src/llama.cpp

Lines changed: 1 addition & 1 deletion

@@ -2412,7 +2412,7 @@ struct llama_hparams {

     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
-    llama_token dec_start_token_id = -1;
+    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
