Skip to content

Commit b8623fc

Browse files
committed
Initial implementation of a sequence repetition penalty
1 parent b19edd5 commit b8623fc

File tree

5 files changed

+131
-2
lines changed

5 files changed

+131
-2
lines changed

examples/common.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,36 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
250250
break;
251251
}
252252
params.presence_penalty = std::stof(argv[i]);
253+
} else if (arg == "--seqrep-last-n") {
254+
if (++i >= argc) {
255+
invalid_param = true;
256+
break;
257+
}
258+
params.seqrep_last_n = std::stoi(argv[i]);
259+
} else if (arg == "--seqrep-min-len") {
260+
if (++i >= argc) {
261+
invalid_param = true;
262+
break;
263+
}
264+
params.seqrep_min_len = std::stoi(argv[i]);
265+
} else if (arg == "--seqrep-tolerance") {
266+
if (++i >= argc) {
267+
invalid_param = true;
268+
break;
269+
}
270+
params.seqrep_tolerance = std::stoi(argv[i]);
271+
} else if (arg == "--seqrep-ppenalty") {
272+
if (++i >= argc) {
273+
invalid_param = true;
274+
break;
275+
}
276+
params.seqrep_ppenalty = std::stof(argv[i]);
277+
} else if (arg == "--seqrep-lpenalty") {
278+
if (++i >= argc) {
279+
invalid_param = true;
280+
break;
281+
}
282+
params.seqrep_lpenalty = std::stof(argv[i]);
253283
} else if (arg == "--mirostat") {
254284
if (++i >= argc) {
255285
invalid_param = true;
@@ -556,6 +586,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
556586
fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
557587
fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
558588
fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
589+
fprintf(stdout, " --seqrep-last-n N last n tokens to consider for sequence penalizing (default: %d, 0 = disabled, -1 = ctx_size)\n", params.seqrep_last_n);
590+
fprintf(stdout, " --seqrep-min-len N minimum matching sequence length (default: %d, < 2 = disabled)\n", params.seqrep_min_len);
591+
fprintf(stdout, " --seqrep-tolerance N tolerance for fuzzy matching sequences (default: %d, 0 = disabled)\n", params.seqrep_tolerance);
592+
fprintf(stdout, " --seqrep-ppenalty N presence penalty for tokens that can continue a sequence (default: %f, 0.0 = disabled)\n", params.seqrep_ppenalty);
593+
fprintf(stdout, " --seqrep-lpenalty N penalty for tokens that can continue a sequence, multiplied by length (default: %f, 0.0 = disabled)\n", params.seqrep_lpenalty);
559594
fprintf(stdout, " --mirostat N use Mirostat sampling.\n");
560595
fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
561596
fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);

examples/common.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ struct gpt_params {
4444
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
4545
float frequency_penalty = 0.00f; // 0.0 = disabled
4646
float presence_penalty = 0.00f; // 0.0 = disabled
47+
int32_t seqrep_last_n = 256; // last n tokens to consider for sequence matching (0 = disable penalty, -1 = context size)
48+
int32_t seqrep_min_len = 0; // minimum sequence length to match (< 2 is disabled)
49+
int32_t seqrep_tolerance = 0; // tolerance for fuzzy sequence matching (0 = disabled)
50+
float seqrep_ppenalty = 0.0f; // flat penalty (0.0 = disabled)
51+
float seqrep_lpenalty = 0.0f; // stacking penalty based on length (0.0 = disabled)
4752
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
4853
float mirostat_tau = 5.00f; // target entropy
4954
float mirostat_eta = 0.10f; // learning rate

examples/main/main.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -334,8 +334,10 @@ int main(int argc, char ** argv) {
334334
fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
335335
}
336336
}
337-
fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
338-
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
337+
fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, seqrep(last_n = %d, min_len = %d, tolerance = %d, ppenalty = %f, lpenalty = %f), top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
338+
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty,
339+
params.seqrep_last_n, params.seqrep_min_len, params.seqrep_tolerance, params.seqrep_ppenalty, params.seqrep_lpenalty,
340+
params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
339341
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
340342
fprintf(stderr, "\n\n");
341343

@@ -552,6 +554,7 @@ int main(int argc, char ** argv) {
552554
const float typical_p = params.typical_p;
553555
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
554556
const float repeat_penalty = params.repeat_penalty;
557+
const int32_t seqrep_last_n = params.seqrep_last_n < 0 ? n_ctx : params.seqrep_last_n;
555558
const float alpha_presence = params.presence_penalty;
556559
const float alpha_frequency = params.frequency_penalty;
557560
const int mirostat = params.mirostat;
@@ -597,6 +600,11 @@ int main(int argc, char ** argv) {
597600
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
598601
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
599602
last_n_repeat, alpha_frequency, alpha_presence);
603+
auto seqrep_last_n_repeat = std::min(std::min((int)last_n_tokens.size(), seqrep_last_n), n_ctx);
604+
llama_sample_seqrep_penalty(ctx, &candidates_p,
605+
last_n_tokens.data() + last_n_tokens.size() - seqrep_last_n_repeat,
606+
seqrep_last_n_repeat, params.seqrep_min_len, params.seqrep_tolerance,
607+
params.seqrep_ppenalty, params.seqrep_lpenalty);
600608
if (!penalize_nl) {
601609
logits[llama_token_nl()] = nl_logit;
602610
}

llama.cpp

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2651,6 +2651,84 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
26512651
}
26522652
}
26532653

2654+
// Internal helper function for sequence matching.
// Searches backward from `offset` for a sequence of tokens that matches the
// tail of `last_tokens_p` (the run ending at the final token), allowing up to
// `tolerance` fuzzy-match "charges". Returns the matched length, or 0 when no
// match of at least `min_length` is possible from this starting point.
//
// Parameters:
//   last_tokens_p    - token history buffer
//   last_tokens_size - number of tokens in the buffer
//   offset           - index of the last token of the candidate (earlier) sequence
//   min_length       - minimum sequence length worth matching (< 2 disables)
//   tolerance        - budget of non-exact steps allowed during matching
static size_t llama_seqrep_find_match(const llama_token * last_tokens_p, const size_t last_tokens_size, int offset, const size_t min_length, int tolerance) {
    // Reject degenerate inputs: disabled min_length, a history shorter than the
    // minimum, or a start offset that cannot hold min_length tokens before it.
    // (A negative offset becomes huge after the size_t cast and is rejected too.)
    if (min_length < 2 || last_tokens_size < min_length || (size_t)offset < min_length - 1) {
        return 0;
    }

    // tail_offset walks backward from the last token; the candidate sequence at
    // `offset` must end strictly before the tail to be a distinct occurrence.
    int tail_offset = last_tokens_size - 1;
    if (offset >= tail_offset) {
        return 0;
    }
    int matches = 0, wildcard_matches = 0;
    while (offset >= 0) {
        if (last_tokens_p[offset] == last_tokens_p[tail_offset]) {
            // Exact match: step both cursors back. Any pending wildcard steps
            // are confirmed (counted) only now that a real match followed them.
            offset--;
            tail_offset--;
            matches += 1 + wildcard_matches;
            wildcard_matches = 0;
            continue;
        }
        // Mismatch: stop if the fuzz budget is exhausted.
        if (tolerance < 1 || (offset == 0 && tail_offset == 0)) {
            break;
        }
        tolerance--;
        // Spend one tolerance charge on the cheapest way to realign:
        // skip a token on the candidate side, skip one on the tail side,
        // or treat this position as a wildcard and advance both.
        if (offset > 0 && last_tokens_p[offset - 1] == last_tokens_p[tail_offset]) {
            offset--;
        } else if (tail_offset > offset + 1 && last_tokens_p[offset] == last_tokens_p[tail_offset - 1]) {
            tail_offset--;
        } else {
            // A tolerance charge can count as a match, but only if we can find a
            // real match before the search is terminated.
            wildcard_matches++;
            offset--;
            tail_offset--;
        }
    }
    return matches;
}
2692+
2693+
void llama_sample_seqrep_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, size_t min_length, size_t tolerance, float flat_penalty, float length_penalty) {
2694+
if (min_length < 2 || last_tokens_size <= min_length ||
2695+
(flat_penalty == 0.0f && length_penalty == 0.0f)) {
2696+
return;
2697+
}
2698+
2699+
const int64_t t_start_sample_us = ggml_time_us();
2700+
2701+
// This will hold a map of token ids that can continue the sequence with its max seen sequence length.
2702+
std::unordered_map<llama_token, size_t> penalize_tokens;
2703+
2704+
for (size_t offset = last_tokens_size - 2; offset >= min_length - 1; offset--) {
2705+
const size_t matched_length =
2706+
llama_seqrep_find_match(last_tokens_p, last_tokens_size, offset, min_length, tolerance);
2707+
if (matched_length < min_length) {
2708+
continue;
2709+
}
2710+
2711+
// The token one past where we started trying to match is the one that could continue
2712+
// the previously observed sequence.
2713+
llama_token penalize_token = last_tokens_p[offset + 1];
2714+
2715+
auto pt_iter = penalize_tokens.find(penalize_token);
2716+
if (pt_iter == penalize_tokens.end()) {
2717+
penalize_tokens[penalize_token] = matched_length;
2718+
} else {
2719+
penalize_tokens[penalize_token] = pt_iter->second + matched_length;
2720+
}
2721+
}
2722+
for (const auto it : penalize_tokens) {
2723+
candidates->data[it.first].logit -=
2724+
float(it.second) * length_penalty + float(it.second > 0) * flat_penalty;
2725+
}
2726+
2727+
if (ctx) {
2728+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
2729+
}
2730+
}
2731+
26542732
void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
26552733
assert(ctx);
26562734
const int64_t t_start_sample_us = ggml_time_us();

llama.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,9 @@ extern "C" {
407407
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
408408
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
409409

410+
/// @details Sequence repetition penalty: reduces the logits of tokens that would continue a token sequence already present in last_tokens_p. min_length sets the shortest sequence considered (< 2 disables), tolerance allows fuzzy matches, flat_penalty applies once per continuing token, and length_penalty scales with the total matched length.
411+
LLAMA_API void llama_sample_seqrep_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, size_t min_length, size_t tolerance, float flat_penalty, float length_penalty);
412+
410413
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
411414
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
412415
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.

0 commit comments

Comments
 (0)