From 89640b00a1e199a53ceb2062047505d175841d4f Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 4 Oct 2024 17:51:27 +0500 Subject: [PATCH 01/30] Initial XTC commit Adds the XTC sampler. It is not activated by default, but ships with recommended settings as its defaults. --- common/arg.cpp | 21 +++++++++++ common/build-info.cpp | 4 ++ common/common.cpp | 3 ++ common/common.h | 4 ++ common/sampling.cpp | 13 +++++-- include/llama.h | 3 ++ src/llama-sampling.cpp | 83 ++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 common/build-info.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 2a85ad8454908..ffe2518e039f5 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -966,6 +966,27 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.sparams.min_p = std::stof(value); } ).set_sparam()); + add_opt(llama_arg( + {"--xtc-p"}, "N", + format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_p), + [](gpt_params & params, const std::string & value) { + params.sparams.xtc_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--xtc-t"}, "N", + format("xtc threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_t), + [](gpt_params & params, const std::string & value) { + params.sparams.xtc_t = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--xtc-t-max"}, "N", + format("xtc upper threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_t_max), + [](gpt_params & params, const std::string & value) { + params.sparams.xtc_t_max = std::stof(value); + } + ).set_sparam()); add_opt(llama_arg( {"--tfs"}, "N", format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), diff --git a/common/build-info.cpp b/common/build-info.cpp new file mode 100644 index 0000000000000..d839c9babffd9 --- /dev/null +++ b/common/build-info.cpp @@ -0,0 +1,4 @@ +int LLAMA_BUILD_NUMBER = 0; +char const *LLAMA_COMMIT = "unknown"; +char const *LLAMA_COMPILER = "cc (GCC) 14.1.0"; +char const *LLAMA_BUILD_TARGET = "x86_64-w64-mingw32"; diff --git a/common/common.cpp b/common/common.cpp index a0611f3d1734b..33355fd0a0fad 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2060,6 +2060,9 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); + fprintf(stream, "xtc_p: %f # default: 0.0\n", sparams.xtc_p); + fprintf(stream, "xtc_t: %f # default: 0.0\n", sparams.xtc_t); + fprintf(stream, "xtc_t_max: %f # default: 0.0\n", sparams.xtc_t_max); fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? 
"true" : "false"); diff --git a/common/common.h b/common/common.h index 8b84cf9ad45ee..a4bb13afdbb67 100644 --- a/common/common.h +++ b/common/common.h @@ -90,6 +90,7 @@ enum gpt_sampler_type { GPT_SAMPLER_TYPE_TFS_Z = 4, GPT_SAMPLER_TYPE_TYPICAL_P = 5, GPT_SAMPLER_TYPE_TEMPERATURE = 6, + GPT_SAMPLER_TYPE_XTC = 7, }; // dimensionality reduction methods, used by cvector-generator @@ -108,6 +109,9 @@ struct gpt_sampler_params { int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled + float xtc_p = 0.50f; // 0.0 = disabled + float xtc_t = 0.10f; // 1.0 = disabled + float xtc_t_max = 1.00f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities diff --git a/common/sampling.cpp b/common/sampling.cpp index 3dc7f112094e6..fd77e7bf60c5e 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -130,10 +130,10 @@ std::string gpt_sampler_params::print() const { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_p = %.3f, xtc_t = %.3f, xtc_t_max = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - top_k, tfs_z, top_p, min_p, typ_p, temp, + top_k, tfs_z, top_p, min_p, xtc_p, xtc_t, xtc_t_max, typ_p, temp, mirostat, mirostat_eta, mirostat_tau); return std::string(result); @@ -184,6 +184,9 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st case GPT_SAMPLER_TYPE_MIN_P: llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); break; + case GPT_SAMPLER_TYPE_XTC: + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_p, params.xtc_t, params.xtc_t_max, params.min_keep)); + break; case GPT_SAMPLER_TYPE_TFS_Z: llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); break; @@ -372,6 +375,7 @@ char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) { case GPT_SAMPLER_TYPE_TOP_P: return 'p'; case GPT_SAMPLER_TYPE_MIN_P: return 'm'; case GPT_SAMPLER_TYPE_TEMPERATURE: return 't'; + case GPT_SAMPLER_TYPE_XTC: return 'x'; default : return '?'; } } @@ -384,6 +388,7 @@ std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) { case GPT_SAMPLER_TYPE_TOP_P: return "top_p"; case GPT_SAMPLER_TYPE_MIN_P: return "min_p"; case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature"; + case GPT_SAMPLER_TYPE_XTC: return "xtc"; default : return ""; } } @@ -396,6 +401,7 @@ std::vector gpt_sampler_types_from_names(const std::vector gpt_sampler_types_from_chars(const std::string & c { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P }, { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P }, { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P }, - { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE } + { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }, + { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_XTC), GPT_SAMPLER_TYPE_XTC } }; std::vector samplers; diff --git a/include/llama.h b/include/llama.h index 
7cae1bbe2e5b8..ae8e0960d98a9 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1093,6 +1093,9 @@ extern "C" { /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); + /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 + LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, float t_max, size_t min_keep); + /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index e255a8fc4fd54..416a973f64fce 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1059,6 +1059,89 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa }; } +// xtc + +struct llama_sampler_xtc { + const float probability; + const float threshold; + const float threshold_max; + const size_t min_keep; +}; + +static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) { + return "xtc"; +} + +static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + const auto * ctx = (llama_sampler_xtc *) smpl->ctx; + + if (ctx->probability <= 0.0f || ctx->threshold <= 0.0f || cur_p->size <= 1 || ctx->min_keep <= 2) { + return; + } + + std::random_device rd; + float chance = (float)(rd()%100)/100; + if (chance > ctx->probability) return; + // in case it's not sorted/recalculated yet + llama_sampler_softmax_impl(cur_p); + + int removed = 0; + // going through all candidates from back to front, easier to keep the last of probables + for (int i = (cur_p->size - 1); i >= 0; --i) { + if (cur_p->data[i].p >= ctx->threshold && cur_p->data[i].p <= ctx->threshold_max) { + if (removed == 0 || chance <= ctx->probability) { + ++removed; + if (removed >= 2) { + // .logits are used for sorting and calculating .p in llama_sample_softmax_impl + cur_p->data[i].logit = -999.0f; + chance = (float)(rd()%100)/100; + } + } + } + } + + if (removed >= 2) { + // sorting with new logits, ex-last probable will be the first anyway + std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + cur_p->sorted = true; + + // resizing now that penalized tokens are at the back + cur_p->size = cur_p->size - removed + 1; + } +} + +static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { + const auto * ctx = (const llama_sampler_xtc *) smpl->ctx; + return llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->threshold_max, ctx->min_keep); +} + +static void llama_sampler_xtc_free(struct llama_sampler * smpl) { + delete (llama_sampler_xtc *) smpl->ctx; +} + +static struct llama_sampler_i llama_sampler_xtc_i = { + /* .name = */ llama_sampler_xtc_name, + /* .accept = */ nullptr, + /* .apply = */ llama_sample_xtc_apply, + /* .reset = */ nullptr, + /* .clone = 
*/ llama_sampler_xtc_clone, + /* .free = */ llama_sampler_xtc_free, +}; + +struct llama_sampler * llama_sampler_init_xtc(float p, float t, float t_max, size_t min_keep) { + return new llama_sampler { + /* .iface = */ &llama_sampler_xtc_i, + /* .ctx = */ new llama_sampler_xtc { + /* .probability = */ p, + /* .threshold = */ t, + /* .threshold_max = */ t_max, + /* .min_keep = */ min_keep, + }, + }; +} + // mirostat struct llama_sampler_mirostat { From 9455194056bd993815019d6a5b10b9f30af73239 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 4 Oct 2024 17:53:13 +0500 Subject: [PATCH 02/30] Cleanup --- common/build-info.cpp | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 common/build-info.cpp diff --git a/common/build-info.cpp b/common/build-info.cpp deleted file mode 100644 index d839c9babffd9..0000000000000 --- a/common/build-info.cpp +++ /dev/null @@ -1,4 +0,0 @@ -int LLAMA_BUILD_NUMBER = 0; -char const *LLAMA_COMMIT = "unknown"; -char const *LLAMA_COMPILER = "cc (GCC) 14.1.0"; -char const *LLAMA_BUILD_TARGET = "x86_64-w64-mingw32"; From db54ac5df43837764126f2afed9d0541bb8d70fc Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 4 Oct 2024 18:30:46 +0500 Subject: [PATCH 03/30] Simplified chances calculation To be more in line with the original implementation, chance is calculated once at the beginning. --- src/llama-sampling.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 416a973f64fce..6925618d6d63d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1089,18 +1089,15 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data // going through all candidates from back to front, easier to keep the last of probables for (int i = (cur_p->size - 1); i >= 0; --i) { if (cur_p->data[i].p >= ctx->threshold && cur_p->data[i].p <= ctx->threshold_max) { - if (removed == 0 || chance <= ctx->probability) { - ++removed; - if (removed >= 2) { - // .logits are used for sorting and calculating .p in llama_sample_softmax_impl - cur_p->data[i].logit = -999.0f; - chance = (float)(rd()%100)/100; - } + ++removed; + if (removed > 1) { + // .logits are used for sorting and calculating .p in llama_sample_softmax_impl + cur_p->data[i].logit = -999.0f; } } } - if (removed >= 2) { + if (removed > 1) { // sorting with new logits, ex-last probable will be the first anyway std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; From 41e16654bdd4c85f7ea7e1c725862c5ea60db2c3 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 4 Oct 2024 21:34:31 +0500 Subject: [PATCH 04/30] First fixes from review comments Still need to look into sorting --- common/arg.cpp | 2 +- common/common.h | 2 +- src/llama-sampling.cpp | 22 ++++++++++++++-------- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index ffe2518e039f5..3c81010913503 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -975,7 +975,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_sparam()); add_opt(llama_arg( {"--xtc-t"}, "N", - format("xtc threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_t), + format("xtc threshold (default: %.1f, 0.0 or 1.0 = disabled)", (double)params.sparams.xtc_t), [](gpt_params & params, const std::string & value) { params.sparams.xtc_t = std::stof(value); } diff --git a/common/common.h index 
a4bb13afdbb67..7851b4bc455a9 100644 --- a/common/common.h +++ b/common/common.h @@ -110,7 +110,7 @@ struct gpt_sampler_params { float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled float xtc_p = 0.50f; // 0.0 = disabled - float xtc_t = 0.10f; // 1.0 = disabled + float xtc_t = 0.10f; // 0.0 or 1.0 = disabled float xtc_t_max = 1.00f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 6925618d6d63d..e0d5fdaa9deb1 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1075,37 +1075,43 @@ static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { const auto * ctx = (llama_sampler_xtc *) smpl->ctx; - if (ctx->probability <= 0.0f || ctx->threshold <= 0.0f || cur_p->size <= 1 || ctx->min_keep <= 2) { + if (ctx->probability <= 0.0f + || ctx->threshold <= 0.0f + || ctx->threshold >= 1.0f + || ctx->threshold_max <= 0.0f + || ctx->threshold_max <= ctx->threshold + || cur_p->size <= 2 + || ctx->min_keep <= 2) { return; } std::random_device rd; - float chance = (float)(rd()%100)/100; + float chance = (float)(rd()%100 - 1)/100; if (chance > ctx->probability) return; + // in case it's not sorted/recalculated yet llama_sampler_softmax_impl(cur_p); int found = 0; // going through all candidates from back to front, easier to keep the last of probables for (int i = (cur_p->size - 1); i >= 0; --i) { if (cur_p->data[i].p >= ctx->threshold && cur_p->data[i].p <= ctx->threshold_max) { - ++removed; - if (removed > 1) { + ++found; + if (found > 1) { // .logits are used for sorting and calculating .p in llama_sample_softmax_impl cur_p->data[i].logit = -999.0f; } } } - if (removed > 1) { + if (found > 1) { // sorting with new logits, ex-last probable will be the first anyway std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; }); // resizing now that penalized tokens are at the back - cur_p->size = cur_p->size - removed + 1; + cur_p->size = cur_p->size - found + 1; } } From f2a2a618a2311b1eea091180ea4d29303dbfa8d4 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 4 Oct 2024 21:42:54 +0500 Subject: [PATCH 05/30] Fixed trailing whitespace --- src/llama-sampling.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index e0d5fdaa9deb1..9858f0a3dbb2a 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1075,12 +1075,12 @@ static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { const auto * ctx = (llama_sampler_xtc *) smpl->ctx; - if (ctx->probability <= 0.0f - || ctx->threshold <= 0.0f - || ctx->threshold >= 1.0f - || ctx->threshold_max <= 0.0f - || ctx->threshold_max <= ctx->threshold - || cur_p->size <= 2 + if (ctx->probability <= 0.0f + || ctx->threshold <= 0.0f + || ctx->threshold >= 1.0f + || ctx->threshold_max <= 0.0f + || ctx->threshold_max <= ctx->threshold + || cur_p->size <= 2 || ctx->min_keep <= 2) { return; } From 4f8e55b170df0df97355d4d9702d2b7e2bf04652 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 4 Oct 2024 22:38:12 +0500 Subject: [PATCH 06/30] Fixed RNG to be reproducible Thanks 
to @slaren for directions --- common/sampling.cpp | 2 +- src/llama-sampling.cpp | 52 +++++++++++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index fd77e7bf60c5e..0c35044e9b518 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -185,7 +185,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); break; case GPT_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_p, params.xtc_t, params.xtc_t_max, params.min_keep)); + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_p, params.xtc_t, params.xtc_t_max, params.min_keep, params.seed)); break; case GPT_SAMPLER_TYPE_TFS_Z: llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 9858f0a3dbb2a..4372b40c37ef0 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1062,10 +1062,16 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa // xtc struct llama_sampler_xtc { - const float probability; - const float threshold; - const float threshold_max; - const size_t min_keep; + const float probability; + const float threshold; + const float threshold_max; + const size_t min_keep; + + const uint32_t seed; + uint32_t seed_cur; + float chance; + + std::mt19937 rng; }; static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) { @@ -1084,10 +1090,8 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data || ctx->min_keep <= 2) { return; } - - std::random_device rd; - float chance = (float)(rd()%100 - 1)/100; - if (chance > ctx->probability) return; + // chance is calculated on init and on each reset + if (ctx->chance > ctx->probability) return; // in case it's not sorted/recalculated yet llama_sampler_softmax_impl(cur_p); @@ -1117,23 +1121,45 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_xtc *) smpl->ctx; - return llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->threshold_max, ctx->min_keep); + auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->threshold_max, ctx->min_keep, ctx->seed); + + // copy the state + { + auto * result_ctx = (llama_sampler_xtc *) result->ctx; + + result_ctx->rng = ctx->rng; + } + + return result; } static void llama_sampler_xtc_free(struct llama_sampler * smpl) { delete (llama_sampler_xtc *) smpl->ctx; } +static void llama_sampler_xtc_reset(struct llama_sampler * smpl) { + auto * ctx = (llama_sampler_xtc *) smpl->ctx; + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); + + std::uniform_real_distribution<> distance(0.0, 1.0); + ctx->chance = distance(ctx->rng); +} + static struct llama_sampler_i llama_sampler_xtc_i = { /* .name = */ llama_sampler_xtc_name, /* .accept = */ nullptr, /* .apply = */ llama_sample_xtc_apply, - /* .reset = */ nullptr, + /* .reset = */ llama_sampler_xtc_reset, /* .clone = */ llama_sampler_xtc_clone, /* .free = */ llama_sampler_xtc_free, }; -struct llama_sampler * llama_sampler_init_xtc(float p, float t, float t_max, size_t min_keep) { +struct llama_sampler * llama_sampler_init_xtc(float p, 
float t, float t_max, size_t min_keep, uint32_t seed) { + auto seed_cur = get_rng_seed(seed); + std::uniform_real_distribution<> distance(0.0, 1.0); + auto rng = std::mt19937(seed_cur); + float chance = distance(rng); return new llama_sampler { /* .iface = */ &llama_sampler_xtc_i, /* .ctx = */ new llama_sampler_xtc { @@ -1141,6 +1167,10 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, float t_max, siz /* .threshold = */ t, /* .threshold_max = */ t_max, /* .min_keep = */ min_keep, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .chance = */ chance, + /* .rng = */ rng, }, }; } From 6d94ba2e5894030f3be44bab7a225d81598f659d Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 4 Oct 2024 22:51:04 +0500 Subject: [PATCH 07/30] Fixed forgotten header --- include/llama.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index ae8e0960d98a9..b9bceb0bea769 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1094,7 +1094,7 @@ extern "C" { LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 - LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, float t_max, size_t min_keep); + LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, float t_max, size_t min_keep, uint32_t seed); /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. From 49cd2118e0f97d58f13d6a6dad091a288d65aba7 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 4 Oct 2024 23:35:47 +0500 Subject: [PATCH 08/30] Moved `min_keep` Moved from conditions to a simple check at the end. 
--- src/llama-sampling.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 4372b40c37ef0..d7a18e70ebf6d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1086,8 +1086,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data || ctx->threshold >= 1.0f || ctx->threshold_max <= 0.0f || ctx->threshold_max <= ctx->threshold - || cur_p->size <= 2 - || ctx->min_keep <= 2) { + || cur_p->size <= 2) { return; } // chance is calculated on init and on each reset @@ -1116,6 +1115,8 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data // resizing now that penalized tokens are at the back cur_p->size = cur_p->size - found + 1; + + if (cur_p->size < ctx->min_keep) cur_p->size = ctx->min_keep; } } From 74f657cc2460caf3a9ca1ce467ec4866b3b8ecff Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 4 Oct 2024 23:47:19 +0500 Subject: [PATCH 09/30] Fixed broken randomization Thanks to @slaren for explanation --- src/llama-sampling.cpp | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index d7a18e70ebf6d..8349b3162fe3c 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1069,9 +1069,8 @@ struct llama_sampler_xtc { const uint32_t seed; uint32_t seed_cur; - float chance; - std::mt19937 rng; + std::mt19937 rng; }; static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) { @@ -1079,7 +1078,7 @@ static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/ } static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { - const auto * ctx = (llama_sampler_xtc *) smpl->ctx; + auto * ctx = (llama_sampler_xtc *) smpl->ctx; if (ctx->probability <= 0.0f || ctx->threshold <= 0.0f @@ -1089,8 +1088,10 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data || cur_p->size <= 2) { return; } - // chance is calculated on init and on each reset - if (ctx->chance > ctx->probability) return; + + std::uniform_real_distribution distance(0.0f, 1.0f); + float chance = distance(ctx->rng); + if (chance > ctx->probability) return; // in case it's not sorted/recalculated yet llama_sampler_softmax_impl(cur_p); @@ -1142,9 +1143,6 @@ static void llama_sampler_xtc_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_xtc *) smpl->ctx; ctx->seed_cur = get_rng_seed(ctx->seed); ctx->rng.seed(ctx->seed_cur); - - std::uniform_real_distribution<> distance(0.0, 1.0); - ctx->chance = distance(ctx->rng); } static struct llama_sampler_i llama_sampler_xtc_i = { @@ -1158,9 +1156,6 @@ static struct llama_sampler_i llama_sampler_xtc_i = { struct llama_sampler * llama_sampler_init_xtc(float p, float t, float t_max, size_t min_keep, uint32_t seed) { auto seed_cur = get_rng_seed(seed); - std::uniform_real_distribution<> distance(0.0, 1.0); - auto rng = std::mt19937(seed_cur); - float chance = distance(rng); return new llama_sampler { /* .iface = */ &llama_sampler_xtc_i, /* .ctx = */ new llama_sampler_xtc { @@ -1170,8 +1165,7 @@ struct llama_sampler * llama_sampler_init_xtc(float p, float t, float t_max, siz /* .min_keep = */ min_keep, /* .seed = */ seed, /* .seed_cur = */ seed_cur, - /* .chance = */ chance, - /* .rng = */ rng, + /* .rng = */ std::mt19937(seed_cur), }, }; } From 63e60deda33bb06accddbe5cb2806e6d299c7f91 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Sat, 5 Oct 2024 23:27:36 
+0500 Subject: [PATCH 10/30] Swapped sorting for a custom algorithm Shifts tokens to remove the penalized ones, then puts the penalized at the back. Should make `min_keep` still viable. --- src/llama-sampling.cpp | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 8349b3162fe3c..02db44be2a939 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1096,28 +1096,39 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data // in case it's not sorted/recalculated yet llama_sampler_softmax_impl(cur_p); - int found = 0; + std::vector cur; + + int removed = -1; // to keep one out + int pos = 0; + // going through all candidates from back to front, easier to keep the last of probables for (int i = (cur_p->size - 1); i >= 0; --i) { if (cur_p->data[i].p >= ctx->threshold && cur_p->data[i].p <= ctx->threshold_max) { - ++found; - if (found > 1) { + ++removed; + if (removed > 0) { // .logits are used for sorting and calculating .p in llama_sample_softmax_impl cur_p->data[i].logit = -999.0f; + cur.emplace_back(cur_p->data[i]); + pos = i; } } } - if (found > 1) { - // sorting with new logits, ex-last probable will be the first anyway - std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) { - return a.logit > b.logit; - }); + if (removed > 0) { + size_t size_new = cur_p->size - removed; + + // shift tokens to remove the penalized ones + for (size_t i = pos; i < size_new - pos; ++i) { + cur_p->data[i] = cur_p->data[i + removed]; + } - // resizing now that penalized tokens are at the back - cur_p->size = cur_p->size - found + 1; + // put the penalized ones at the back + for (size_t i = 0; i < cur.size(); ++i) { + cur_p->data[cur_p->size - (1 + i)] = cur[i]; + } - if (cur_p->size < ctx->min_keep) cur_p->size = ctx->min_keep; + if (size_new < ctx->min_keep) size_new = ctx->min_keep; + cur_p->size = size_new; } } From 39940e5fa36722bc0283b09397b932044fb83fb0 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Sun, 6 Oct 2024 16:15:12 +0500 Subject: [PATCH 11/30] Algorithm rework 1. Scan tokens from the top till the first non-penalizable 2. Remove the last captured token (the least probable above threshold) 3. Shift all tokens to override the remaining penalizable 4. Penalize and put them at the bottom. 
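For illustration, a minimal standalone sketch of the scan-and-shift scheme described above (a simplified candidate type, with std::rotate standing in for the manual shift; an approximation of the idea, not the code of this patch):

#include <algorithm>
#include <cstdio>
#include <vector>

struct candidate { int id; float logit; float p; };

// candidates are assumed sorted by probability, highest first (as after softmax)
static size_t xtc_sketch(std::vector<candidate> & cands, float threshold, float threshold_max) {
    // 1. scan from the top until the first non-penalizable candidate
    size_t found = 0;
    while (found < cands.size() && cands[found].p >= threshold && cands[found].p <= threshold_max) {
        ++found;
    }
    if (found < 2) return cands.size(); // fewer than two hits: nothing to remove

    // 2. spare the last captured token (the least probable one above the threshold)
    const size_t to_remove = found - 1;

    // 3. shift the survivors over the removed tokens ...
    std::rotate(cands.begin(), cands.begin() + to_remove, cands.end());

    // 4. ... which parks the penalized tokens at the bottom
    for (size_t i = cands.size() - to_remove; i < cands.size(); ++i) {
        cands[i].logit = -999.0f; // same sentinel value the patch uses
    }
    return cands.size() - to_remove; // effective new size
}

int main() {
    std::vector<candidate> cands = { {0, 0.0f, 0.4f}, {1, 0.0f, 0.3f}, {2, 0.0f, 0.2f}, {3, 0.0f, 0.1f} };
    const size_t kept = xtc_sketch(cands, 0.1f, 1.0f);
    printf("kept %zu candidates, new top id = %d\n", kept, cands[0].id); // kept 1, new top id 3
}

With probabilities 0.4/0.3/0.2/0.1 and a threshold of 0.1, only the 0.1 token survives, which matches the first test_xtc expectation added in a later patch of this series.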
--- src/llama-sampling.cpp | 43 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 02db44be2a939..ba8eb5c74c528 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1096,37 +1096,40 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data // in case it's not sorted/recalculated yet llama_sampler_softmax_impl(cur_p); - std::vector cur; - - int removed = -1; // to keep one out + std::vector top_tkns; int pos = 0; - // going through all candidates from back to front, easier to keep the last of probables - for (int i = (cur_p->size - 1); i >= 0; --i) { - if (cur_p->data[i].p >= ctx->threshold && cur_p->data[i].p <= ctx->threshold_max) { - ++removed; - if (removed > 0) { - // .logits are used for sorting and calculating .p in llama_sample_softmax_impl - cur_p->data[i].logit = -999.0f; - cur.emplace_back(cur_p->data[i]); - pos = i; + for (size_t i = 0; i < cur_p->size; ++i) { + if (cur_p->data[i].p >= ctx->threshold) { + if (cur_p->data[i].p <= ctx->threshold_max) { + top_tkns.emplace_back(cur_p->data[i]); + // capture position of the first penalizable + if (pos == -1) pos = i; } - } + } else break; } - if (removed > 0) { - size_t size_new = cur_p->size - removed; + // check if there are enough penalizable tokens + if (top_tkns.size() >= 2) { + // keep the least probable from top ones + top_tkns.pop_back(); + + // define new size + size_t to_remove = top_tkns.size(); + size_t size_new = cur_p->size - to_remove; - // shift tokens to remove the penalized ones + // shift tokens starting from pos for (size_t i = pos; i < size_new - pos; ++i) { - cur_p->data[i] = cur_p->data[i + removed]; + cur_p->data[i] = cur_p->data[i + to_remove]; } - // put the penalized ones at the back - for (size_t i = 0; i < cur.size(); ++i) { - cur_p->data[cur_p->size - (1 + i)] = cur[i]; + // penalize top tokens and put them at the back + for (size_t i = 0; i < top_tkns.size(); ++i) { + top_tkns[i].logit = -999.0f; + cur_p->data[cur_p->size - (1 + i)] = top_tkns[i]; } + // resize if (size_new < ctx->min_keep) size_new = ctx->min_keep; cur_p->size = size_new; } From dbe9ef7783817d53026b49682d4cfb5a74f64672 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 8 Oct 2024 01:19:39 +0500 Subject: [PATCH 12/30] Added XTC to `test-sampling` --- tests/test-sampling.cpp | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 6e021c4c70357..cd6c61ba0c20f 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -111,6 +111,28 @@ static void test_min_p(const std::vector & probs, const std::vector & probs, const std::vector & expected_probs, float p, float t, float t_max) { + const size_t n_vocab = probs.size(); + + std::vector cur; + cur.reserve(n_vocab); + for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { + const float logit = logf(probs[token_id]); + cur.emplace_back(llama_token_data{token_id, logit, 0.0f}); + } + + llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + APPLY(llama_sampler_init_softmax(), &cur_p); + DUMP(&cur_p); + APPLY(llama_sampler_init_xtc(p, t, t_max, 0, 0), &cur_p); + DUMP(&cur_p); + + GGML_ASSERT(cur_p.size == expected_probs.size()); + for (size_t i = 0; i < cur_p.size; i++) { + GGML_ASSERT(fabs(cur_p.data[i].p - expected_probs[i]) < 1e-5); + } +} + static void test_typical(const std::vector & 
probs, const std::vector & expected_probs, float p) { const size_t n_vocab = probs.size(); @@ -279,12 +301,13 @@ static void test_perf() { data.emplace_back(llama_token_data{i, logit, 0.0f}); } - BENCH(llama_sampler_init_top_k (40), data, 32); - BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32); - BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32); - BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32); - BENCH(llama_sampler_init_typical (0.5f, 1), data, 32); - BENCH(llama_sampler_init_softmax (), data, 32); + BENCH(llama_sampler_init_top_k (40), data, 32); + BENCH(llama_sampler_init_top_p (0.8f, 1), data, 32); + BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32); + BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32); + BENCH(llama_sampler_init_typical (0.5f, 1), data, 32); + BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 0.8f, 1, 1), data, 32); + BENCH(llama_sampler_init_softmax (), data, 32); } int main(void) { @@ -309,6 +332,11 @@ int main(void) { test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f); test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.10f, 1.00f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.1f}, 0.99f, 0.10f, 0.35f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.1f}, 0.99f, 0.10f, 0.25f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.2f, 0.1f}, 0.99f, 0.20f, 0.35f); + test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f); From 81a0c2603c967858156c722442358f338e3f4c0c Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 8 Oct 2024 18:38:43 +0500 Subject: [PATCH 13/30] Simplified algorithm and more tests --- src/llama-sampling.cpp | 45 +++++++++++++---------------------------- tests/test-sampling.cpp | 10 ++++++++- 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index ba8eb5c74c528..b152c329a7533 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1081,7 +1081,6 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data auto * ctx = (llama_sampler_xtc *) smpl->ctx; if (ctx->probability <= 0.0f - || ctx->threshold <= 0.0f || ctx->threshold >= 1.0f || ctx->threshold_max <= 0.0f || ctx->threshold_max <= ctx->threshold @@ -1096,43 +1095,27 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data // in case it's not sorted/recalculated yet llama_sampler_softmax_impl(cur_p); - std::vector top_tkns; - int pos = 0; + int pos_first = -1; + int pos_last = 0; for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].p >= ctx->threshold) { - if (cur_p->data[i].p <= ctx->threshold_max) { - top_tkns.emplace_back(cur_p->data[i]); - // capture position of the first penalizable - if (pos == -1) pos = i; - } - } else break; + if (cur_p->data[i].p - ctx->threshold >= -1e-5) { + if (cur_p->data[i].p - ctx->threshold_max > 1e-3) pos_first = i; + pos_last = i; + } else { + break; + } } - // check if there are enough penalizable tokens - if (top_tkns.size() >= 2) { - // keep the least probable from top ones - top_tkns.pop_back(); + size_t to_remove = pos_last - (1 + pos_first); - // define new size - size_t to_remove = top_tkns.size(); - size_t size_new = cur_p->size - to_remove; + if (to_remove < ctx->min_keep || to_remove < 1) return; - // shift tokens starting from pos - for (size_t i = pos; i < size_new - pos; ++i) { - cur_p->data[i] = 
cur_p->data[i + to_remove]; - } - - // penalize top tokens and put them at the back - for (size_t i = 0; i < top_tkns.size(); ++i) { - top_tkns[i].logit = -999.0f; - cur_p->data[cur_p->size - (1 + i)] = top_tkns[i]; - } - - // resize - if (size_new < ctx->min_keep) size_new = ctx->min_keep; - cur_p->size = size_new; + for (size_t i = pos_first + 1; i < cur_p->size - to_remove + 1; ++i) { + cur_p->data[i] = cur_p->data[i + to_remove]; } + + cur_p->size = cur_p->size - to_remove; } static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index cd6c61ba0c20f..5716f7393ab70 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -285,7 +285,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec } const int64_t t_end = ggml_time_us(); llama_sampler_free(cnstr); - printf("%-42s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter); + printf("%-47s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter); } #define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter)) @@ -332,10 +332,18 @@ int main(void) { test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f); test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f); + printf("XTC should:\n"); test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.10f, 1.00f); test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.1f}, 0.99f, 0.10f, 0.35f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.20f, 1.00f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.30f, 1.00f); test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.1f}, 0.99f, 0.10f, 0.25f); test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.2f, 0.1f}, 0.99f, 0.20f, 0.35f); + printf("XTC should not:\n"); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.10f, 0.15f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.20f, 0.25f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.30f, 0.35f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.40f, 1.00f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); From 09bc6d507c787032d5923ceeec81a4c1dd0aebf1 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 8 Oct 2024 20:57:36 +0500 Subject: [PATCH 14/30] Updated info in common and args --- common/arg.cpp | 18 ++---------------- common/common.h | 7 ++----- 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index eb5f35f34364a..c9883b7e5f5c2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -975,7 +975,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_sparam()); add_opt(llama_arg( {"--xtc-t"}, "N", - format("xtc threshold (default: %.1f, 0.0 or 1.0 = disabled)", (double)params.sparams.xtc_t), + format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_t), [](gpt_params & params, const std::string & value) { params.sparams.xtc_t = std::stof(value); } @@ -1859,23 +1859,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); - add_opt(llama_arg( - {"--slots"}, - format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? 
"enabled" : "disabled"), - [](gpt_params & params) { - params.endpoint_slots = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); - add_opt(llama_arg( - {"--props"}, - format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), - [](gpt_params & params) { - params.endpoint_props = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(llama_arg( {"--no-slots"}, - "disables slots monitoring endpoint", + format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), [](gpt_params & params) { params.endpoint_slots = false; } diff --git a/common/common.h b/common/common.h index 4e2bbb9de4674..a4bb13afdbb67 100644 --- a/common/common.h +++ b/common/common.h @@ -110,7 +110,7 @@ struct gpt_sampler_params { float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled float xtc_p = 0.50f; // 0.0 = disabled - float xtc_t = 0.10f; // 0.0 or 1.0 = disabled + float xtc_t = 0.10f; // 1.0 = disabled float xtc_t_max = 1.00f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled @@ -294,10 +294,7 @@ struct gpt_params { std::string ssl_file_key = ""; // NOLINT std::string ssl_file_cert = ""; // NOLINT - // "advanced" endpoints are disabled by default for better security - bool webui = true; - bool endpoint_slots = false; - bool endpoint_props = false; // only control POST requests, not GET + bool endpoint_slots = true; bool endpoint_metrics = false; bool log_json = false; From c19fb26042fa9977f25e7b5460c907128acf73dc Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 8 Oct 2024 21:11:35 +0500 Subject: [PATCH 15/30] Merged back lost commits in common and arg --- common/arg.cpp | 16 +++++++++++++++- common/common.h | 5 ++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index c9883b7e5f5c2..c364e21e91369 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1859,9 +1859,23 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + add_opt(llama_arg( + {"--slots"}, + format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_slots = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); + add_opt(llama_arg( + {"--props"}, + format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_props = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(llama_arg( {"--no-slots"}, - format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? 
"enabled" : "disabled"), + "disables slots monitoring endpoint", [](gpt_params & params) { params.endpoint_slots = false; } diff --git a/common/common.h b/common/common.h index a4bb13afdbb67..1fb2a652c4028 100644 --- a/common/common.h +++ b/common/common.h @@ -294,7 +294,10 @@ struct gpt_params { std::string ssl_file_key = ""; // NOLINT std::string ssl_file_cert = ""; // NOLINT - bool endpoint_slots = true; + // "advanced" endpoints are disabled by default for better security + bool webui = true; + bool endpoint_slots = false; + bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; bool log_json = false; From 6feb6b399c7ec8392da06748b1c25825c12dacae Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 8 Oct 2024 21:15:37 +0500 Subject: [PATCH 16/30] Update dump info in common --- common/common.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 00d3d707f8744..9522f54cbff4a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2088,9 +2088,9 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); - fprintf(stream, "xtc_p: %f # default: 0.0\n", sparams.xtc_p); - fprintf(stream, "xtc_t: %f # default: 0.0\n", sparams.xtc_t); - fprintf(stream, "xtc_t_max: %f # default: 0.0\n", sparams.xtc_t_max); + fprintf(stream, "xtc_p: %f # default: 0.5\n", sparams.xtc_p); + fprintf(stream, "xtc_t: %f # default: 0.1\n", sparams.xtc_t); + fprintf(stream, "xtc_t_max: %f # default: 1.0\n", sparams.xtc_t_max); fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); From d0b1053897655423f841716f0276e66886481d7b Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Wed, 9 Oct 2024 00:59:46 +0500 Subject: [PATCH 17/30] Fixed incorrect min_keep check --- src/llama-sampling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index b152c329a7533..afecff08953b4 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1109,7 +1109,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data size_t to_remove = pos_last - (1 + pos_first); - if (to_remove < ctx->min_keep || to_remove < 1) return; + if (cur_p->size - to_remove < ctx->min_keep || to_remove < 1) return; for (size_t i = pos_first + 1; i < cur_p->size - to_remove + 1; ++i) { cur_p->data[i] = cur_p->data[i + to_remove]; From 37e02e34a18b51553c6ed4bc0debcbf751141e70 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Wed, 9 Oct 2024 14:08:02 +0500 Subject: [PATCH 18/30] Added XTC to README --- examples/main/README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/main/README.md b/examples/main/README.md index f0c3031ab130e..7d79ffa0cc6b6 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -241,6 +241,22 @@ The `--mirostat-ent` option sets the Mirostat target entropy (tau), which repres Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` +### XTC Sampling + +- `--xtc-p N`: Sets the chance for token removal (checked once on sampler start) (default: 0.5). 
+- `--xtc-t N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). +- `--xtc-t-max N`: Sets a maximum probability threshold for tokens to be removed (highly experimental) (default: 1.0). + +Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive answers. With a chance of `xtc-p` it searches for tokens with probabilities of `xtc-t` threshold and above, then removes all such tokens except the least probable one. + +By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last top token XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. + +The additional `xtc-t-max` parameter may help with finetuned models that already give relatively creative output, meaning that clichés and repetitive phrases may appear at lower probabilities. It allows removing tokens from a middle range which will always be specific to a model, requiring careful experimenting. Leave `xtc-t-max` on default 1.0 for all base/instruct models. + +Being experimental and unique, XTC is not included in the default sampling queue. You can start from a recommended combination of Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02`. + +Example usage: `--xtc-p 0.5 --xtc-t 0.1 --xtc-t-max 1.0` + ### Logit Bias - `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion. From 2107882cf552d943e76982a9d61289dff6e1f5c2 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Thu, 10 Oct 2024 19:35:28 +0500 Subject: [PATCH 19/30] Renamed parameters, fixed info and defaults * probability is at 0 by default, but XTC is included in sampling queue * threshold higher than 0.5 switches XTC off --- common/arg.cpp | 18 +++++++++--------- common/common.cpp | 6 +++--- common/common.h | 7 ++++--- common/sampling.cpp | 6 +++--- examples/main/README.md | 16 ++++++++-------- src/llama-sampling.cpp | 2 +- 6 files changed, 28 insertions(+), 27 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index c364e21e91369..6342fad6c5a1c 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -967,24 +967,24 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } ).set_sparam()); add_opt(llama_arg( - {"--xtc-p"}, "N", - format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_p), + {"-xtc-p", "--xtc-probability"}, "N", + format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), [](gpt_params & params, const std::string & value) { - params.sparams.xtc_p = std::stof(value); + params.sparams.xtc_probability = std::stof(value); } ).set_sparam()); add_opt(llama_arg( - {"--xtc-t"}, "N", - format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_t), + {"-xtc-t", "--xtc-threshold"}, "N", + format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), [](gpt_params & params, const std::string & value) { - params.sparams.xtc_t = std::stof(value); + params.sparams.xtc_threshold = std::stof(value); } ).set_sparam()); add_opt(llama_arg( - {"--xtc-t-max"}, "N", + {"-xtc-t-max", 
"--xtc-threshold-max"}, "N", + format("xtc upper threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_threshold_max), [](gpt_params & params, const std::string & value) { - params.sparams.xtc_t_max = std::stof(value); + params.sparams.xtc_threshold_max = std::stof(value); } ).set_sparam()); add_opt(llama_arg( diff --git a/common/common.cpp b/common/common.cpp index 9522f54cbff4a..afe48a895c407 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2088,9 +2088,9 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); - fprintf(stream, "xtc_p: %f # default: 0.5\n", sparams.xtc_p); - fprintf(stream, "xtc_t: %f # default: 0.1\n", sparams.xtc_t); - fprintf(stream, "xtc_t_max: %f # default: 1.0\n", sparams.xtc_t_max); + fprintf(stream, "xtc_probability: %f # default: 0.5\n", sparams.xtc_probability); + fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold); + fprintf(stream, "xtc_threshold_max: %f # default: 1.0\n", sparams.xtc_threshold_max); fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false"); diff --git a/common/common.h b/common/common.h index 1fb2a652c4028..8ef1339823bc5 100644 --- a/common/common.h +++ b/common/common.h @@ -109,9 +109,9 @@ struct gpt_sampler_params { int32_t top_k = 40; // <= 0 to use vocab size float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled - float xtc_p = 0.50f; // 0.0 = disabled - float xtc_t = 0.10f; // 1.0 = disabled - float xtc_t_max = 1.00f; // 0.0 = disabled + float xtc_probability = 0.00f; // 0.0 = disabled + float xtc_threshold = 0.10f; // 0.5 = disabled + float xtc_threshold_max = 1.00f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities @@ -134,6 +134,7 @@ struct gpt_sampler_params { GPT_SAMPLER_TYPE_TYPICAL_P, GPT_SAMPLER_TYPE_TOP_P, GPT_SAMPLER_TYPE_MIN_P, + GPT_SAMPLER_TYPE_XTC, GPT_SAMPLER_TYPE_TEMPERATURE }; diff --git a/common/sampling.cpp b/common/sampling.cpp index 0c35044e9b518..ac1b6c416a68c 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -130,10 +130,10 @@ std::string gpt_sampler_params::print() const { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_p = %.3f, xtc_t = %.3f, xtc_t_max = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, xtc_threshold_max = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - top_k, tfs_z, top_p, min_p, xtc_p, xtc_t, xtc_t_max, typ_p, temp, + top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, xtc_threshold_max, typ_p, temp, mirostat, mirostat_eta, mirostat_tau); return std::string(result); @@ -185,7 +185,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st llama_sampler_chain_add(result->chain, 
llama_sampler_init_min_p (params.min_p, params.min_keep)); break; case GPT_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_p, params.xtc_t, params.xtc_t_max, params.min_keep, params.seed)); + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.xtc_threshold_max, params.min_keep, params.seed)); break; case GPT_SAMPLER_TYPE_TFS_Z: llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); diff --git a/examples/main/README.md b/examples/main/README.md index 7d79ffa0cc6b6..84ebdb787ba6a 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -243,19 +243,19 @@ Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` ### XTC Sampling -- `--xtc-p N`: Sets the chance for token removal (checked once on sampler start) (default: 0.5). -- `--xtc-t N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). -- `--xtc-t-max N`: Sets a maximum probability threshold for tokens to be removed (highly experimental) (default: 1.0). +- `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0). +- `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). +- `--xtc-threshold-max N`: Sets a maximum probability threshold for tokens to be removed (highly experimental) (default: 1.0). -Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive answers. With a chance of `xtc-p` it searches for tokens with probabilities of `xtc-t` threshold and above, then removes all such tokens except the least probable one. +Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-p` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. -By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last top token XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. +By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. -The additional `xtc-t-max` parameter may help with finetuned models that already give relatively creative output, meaning that clichés and repetitive phrases may appear at lower probabilities. It allows removing tokens from a middle range which will always be specific to a model, requiring careful experimenting. +The additional `xtc-threshold-max` parameter may help with finetuned models that already give relatively creative output, meaning that clichés and repetitive phrases may appear at lower probabilities. It allows removing tokens from a middle range which will always be specific to a model, requiring careful experimenting. 
Leave `xtc-threshold-max` on default 1.0 for all base/instruct models. -Being experimental and unique, XTC is not included in the default sampling queue. You can start from a recommended combination of Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02`. +Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 -xtc-p 0.5`. -Example usage: `--xtc-p 0.5 --xtc-t 0.1 --xtc-t-max 1.0` +Example usage: `-xtc-p 0.5 -xtc-t 0.1 -xtc-t-max 1.0` ### Logit Bias diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index afecff08953b4..90740829544c4 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1081,7 +1081,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data auto * ctx = (llama_sampler_xtc *) smpl->ctx; if (ctx->probability <= 0.0f - || ctx->threshold >= 1.0f + || ctx->threshold > 0.5f || ctx->threshold_max <= 0.0f || ctx->threshold_max <= ctx->threshold || cur_p->size <= 2) { From f7a383ffb3583cfc0a7d0fc3202cbabf327a94f8 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Thu, 10 Oct 2024 21:48:49 +0500 Subject: [PATCH 20/30] Initial server support --- examples/server/server.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index aedfca0d6ea1c..cd4e635b9cbcc 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -891,6 +891,9 @@ struct server_context { slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); + slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); + slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold); + slot.sparams.xtc_threshold_max = json_value(data, "xtc_threshold_max", default_sparams.xtc_threshold_max); slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); @@ -1239,6 +1242,9 @@ struct server_context { {"top_k", slot.sparams.top_k}, {"top_p", slot.sparams.top_p}, {"min_p", slot.sparams.min_p}, + {"xtc_probability", slot.sparams.xtc_probability}, + {"xtc_threshold", slot.sparams.xtc_threshold}, + {"xtc_threshold_max", slot.sparams.xtc_threshold_max}, {"tfs_z", slot.sparams.tfs_z}, {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, From 72db625bd46da193c36a123fd60a01a2f19cd7a8 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Thu, 10 Oct 2024 22:59:23 +0500 Subject: [PATCH 21/30] Added XTC to server UIs --- examples/server/public/index-new.html | 9 +++++++++ examples/server/public/index.html | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index c87dd8f1e1d32..2826d6133b7b3 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -43,6 +43,9 @@ top_k: 0, // <= 0 to use vocab size top_p: 1.0, // 1.0 = disabled min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4 + xtc_probability: 0.0, // 0 = disabled; + xtc_threshold: 0.1, // 0.5 = disabled; + xtc_threshold_max: 1.0, // 0 = disabled; tfs_z: 1.0, // 1.0 = disabled 
typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -836,6 +839,9 @@ ${FloatField({ label: "TFS-Z", title: "Activates tail-free sampling, a method used to limit the prediction of tokens that are too frequent. The parameter z controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })} ${FloatField({ label: "Frequency Penalty", title: "A penalty that is applied based on the frequency with which certain tokens occur in the training data set. A higher value results in rare tokens being favoured.", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} + ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} + ${FloatField({ label: "XTC max threshold", title: "Sets a maximum probability threshold for tokens to be removed (highly experimental)", max: 1.0, min: 0.0, name: "xtc_threshold_max", step: 0.01, value: params.value.xtc_threshold_max })} ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })} @@ -1132,6 +1138,9 @@

llama.cpp

const snapSettings = { temperature: { snapValue: 1.0, snapRangeMultiplier: 6 }, min_p: { snapValue: 0.05, snapRangeMultiplier: 2 }, + xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 }, + xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 }, + xtc_threshold_max: { snapValue: 1.0, snapRangeMultiplier: 4 }, top_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 }, typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 07fec6a38bbcd..eaaa9f7334c97 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -307,6 +307,9 @@ top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled + xtc_probability: 0.0, // 0 = disabled; + xtc_threshold: 0.1, // 0.5 = disabled; + xtc_threshold_max: 1.0, // 0 = disabled; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -1013,6 +1016,9 @@ ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} + ${FloatField({ label: "xtc_probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "xtc_threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} + ${FloatField({ label: "xtc_threshold_max", max: 1.0, min: 0.0, name: "xtc_threshold_max", step: 0.01, value: params.value.xtc_threshold_max })}
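At this point in the series `llama_sampler_init_xtc` takes five arguments (probability, threshold, threshold_max, min_keep, seed), and the README hunks above recommend running Min-P before XTC. A minimal sketch of that wiring, assuming the llama.h sampler-chain API of this period (`llama_sampler_chain_init`, `llama_sampler_init_dist`, and `LLAMA_DEFAULT_SEED` come from llama.h, not from these patches):

```cpp
// Illustrative sketch, not part of the patches: Min-P followed by XTC,
// mirroring the README's recommended `--sampling-seq mx --min-p 0.02`.
#include "llama.h"

static struct llama_sampler * make_xtc_chain(void) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // Min-P first to trim the improbable tail of the distribution.
    llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.02f, 1));

    //                                            probability, threshold, threshold_max, min_keep, seed
    llama_sampler_chain_add(chain, llama_sampler_init_xtc(0.50f, 0.10f, 1.00f, 1, LLAMA_DEFAULT_SEED));

    // Final pick from whatever survives the chain.
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    return chain;
}
```

The order matters: Min-P prunes the unlikely tail first, so XTC's threshold scan only ever sees candidates that survived it.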
From 396836907125eb3be79d2778c79dc03e517051fe Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 11 Oct 2024 11:53:19 +0500 Subject: [PATCH 22/30] Fixed labels in old server UI --- examples/server/public/index.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/server/public/index.html b/examples/server/public/index.html index eaaa9f7334c97..ca1e70a135024 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -1016,9 +1016,9 @@ ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })} ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} - ${FloatField({ label: "xtc_probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} - ${FloatField({ label: "xtc_threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} - ${FloatField({ label: "xtc_threshold_max", max: 1.0, min: 0.0, name: "xtc_threshold_max", step: 0.01, value: params.value.xtc_threshold_max })} + ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} + ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} + ${FloatField({ label: "XTC upper threshold", max: 1.0, min: 0.0, name: "xtc_threshold_max", step: 0.01, value: params.value.xtc_threshold_max })}

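The next commit ("Made algorithm safer and more readable") switches `to_remove` from `size_t` to `int` and inverts the guard. The safety issue appears to be unsigned underflow; a standalone sketch of the hazard (my illustration, not code from the series):

```cpp
// Sketch of the underflow the next commit guards against: with an unsigned
// to_remove, an empty removal range wraps to a huge value, so a
// `to_remove < 1` check can never fire and the copy loop would run wild.
#include <cstddef>
#include <cstdio>

int main(void) {
    int pos_first = 0; // first token above threshold_max
    int pos_last  = 0; // last token above threshold

    size_t unsafe = (size_t)(pos_last - (1 + pos_first)); // -1 wraps to SIZE_MAX
    int    safe   = pos_last - (1 + pos_first);           // -1, caught by `> 0`

    printf("as size_t: %zu\nas int:    %d\n", unsafe, safe);
    return 0;
}
```

With a signed `to_remove`, the empty range shows up as -1 and the `to_remove > 0` condition correctly skips the removal.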
From acada1a5e734b5f40d9d9daa2cfd0a812d0300dd Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Fri, 11 Oct 2024 15:36:25 +0500 Subject: [PATCH 23/30] Made algorithm safer and more readable --- src/llama-sampling.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 90740829544c4..dd31ba9a55c55 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1107,15 +1107,18 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data } } - size_t to_remove = pos_last - (1 + pos_first); + int to_remove = pos_last - (1 + pos_first); - if (cur_p->size - to_remove < ctx->min_keep || to_remove < 1) return; + if (cur_p->size - to_remove >= ctx->min_keep && to_remove > 0) { - for (size_t i = pos_first + 1; i < cur_p->size - to_remove + 1; ++i) { - cur_p->data[i] = cur_p->data[i + to_remove]; - } + size_t last_idx = cur_p->size - to_remove; + + for (size_t i = pos_first + 1; i <= last_idx; ++i) { + cur_p->data[i] = cur_p->data[i + to_remove]; + } - cur_p->size = cur_p->size - to_remove; + cur_p->size = cur_p->size - to_remove; + } } static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { From 9c43a01c5de6ab34b403c82f24af837b31e40d40 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Sat, 12 Oct 2024 18:35:56 +0500 Subject: [PATCH 24/30] Removed xtc_threshold_max --- common/arg.cpp | 7 ------- common/common.cpp | 1 - common/common.h | 1 - common/sampling.cpp | 6 +++--- examples/main/README.md | 5 +---- examples/server/public/index-new.html | 3 --- examples/server/public/index.html | 2 -- examples/server/server.cpp | 2 -- include/llama.h | 2 +- src/llama-sampling.cpp | 26 ++++++++------------------ tests/test-sampling.cpp | 21 ++++++++------------- 11 files changed, 21 insertions(+), 55 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index d66ddb13f3efe..07092a7a4655b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -987,13 +987,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sparams.xtc_threshold = std::stof(value); } ).set_sparam()); - add_opt(common_arg( - {"-xtc-t-max", "--xtc-threshold-max"}, "N", - format("xtc upper threshold (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_threshold_max), - [](common_params & params, const std::string & value) { - params.sparams.xtc_threshold_max = std::stof(value); - } - ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), diff --git a/common/common.cpp b/common/common.cpp index 005094ed49517..cfb5444c146a0 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2090,7 +2090,6 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); fprintf(stream, "xtc_probability: %f # default: 0.5\n", sparams.xtc_probability); fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold); - fprintf(stream, "xtc_threshold_max: %f # default: 1.0\n", sparams.xtc_threshold_max); fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? 
"true" : "false"); diff --git a/common/common.h b/common/common.h index 8d0ef1ad013fb..758ffe1028732 100644 --- a/common/common.h +++ b/common/common.h @@ -112,7 +112,6 @@ struct common_sampler_params { float min_p = 0.05f; // 0.0 = disabled float xtc_probability = 0.00f; // 0.0 = disabled float xtc_threshold = 0.10f; // 0.5 = disabled - float xtc_threshold_max = 1.00f; // 0.0 = disabled float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities diff --git a/common/sampling.cpp b/common/sampling.cpp index 3673cc4938f34..fb95bcd3bf2b0 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -130,10 +130,10 @@ std::string common_sampler_params::print() const { snprintf(result, sizeof(result), "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n" - "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, xtc_threshold_max = %.3f, typical_p = %.3f, temp = %.3f\n" + "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, temp = %.3f\n" "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f", penalty_last_n, penalty_repeat, penalty_freq, penalty_present, - top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, xtc_threshold_max, typ_p, temp, + top_k, tfs_z, top_p, min_p, xtc_probability, xtc_threshold, typ_p, temp, mirostat, mirostat_eta, mirostat_tau); return std::string(result); @@ -185,7 +185,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.xtc_threshold_max, params.min_keep, params.seed)); + llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); break; case COMMON_SAMPLER_TYPE_TFS_Z: llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep)); diff --git a/examples/main/README.md b/examples/main/README.md index 84ebdb787ba6a..3f84bd853fc6f 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -245,17 +245,14 @@ Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` - `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0). - `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). -- `--xtc-threshold-max N`: Sets a maximum probability threshold for tokens to be removed (highly experimental) (default: 1.0). Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-p` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. 
-The additional `xtc-threshold-max` parameter may help with finetuned models that already give relatively creative output, meaning that clichés and repetitive phrases may appear at lower probabilities. It allows removing tokens from a middle range which will always be specific to a model, requiring careful experimentation. Leave `xtc-threshold-max` on default 1.0 for all base/instruct models. - Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 -xtc-p 0.5`. -Example usage: `-xtc-p 0.5 -xtc-t 0.1 -xtc-t-max 1.0` +Example usage: `-xtc-p 0.5 -xtc-t 0.1 ### Logit Bias diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index 2826d6133b7b3..ee7a10f40181b 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -45,7 +45,6 @@ min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4 xtc_probability: 0.0, // 0 = disabled; xtc_threshold: 0.1, // 0.5 = disabled; - xtc_threshold_max: 1.0, // 0 = disabled; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -841,7 +840,6 @@ ${FloatField({ label: "Typical-P", title: "Activates local typical sampling, a method used to limit the prediction of tokens that are atypical in the current context. The parameter p controls the strength of this limitation. A value of 1.0 means that this function is deactivated.", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })} ${FloatField({ label: "XTC probability", title: "Sets the chance for token removal (checked once on sampler start)", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} ${FloatField({ label: "XTC threshold", title: "Sets a minimum probability threshold for tokens to be removed", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} - ${FloatField({ label: "XTC max threshold", title: "Sets a maximum probability threshold for tokens to be removed (highly experimental)", max: 1.0, min: 0.0, name: "xtc_threshold_max", step: 0.01, value: params.value.xtc_threshold_max })} ${IntField({ label: "Min Keep", title: "If greater than 0, samplers are forced to return N possible tokens at minimum. Default is 0", max: 10, min: 0, name: "min_keep", value: params.value.min_keep })}
@@ -1140,7 +1138,6 @@

llama.cpp

min_p: { snapValue: 0.05, snapRangeMultiplier: 2 }, xtc_probability: { snapValue: 0.0, snapRangeMultiplier: 4 }, xtc_threshold: { snapValue: 0.5, snapRangeMultiplier: 4 }, - xtc_threshold_max: { snapValue: 1.0, snapRangeMultiplier: 4 }, top_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, tfs_z: { snapValue: 1.0, snapRangeMultiplier: 4 }, typical_p: { snapValue: 1.0, snapRangeMultiplier: 4 }, diff --git a/examples/server/public/index.html b/examples/server/public/index.html index ca1e70a135024..88e6af08edfe0 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -309,7 +309,6 @@ min_p: 0.05, // 0 = disabled xtc_probability: 0.0, // 0 = disabled; xtc_threshold: 0.1, // 0.5 = disabled; - xtc_threshold_max: 1.0, // 0 = disabled; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled @@ -1018,7 +1017,6 @@ ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })} ${FloatField({ label: "XTC probability", max: 1.0, min: 0.0, name: "xtc_probability", step: 0.01, value: params.value.xtc_probability })} ${FloatField({ label: "XTC threshold", max: 0.5, min: 0.0, name: "xtc_threshold", step: 0.01, value: params.value.xtc_threshold })} - ${FloatField({ label: "XTC upper threshold", max: 1.0, min: 0.0, name: "xtc_threshold_max", step: 0.01, value: params.value.xtc_threshold_max })}
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 71c90d62d873e..69505d69edb80 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -893,7 +893,6 @@ struct server_context { slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p); slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability); slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold); - slot.sparams.xtc_threshold_max = json_value(data, "xtc_threshold_max", default_sparams.xtc_threshold_max); slot.sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p); slot.sparams.temp = json_value(data, "temperature", default_sparams.temp); @@ -1244,7 +1243,6 @@ struct server_context { {"min_p", slot.sparams.min_p}, {"xtc_probability", slot.sparams.xtc_probability}, {"xtc_threshold", slot.sparams.xtc_threshold}, - {"xtc_threshold_max", slot.sparams.xtc_threshold_max}, {"tfs_z", slot.sparams.tfs_z}, {"typical_p", slot.sparams.typ_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, diff --git a/include/llama.h b/include/llama.h index 15003916070cf..3e8ba42f8f95f 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1095,7 +1095,7 @@ extern "C" { LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 - LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, float t_max, size_t min_keep, uint32_t seed); + LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed); /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. 
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index dd31ba9a55c55..225df5212ca3d 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1064,7 +1064,6 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa struct llama_sampler_xtc { const float probability; const float threshold; - const float threshold_max; const size_t min_keep; const uint32_t seed; @@ -1082,8 +1081,6 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data if (ctx->probability <= 0.0f || ctx->threshold > 0.5f - || ctx->threshold_max <= 0.0f - || ctx->threshold_max <= ctx->threshold || cur_p->size <= 2) { return; } @@ -1095,35 +1092,29 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data // in case it's not sorted/recalculated yet llama_sampler_softmax_impl(cur_p); - int pos_first = -1; int pos_last = 0; for (size_t i = 0; i < cur_p->size; ++i) { if (cur_p->data[i].p - ctx->threshold >= -1e-5) { - if (cur_p->data[i].p - ctx->threshold_max > 1e-3) pos_first = i; pos_last = i; - } else { - break; - } + } else break; } - int to_remove = pos_last - (1 + pos_first); - - if (cur_p->size - to_remove >= ctx->min_keep && to_remove > 0) { + if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { - size_t last_idx = cur_p->size - to_remove; + size_t last_idx = cur_p->size - pos_last; - for (size_t i = pos_first + 1; i <= last_idx; ++i) { - cur_p->data[i] = cur_p->data[i + to_remove]; + for (size_t i = 0; i <= last_idx; ++i) { + cur_p->data[i] = cur_p->data[i + pos_last]; } - cur_p->size = cur_p->size - to_remove; + cur_p->size = cur_p->size - pos_last; } } static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_xtc *) smpl->ctx; - auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->threshold_max, ctx->min_keep, ctx->seed); + auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed); // copy the state { @@ -1154,14 +1145,13 @@ static struct llama_sampler_i llama_sampler_xtc_i = { /* .free = */ llama_sampler_xtc_free, }; -struct llama_sampler * llama_sampler_init_xtc(float p, float t, float t_max, size_t min_keep, uint32_t seed) { +struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) { auto seed_cur = get_rng_seed(seed); return new llama_sampler { /* .iface = */ &llama_sampler_xtc_i, /* .ctx = */ new llama_sampler_xtc { /* .probability = */ p, /* .threshold = */ t, - /* .threshold_max = */ t_max, /* .min_keep = */ min_keep, /* .seed = */ seed, /* .seed_cur = */ seed_cur, diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 5716f7393ab70..c0c7e127d583e 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -111,7 +111,7 @@ static void test_min_p(const std::vector<float> & probs, const std::vector<float> -static void test_xtc(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p, float t, float t_max) { +static void test_xtc(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p, float t) { const size_t n_vocab = probs.size(); std::vector<llama_token_data> cur; @@ -124,7 +124,7 @@ static void test_xtc(const std::vector<float> & probs, const std::vector<float> llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; APPLY(llama_sampler_init_softmax(), &cur_p); DUMP(&cur_p); - APPLY(llama_sampler_init_xtc(p, t, t_max, 0, 0), &cur_p); + APPLY(llama_sampler_init_xtc(p, t, 0, 0), &cur_p); DUMP(&cur_p); GGML_ASSERT(cur_p.size == 
expected_probs.size()); @@ -306,7 +306,7 @@ static void test_perf() { BENCH(llama_sampler_init_min_p (0.2f, 1), data, 32); BENCH(llama_sampler_init_tail_free(0.5f, 1), data, 32); BENCH(llama_sampler_init_typical (0.5f, 1), data, 32); - BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 0.8f, 1, 1), data, 32); + BENCH(llama_sampler_init_xtc (1.0f, 0.1f, 1, 1), data, 32); BENCH(llama_sampler_init_softmax (), data, 32); } @@ -333,17 +333,12 @@ int main(void) { test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f); printf("XTC should:\n"); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.10f, 1.00f); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.1f}, 0.99f, 0.10f, 0.35f); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.20f, 1.00f); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.30f, 1.00f); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.1f}, 0.99f, 0.10f, 0.25f); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.2f, 0.1f}, 0.99f, 0.20f, 0.35f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.10f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.20f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.30f); + printf("XTC should not:\n"); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.10f, 0.15f); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.20f, 0.25f); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.30f, 0.35f); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.40f, 1.00f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.40f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); From cca842fbd3694db303063a17caee6720520c2495 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Sat, 12 Oct 2024 18:46:13 +0500 Subject: [PATCH 25/30] Fixed arg after update --- common/arg.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 97c353580c1c2..d7fd212541e2e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -949,14 +949,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"-xtc-p", "--xtc-probability"}, "N", - format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), + string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), [](common_params & params, const std::string & value) { params.sparams.xtc_probability = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"-xtc-t", "--xtc-threshold"}, "N", - format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), + string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), [](common_params & params, const std::string & value) { params.sparams.xtc_threshold = std::stof(value); } From 44bbd6337adb7b595379bcf69daba706383ccf87 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Mon, 14 Oct 2024 11:43:45 +0500 Subject: [PATCH 26/30] Quick fixes by comments --- common/arg.cpp | 4 ++-- common/common.cpp | 2 +- common/common.h | 2 +- examples/server/public/index-new.html | 2 +- examples/server/public/index.html | 2 +- src/llama-sampling.cpp | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index d7fd212541e2e..86f951aabadcc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp 
@@ -948,14 +948,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_sparam()); add_opt(common_arg( - {"-xtc-p", "--xtc-probability"}, "N", + {"--xtc-probability"}, "N", string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability), [](common_params & params, const std::string & value) { params.sparams.xtc_probability = std::stof(value); } ).set_sparam()); add_opt(common_arg( - {"-xtc-t", "--xtc-threshold"}, "N", + {"--xtc-threshold"}, "N", string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold), [](common_params & params, const std::string & value) { params.sparams.xtc_threshold = std::stof(value); diff --git a/common/common.cpp b/common/common.cpp index 873b2c69403b3..c08f01b429056 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2104,7 +2104,7 @@ void yaml_dump_non_result_info(FILE * stream, const common_params & params, cons fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p); - fprintf(stream, "xtc_probability: %f # default: 0.5\n", sparams.xtc_probability); + fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability); fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold); fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); diff --git a/common/common.h b/common/common.h index 932c3f47f3b92..3a378408ee3e7 100644 --- a/common/common.h +++ b/common/common.h @@ -111,7 +111,7 @@ struct common_sampler_params { float top_p = 0.95f; // 1.0 = disabled float min_p = 0.05f; // 0.0 = disabled float xtc_probability = 0.00f; // 0.0 = disabled - float xtc_threshold = 0.10f; // 0.5 = disabled + float xtc_threshold = 0.10f; // > 0.5 disables XTC float tfs_z = 1.00f; // 1.0 = disabled float typ_p = 1.00f; // typical_p, 1.0 = disabled float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index ee7a10f40181b..ad4183cd928f7 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -44,7 +44,7 @@ top_p: 1.0, // 1.0 = disabled min_p: 0.05, // 0 = disabled; recommended for non-english: ~ 0.4 xtc_probability: 0.0, // 0 = disabled; - xtc_threshold: 0.1, // 0.5 = disabled; + xtc_threshold: 0.1, // > 0.5 disables XTC; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 88e6af08edfe0..88065705fb669 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -308,7 +308,7 @@ top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled xtc_probability: 0.0, // 0 = disabled; - xtc_threshold: 0.1, // 0.5 = disabled; + xtc_threshold: 0.1, // > 0.5 disables XTC; tfs_z: 1.0, // 1.0 = disabled typical_p: 1.0, // 1.0 = disabled presence_penalty: 0.0, // 0.0 = disabled diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 225df5212ca3d..708d5e669835a 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1081,7 +1081,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data if (ctx->probability <= 0.0f || ctx->threshold > 
0.5f - || cur_p->size <= 2) { + || cur_p->size < 2) { return; } From 436a9919e3e18b11b11c8362109a0c9c972ffeb0 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Mon, 14 Oct 2024 16:10:13 +0500 Subject: [PATCH 27/30] Simplified algorithm since threshold_max is removed --- src/llama-sampling.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 708d5e669835a..57779a7398d88 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1101,13 +1101,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data } if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { - - size_t last_idx = cur_p->size - pos_last; - - for (size_t i = 0; i <= last_idx; ++i) { - cur_p->data[i] = cur_p->data[i + pos_last]; - } - + cur_p->data += pos_last; cur_p->size = cur_p->size - pos_last; } } From 3613a6d27b2670e3746b2db6eea7956eba08a5e8 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Mon, 14 Oct 2024 18:36:03 +0500 Subject: [PATCH 28/30] Renamed random distribution --- src/llama-sampling.cpp | 4 ++-- tests/test-sampling.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 57779a7398d88..2be5a1b7fcfcf 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1085,8 +1085,8 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data return; } - std::uniform_real_distribution<float> distance(0.0f, 1.0f); - float chance = distance(ctx->rng); + std::uniform_real_distribution<float> distribution(0.0f, 1.0f); + float chance = distribution(ctx->rng); if (chance > ctx->probability) return; // in case it's not sorted/recalculated yet llama_sampler_softmax_impl(cur_p); diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index c0c7e127d583e..0368aca9be8de 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -285,7 +285,7 @@ static void bench(llama_sampler * cnstr, const char * cnstr_name, const std::vec } const int64_t t_end = ggml_time_us(); llama_sampler_free(cnstr); - printf("%-47s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter); + printf("%-43s: %8.3f us/iter\n", cnstr_name, (t_end - t_start) / (float)n_iter); } #define BENCH(__cnstr, __data, __n_iter) bench((__cnstr), #__cnstr, (__data), (__n_iter)) From 2be814aa692c606930bcff58c5f1e6888be6d1ed Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 15 Oct 2024 09:46:04 +0500 Subject: [PATCH 29/30] Fixed tests and outdated README --- examples/main/README.md | 6 +++--- src/llama-sampling.cpp | 2 +- tests/test-sampling.cpp | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/main/README.md b/examples/main/README.md index 3f84bd853fc6f..4c16d554583dd 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -246,13 +246,13 @@ Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0` - `--xtc-probability N`: Sets the chance for token removal (checked once on sampler start) (default: 0.0). - `--xtc-threshold N`: Sets a minimum probability threshold for tokens to be removed (default: 0.1). -Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-p` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. 
+Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one. By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models. -Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 -xtc-p 0.5`. +Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`. -Example usage: `-xtc-p 0.5 -xtc-t 0.1 +Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1 ### Logit Bias diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 2be5a1b7fcfcf..0f1cab8b24908 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1095,7 +1095,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data int pos_last = 0; for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].p - ctx->threshold >= -1e-5) { + if (cur_p->data[i].p >= ctx->threshold) { pos_last = i; } else break; } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 0368aca9be8de..1372bdf13f2f6 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -333,12 +333,12 @@ int main(void) { test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f); printf("XTC should:\n"); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.10f); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.20f); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.30f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.1f}, 0.99f, 0.09f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.1f}, 0.99f, 0.19f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.3f, 0.2f, 0.1f}, 0.99f, 0.29f); printf("XTC should not:\n"); - test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.40f); + test_xtc({0.4f, 0.3f, 0.2f, 0.1f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0.99f, 0.39f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); From 3496f584cc2a23f2301c76d0f9030112320d7c89 Mon Sep 17 00:00:00 2001 From: MaggotHATE Date: Tue, 15 Oct 2024 11:23:11 +0500 Subject: [PATCH 30/30] Small fixes --- examples/main/README.md | 2 +- src/llama-sampling.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/main/README.md b/examples/main/README.md index 4c16d554583dd..620934dad4ad5 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -252,7 +252,7 @@ By removing top tokens XTC can improve the variety of answers, break writing cli Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`. 
-Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1 +Example usage: `--xtc-probability 0.5 --xtc-threshold 0.1` ### Logit Bias diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 0f1cab8b24908..67a78c3ac4fe8 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1102,7 +1102,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { cur_p->data += pos_last; - cur_p->size = cur_p->size - pos_last; + cur_p->size -= pos_last; } }
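After the last commit, the whole XTC step reduces to: softmax, one RNG roll against `probability`, a scan for the last token at or above `threshold`, then dropping every candidate before it. A self-contained paraphrase of the final `llama_sample_xtc_apply` (a sketch with simplified stand-in types, not the actual llama.cpp structs):

```cpp
// Standalone paraphrase of the final XTC step; token_data stands in for
// llama_token_data, and probs stands in for the candidate array.
#include <random>
#include <vector>

struct token_data { int id; float p; };

// probs must already be sorted by descending p (the softmax step in the
// real sampler guarantees this before the scan).
static void xtc_apply(std::vector<token_data> & probs, float probability,
                      float threshold, size_t min_keep, std::mt19937 & rng) {
    // a threshold above 0.5 can never leave two qualifying tokens, so it
    // effectively disables XTC
    if (probability <= 0.0f || threshold > 0.5f || probs.size() < 2) {
        return;
    }

    std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
    if (distribution(rng) > probability) {
        return; // rolled once per sampling step
    }

    // index of the last (least probable) token still at or above the threshold
    size_t pos_last = 0;
    for (size_t i = 0; i < probs.size(); ++i) {
        if (probs[i].p >= threshold) {
            pos_last = i;
        } else {
            break;
        }
    }

    // drop every above-threshold token except that last one
    if (probs.size() - pos_last >= min_keep && pos_last > 0) {
        probs.erase(probs.begin(), probs.begin() + pos_last);
    }
}
```

Where this sketch erases a vector prefix, the patched sampler simply advances `cur_p->data` by `pos_last` and shrinks `cur_p->size`, dropping the head without copying any candidates.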