From 572960a04563b27b3b69cca661cc6cffba9dd4c5 Mon Sep 17 00:00:00 2001
From: Kyle Mistele
Date: Tue, 23 Apr 2024 22:25:46 -0500
Subject: [PATCH 1/4] fix: revert showing control tokens by default

---
 common/common.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 06f252ea6914b..1a66f219b7f61 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2328,10 +2328,10 @@ std::vector<llama_token> llama_tokenize(
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);

From 206c974eb67ca9ee800ad265c4c9abcda632c5b1 Mon Sep 17 00:00:00 2001
From: Kyle Mistele
Date: Tue, 23 Apr 2024 22:50:22 -0500
Subject: [PATCH 2/4] feat: revert changes to default behavior of
 llama_token_to_piece; provide overridden declaration to receive "bool
 special" param to toggle showing control tokens

---
 common/common.cpp | 19 +++++++++++++++++--
 common/common.h   |  9 ++++++++-
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 1a66f219b7f61..35c91792b45a5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2328,10 +2328,25 @@ std::vector<llama_token> llama_tokenize(
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
+// duplicate with ability to specify whether to use special token
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
diff --git a/common/common.h b/common/common.h
index cca44268e6df5..9dbf9c362d668 100644
--- a/common/common.h
+++ b/common/common.h
@@ -241,7 +241,14 @@ std::vector<llama_token> llama_tokenize(
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
         const struct llama_context * ctx,
-        llama_token token);
+        llama_token token
+);
+
+std::string llama_token_to_piece(
+        const struct llama_context * ctx,
+        llama_token token,
+        bool special
+);
 
 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 // that takes into account the tokenizer type and decides how to handle the leading space

From 6c081e501caa281f9de6780e025896c4508b5739 Mon Sep 17 00:00:00 2001
From: Kyle Mistele
Date: Tue, 23 Apr 2024 22:51:01 -0500
Subject: [PATCH 3/4] feat: use the overridden declaration of
 llama_token_to_piece from common/common.cpp to specify "false" so that
 control tokens are not shown in chat completion responses

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 25bc296396772..ced8f584f41a1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1118,7 +1118,7 @@ struct server_context {
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
         slot.sampled = result.tok;
 
         // search stop word and delete it

From 411137a60878a782e13cde666948b2e25adf225a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 24 Apr 2024 13:14:14 +0300
Subject: [PATCH 4/4] common : simplify

---
 common/common.cpp | 15 ---------------
 common/common.h   | 10 ++--------
 2 files changed, 2 insertions(+), 23 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 35c91792b45a5..3c6f98f17bac3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2326,21 +2326,6 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
-// duplicate with ability to specify whether to use special token
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
diff --git a/common/common.h b/common/common.h
index 9dbf9c362d668..157b54a3e9e08 100644
--- a/common/common.h
+++ b/common/common.h
@@ -237,18 +237,12 @@ std::vector<llama_token> llama_tokenize(
         bool add_special,
         bool parse_special = false);
 
-// tokenizes a token into a piece
+// tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
-        const struct llama_context * ctx,
-        llama_token token
-);
-
 std::string llama_token_to_piece(
         const struct llama_context * ctx,
         llama_token token,
-        bool special
-);
+        bool special = true);
 
 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 // that takes into account the tokenizer type and decides how to handle the leading space
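
Net effect of the series: common/common.cpp ends up with a single llama_token_to_piece wrapper whose `special` parameter defaults to `true`, so existing call sites keep rendering special/control tokens, while the server's `process_token` passes `false` to keep control tokens out of chat completion responses. A minimal sketch of a call site against the post-patch API follows; the helper function and the ctx/tok values are illustrative placeholders, not part of the patches.

// Hypothetical illustration of the final API from common/common.h
// (assumes llama.cpp's "common.h" and a valid context/token; error handling omitted).
#include "common.h"

#include <cstdio>
#include <string>

void print_piece(const llama_context * ctx, llama_token tok) {
    // special defaults to true: control tokens (e.g. end-of-turn markers) are rendered
    const std::string raw = llama_token_to_piece(ctx, tok);

    // explicit false, as server.cpp's process_token does: control tokens are suppressed
    const std::string filtered = llama_token_to_piece(ctx, tok, false);

    printf("raw: '%s' | filtered: '%s'\n", raw.c_str(), filtered.c_str());
}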