From 572960a04563b27b3b69cca661cc6cffba9dd4c5 Mon Sep 17 00:00:00 2001
From: Kyle Mistele
Date: Tue, 23 Apr 2024 22:25:46 -0500
Subject: [PATCH 1/4] fix: revert showing control tokens by default

---
 common/common.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 06f252ea6914b..1a66f219b7f61 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2328,10 +2328,10 @@ std::vector<llama_token> llama_tokenize(
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);

From 206c974eb67ca9ee800ad265c4c9abcda632c5b1 Mon Sep 17 00:00:00 2001
From: Kyle Mistele
Date: Tue, 23 Apr 2024 22:50:22 -0500
Subject: [PATCH 2/4] feat: revert changes to default behavior of
 llama_token_to_piece; provide overridden declaration to receive "bool
 special" param to toggle showing control tokens

---
 common/common.cpp | 19 +++++++++++++++++--
 common/common.h   |  9 ++++++++-
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 1a66f219b7f61..35c91792b45a5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2328,10 +2328,25 @@ std::vector<llama_token> llama_tokenize(
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
+// duplicate with ability to specify whether to use special token
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
diff --git a/common/common.h b/common/common.h
index cca44268e6df5..9dbf9c362d668 100644
--- a/common/common.h
+++ b/common/common.h
@@ -241,7 +241,14 @@ std::vector<llama_token> llama_tokenize(
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
         const struct llama_context * ctx,
-        llama_token token);
+        llama_token token
+);
+
+std::string llama_token_to_piece(
+        const struct llama_context * ctx,
+        llama_token token,
+        bool special
+);
 
 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 // that takes into account the tokenizer type and decides how to handle the leading space

From 6c081e501caa281f9de6780e025896c4508b5739 Mon Sep 17 00:00:00 2001
From: Kyle Mistele
Date: Tue, 23 Apr 2024 22:51:01 -0500
Subject: [PATCH 3/4] feat: use the overridden declaration of
 llama_token_to_piece from common/common.cpp to specify "false" so that
 control tokens are not shown in chat completion responses

---
 examples/server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 25bc296396772..ced8f584f41a1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1118,7 +1118,7 @@ struct server_context {
 
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
         slot.sampled = result.tok;
 
         // search stop word and delete it

From 411137a60878a782e13cde666948b2e25adf225a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 24 Apr 2024 13:14:14 +0300
Subject: [PATCH 4/4] common : simplify

---
 common/common.cpp | 15 ---------------
 common/common.h   | 10 ++--------
 2 files changed, 2 insertions(+), 23 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 35c91792b45a5..3c6f98f17bac3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2326,21 +2326,6 @@ std::vector<llama_token> llama_tokenize(
     return result;
 }
 
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), true);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
-// duplicate with ability to specify whether to use special token
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
diff --git a/common/common.h b/common/common.h
index 9dbf9c362d668..157b54a3e9e08 100644
--- a/common/common.h
+++ b/common/common.h
@@ -237,18 +237,12 @@ std::vector<llama_token> llama_tokenize(
         bool add_special,
         bool parse_special = false);
 
-// tokenizes a token into a piece
+// tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
-        const struct llama_context * ctx,
-        llama_token token
-);
-
 std::string llama_token_to_piece(
         const struct llama_context * ctx,
         llama_token token,
-        bool special
-);
+        bool special = true);
 
 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 // that takes into account the tokenizer type and decides how to handle the leading space
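
Net effect of the series: common/common.cpp ends up with a single llama_token_to_piece wrapper whose `special` parameter defaults to `true`, so existing call sites keep rendering special/control tokens, while the server's `process_token` passes `false` to keep control tokens out of chat completion responses. A minimal sketch of a call site against the post-patch API follows; the helper function and the ctx/tok values are illustrative placeholders, not part of the patches.

// Hypothetical illustration of the final API from common/common.h
// (assumes llama.cpp's "common.h" and a valid context/token; error handling omitted).
#include "common.h"

#include <cstdio>
#include <string>

void print_piece(const llama_context * ctx, llama_token tok) {
    // special defaults to true: control tokens (e.g. end-of-turn markers) are rendered
    const std::string raw = llama_token_to_piece(ctx, tok);

    // explicit false, as server.cpp's process_token does: control tokens are suppressed
    const std::string filtered = llama_token_to_piece(ctx, tok, false);

    printf("raw: '%s' | filtered: '%s'\n", raw.c_str(), filtered.c_str());
}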