diff --git a/common/common.cpp b/common/common.cpp
index e624fc7f35352..ffc98137dbab8 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -905,6 +905,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive_specials = true;
         return true;
     }
+    if (arg == "--no-special") {
+        params.no_special = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1434,6 +1438,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -i, --interactive     run in interactive mode\n");
     printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
+    printf("  --no-special          disable control token output\n");
     printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
     printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
diff --git a/common/common.h b/common/common.h
index 566490e2f881a..87e771ca10a02 100644
--- a/common/common.h
+++ b/common/common.h
@@ -142,6 +142,7 @@ struct gpt_params {
     bool use_color            = false; // use color to distinguish generations and inputs
     bool interactive          = false; // interactive mode
     bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+    bool no_special           = false; // disable control token output
     bool conversation         = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml               = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all     = false; // save user input and generations to prompt cache
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 8153a71fb5791..fc54861811184 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -18,6 +18,8 @@
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #include <signal.h>
 #include <unistd.h>
+#include <fcntl.h>
+#define SPECIAL_FILENO 3
 #elif defined (_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #ifndef NOMINMAX
@@ -118,6 +120,16 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
 }
 
 int main(int argc, char ** argv) {
+#ifndef _MSC_VER
+    // Check if we have an external attachment to a file descriptor for out of band control tokens (e.g. bash `3>/dev/null`)
+    // Placed here to avoid file descriptor being polluted by gpt_params_parse() opening files
+    const bool control_token_file_descriptor_is_attached = fcntl(SPECIAL_FILENO, F_GETFL) != -1;
+    if (!control_token_file_descriptor_is_attached) {
+        // Duplicate stdout file descriptor to control token file descriptor to merge the two streams
+        dup2(STDOUT_FILENO, SPECIAL_FILENO);
+    }
+#endif
+
     gpt_params params;
     g_params = &params;
 
@@ -126,6 +138,8 @@ int main(int argc, char ** argv) {
     }
     llama_sampling_params & sparams = params.sparams;
 
+    const bool control_token_allowed_on_standard_stream = !params.conversation && sparams.grammar.empty();
+
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("main", "log"));
     LOG_TEE("Log start\n");
@@ -528,8 +542,6 @@ int main(int argc, char ** argv) {
         exit(1);
     }
 
-    bool should_show_special_tokens = sparams.grammar.empty();
-
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (!embd.empty()) {
@@ -742,18 +754,39 @@
         // display text
         if (input_echo && display) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation && should_show_special_tokens);
-                printf("%s", token_str.c_str());
-
+                const std::string token_str = llama_token_to_piece(ctx, id);
+
+                // Console/Stream Output
+                if (!llama_token_is_control_token(llama_get_model(ctx), id)) {
+                    // Stream Output Token To Standard Output
+                    fprintf(stdout, "%s", token_str.c_str());
+                } else if (!params.no_special) {
+#ifndef _MSC_VER
+                    if (control_token_file_descriptor_is_attached) {
+                        // Stream Control Token To Special Token Output. Useful for debugging control token behaviour
+                        (void)! write(SPECIAL_FILENO, token_str.c_str(), token_str.length());
+                    } else
+#endif
+                    if (control_token_allowed_on_standard_stream)
+                    {
+                        // Stream Control Token To Standard Output Stream
+                        fprintf(stdout, "%s", token_str.c_str());
+                    }
+                }
+                // Record Displayed Tokens To Log
+                // Note: Generated tokens are created one by one hence this check
                 if (embd.size() > 1) {
+                    // Incoming Requested Tokens
                     input_tokens.push_back(id);
                 } else {
+                    // Outgoing Generated Tokens
                     output_tokens.push_back(id);
                     output_ss << token_str;
                 }
+                fflush(stdout);
             }
-            fflush(stdout);
         }
+
         // reset color to default if there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
             console::set_display(console::reset);
@@ -908,7 +941,7 @@ int main(int argc, char ** argv) {
                 for (size_t i = original_size; i < embd_inp.size(); ++i) {
                     const llama_token token = embd_inp[i];
                     output_tokens.push_back(token);
-                    output_ss << llama_token_to_piece(ctx, token, should_show_special_tokens);
+                    output_ss << llama_token_to_piece(ctx, token);
                 }
 
                 n_remain -= line_inp.size();
diff --git a/llama.cpp b/llama.cpp
index b752ddc6b401f..f41c6e5b68192 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -17634,6 +17634,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
     );
 }
 
+bool llama_token_is_control_token(const struct llama_model * model, llama_token token) {
+    return llama_is_control_token(model->vocab, token);
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
diff --git a/llama.h b/llama.h
index 612e32c4ea058..7cacb3d645a40 100644
--- a/llama.h
+++ b/llama.h
@@ -816,6 +816,9 @@ extern "C" {
     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
     LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
 
+    // Identify if Token Id is a control token or a render-able token
+    LLAMA_API bool llama_token_is_control_token(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
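
For reference, below is a minimal sketch (not part of the patch) of how a downstream caller might use the new llama_token_is_control_token() API together with the out-of-band descriptor idea from examples/main/main.cpp. The helper name emit_tokens, the no_special parameter, and the hard-coded descriptor 3 are illustrative assumptions; it presumes the patched llama.h and common.h are on the include path and that the caller already has a valid llama_context.

// Sketch only: route generated pieces to stdout or to an out-of-band descriptor,
// mirroring the stream-splitting logic added in examples/main/main.cpp above.
#include <unistd.h>

#include <string>
#include <vector>

#include "common.h"
#include "llama.h"

static void emit_tokens(llama_context * ctx, const std::vector<llama_token> & tokens, bool no_special) {
    const llama_model * model = llama_get_model(ctx);
    for (const llama_token id : tokens) {
        const std::string piece = llama_token_to_piece(ctx, id);
        if (!llama_token_is_control_token(model, id)) {
            // plain text always goes to standard output
            (void)! write(STDOUT_FILENO, piece.c_str(), piece.size());
        } else if (!no_special) {
            // control tokens go to descriptor 3, e.g. `3>control.txt` in bash
            (void)! write(3, piece.c_str(), piece.size());
        }
    }
}

With the patch applied, the same separation is available from the shell when running the main example: attaching descriptor 3 (for example `3>control.txt` or `3>/dev/null`) diverts control tokens away from standard output, while --no-special suppresses control token output entirely.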