Skip to content

main: use separate stream for control characters #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -905,6 +905,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.interactive_specials = true;
return true;
}
// --no-special: suppress emission of control/special tokens to the output stream
// (flag is consumed later in examples/main/main.cpp when deciding where to print a token)
if (arg == "--no-special") {
params.no_special = true;
return true;
}
if (arg == "--embedding") {
params.embedding = true;
return true;
Expand Down Expand Up @@ -1434,6 +1438,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -i, --interactive run in interactive mode\n");
printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" --no-special control tokens output disabled\n");
printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ struct gpt_params {
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool no_special = false; // disable control token output
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
Expand Down
47 changes: 40 additions & 7 deletions examples/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#include <fcntl.h>
#define SPECIAL_FILENO 3
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
Expand Down Expand Up @@ -118,6 +120,16 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
}

int main(int argc, char ** argv) {
#ifndef _MSC_VER
// Check if we have an external attachment to a file descriptor for out of band control tokens (e.g. bash `3>/dev/null` )
// Placed here to avoid file descriptor being polluted by gpt_params_parse() opening files
const bool control_token_file_descriptor_is_attached = fcntl(SPECIAL_FILENO, F_GETFL) != -1;
if (!control_token_file_descriptor_is_attached) {
// Duplicate stdout file descriptor to control token file descriptor to merge the two streams
dup2(STDOUT_FILENO, SPECIAL_FILENO);
}
#endif

gpt_params params;
g_params = &params;

Expand All @@ -126,6 +138,8 @@ int main(int argc, char ** argv) {
}
llama_sampling_params & sparams = params.sparams;

const bool control_token_allowed_on_standard_stream = !params.conversation && sparams.grammar.empty();

#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
Expand Down Expand Up @@ -528,8 +542,6 @@ int main(int argc, char ** argv) {
exit(1);
}

bool should_show_special_tokens = sparams.grammar.empty();

while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (!embd.empty()) {
Expand Down Expand Up @@ -742,18 +754,39 @@ int main(int argc, char ** argv) {
// display text
if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation && should_show_special_tokens);
printf("%s", token_str.c_str());

const std::string token_str = llama_token_to_piece(ctx, id);

// Console/Stream Output
if (!llama_token_is_control_token(llama_get_model(ctx), id)) {
// Stream Output Token To Standard Output
fprintf(stdout, "%s", token_str.c_str());
} else if (!params.no_special) {
#ifndef _MSC_VER
if (control_token_file_descriptor_is_attached) {
// Stream Control Token To Special Token Output. Useful for debugging control token behaviour
(void)! write(SPECIAL_FILENO, token_str.c_str(), token_str.length());

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extraneous ! here?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also I'm not sure why you'd have (void) before write but not fprintf

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

intentional or it breaks make command

(void) write(SPECIAL_FILENO, token_str.c_str(), token_str.length());
examples/main/main.cpp: In function ‘int main(int, char**)’:
examples/main/main.cpp:767:37: warning: ignoring return value of ‘ssize_t write(int, const void*, size_t)’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
  767 |                         (void) write(SPECIAL_FILENO, token_str.c_str(), token_str.length());
      |                                ~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, apparently gcc and glibc have collaborated to do an extremely dumb thing. fine, keep it.

Copy link
Author

@mofosyne mofosyne May 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Context: https://bugs.llvm.org/show_bug.cgi?id=51228
Context: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425

A possible approach according to @mrdomino

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-result"
write(...);
#pragma GCC diagnostic pop

} else
#endif
if (control_token_allowed_on_standard_stream)
{
// Stream Control Token To Standard Output Stream
fprintf(stdout, "%s", token_str.c_str());

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

stdout is buffered, so if you are going to be using it, you will need to make sure fflush(stdout) is called before every write to the token fd. Otherwise the tokens will not interleave correctly.

You could also switch to calling write on fd 1 and make the output fully unbuffered, but I think this would be suboptimal.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added fflush(stdout) before every fprintf() now

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why wouldn't you flush after each print?

Copy link

@mrdomino mrdomino May 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jart my theory was that you'd want to take advantage of the buffering. If you don't care about buffering output then absolutely, or even setbuf(stdout, NULL) at the start of the program.

ETA: the other thing is that if you have anything else doing anything to stdout it can mess with the interleaving... it just seemed simpler to me to constrain it to where you were doing the token write

}
}
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) {
// Incoming Requested Tokens
input_tokens.push_back(id);
} else {
// Outgoing Generated Tokens
output_tokens.push_back(id);
output_ss << token_str;
}
fflush(stdout);
}
fflush(stdout);
}

// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
Expand Down Expand Up @@ -908,7 +941,7 @@ int main(int argc, char ** argv) {
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
output_ss << llama_token_to_piece(ctx, token, should_show_special_tokens);
output_ss << llama_token_to_piece(ctx, token);
}

n_remain -= line_inp.size();
Expand Down
4 changes: 4 additions & 0 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17634,6 +17634,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
);
}

// Public C API wrapper: reports whether `token` is a control (non-renderable)
// token in the given model's vocabulary. Thin forwarder over the internal
// vocab-level helper so callers outside llama.cpp never touch `model->vocab`.
bool llama_token_is_control_token(const struct llama_model * model, llama_token token) {
    const auto & vocab = model->vocab;
    return llama_is_control_token(vocab, token);
}

llama_token llama_token_bos(const struct llama_model * model) {
return model->vocab.special_bos_id;
}
Expand Down
3 changes: 3 additions & 0 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,9 @@ extern "C" {
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);

// Identify if Token Id is a control token or a render-able token
LLAMA_API bool llama_token_is_control_token(const struct llama_model * model, llama_token token);

// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
Expand Down
Loading