Skip to content

main: use separate stream for control characters #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -905,6 +905,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.interactive_specials = true;
return true;
}
// --no-special: suppress emission of control/special tokens to the output stream
// (flag is consumed later in examples/main/main.cpp when deciding where to print a token)
if (arg == "--no-special") {
params.no_special = true;
return true;
}
if (arg == "--embedding") {
params.embedding = true;
return true;
Expand Down Expand Up @@ -1434,6 +1438,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -i, --interactive run in interactive mode\n");
printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" --no-special control tokens output disabled\n");
printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
Expand Down
1 change: 1 addition & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ struct gpt_params {
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool no_special = false; // disable control token output
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
Expand Down
47 changes: 40 additions & 7 deletions examples/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#include <fcntl.h>
#define SPECIAL_FILENO 3
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
Expand Down Expand Up @@ -118,6 +120,16 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
}

int main(int argc, char ** argv) {
#ifndef _MSC_VER
// Check if we have an external attachment to a file descriptor for out of band control tokens (e.g. bash `3>/dev/null` )
// Placed here to avoid file descriptor being polluted by gpt_params_parse() opening files
const bool control_token_file_descriptor_is_attached = fcntl(SPECIAL_FILENO, F_GETFL) != -1;
if (!control_token_file_descriptor_is_attached) {
// Duplicate stdout file descriptor to control token file descriptor to merge the two streams
dup2(STDOUT_FILENO, SPECIAL_FILENO);
}
#endif

gpt_params params;
g_params = &params;

Expand All @@ -126,6 +138,8 @@ int main(int argc, char ** argv) {
}
llama_sampling_params & sparams = params.sparams;

const bool control_token_allowed_on_standard_stream = !params.conversation && sparams.grammar.empty();

#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
LOG_TEE("Log start\n");
Expand Down Expand Up @@ -528,8 +542,6 @@ int main(int argc, char ** argv) {
exit(1);
}

bool should_show_special_tokens = sparams.grammar.empty();

while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (!embd.empty()) {
Expand Down Expand Up @@ -742,18 +754,39 @@ int main(int argc, char ** argv) {
// display text
if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation && should_show_special_tokens);
printf("%s", token_str.c_str());

const std::string token_str = llama_token_to_piece(ctx, id);

// Console/Stream Output
if (!llama_token_is_control_token(llama_get_model(ctx), id)) {
// Stream Output Token To Standard Output
fprintf(stdout, "%s", token_str.c_str());
} else if (!params.no_special) {
#ifndef _MSC_VER
if (control_token_file_descriptor_is_attached) {
// Stream Control Token To Special Token Output. Useful for debugging control token behaviour
(void)! write(SPECIAL_FILENO, token_str.c_str(), token_str.length());

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extraneous ! here?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also I'm not sure why you'd have (void) before write but not fprintf

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

intentional or it breaks make command

(void) write(SPECIAL_FILENO, token_str.c_str(), token_str.length());
examples/main/main.cpp: In function ‘int main(int, char**)’:
examples/main/main.cpp:767:37: warning: ignoring return value of ‘ssize_t write(int, const void*, size_t)’ declared with attribute ‘warn_unused_result’ [-Wunused-result]
  767 |                         (void) write(SPECIAL_FILENO, token_str.c_str(), token_str.length());
      |                                ~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, apparently gcc and glibc have collaborated to do an extremely dumb thing. fine, keep it.

Copy link
Author

@mofosyne mofosyne May 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Context: https://bugs.llvm.org/show_bug.cgi?id=51228
Context: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425

A possible approach according to @mrdomino

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-result"
write(...);
#pragma GCC diagnostic pop

} else
#endif
if (control_token_allowed_on_standard_stream)
{
// Stream Control Token To Standard Output Stream
fprintf(stdout, "%s", token_str.c_str());

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

stdout is buffered, so if you are going to be using it, you will need to make sure fflush(stdout) is called before every write to the token fd. Otherwise the tokens will not interleave correctly.

You could also switch to calling write on fd 1 and make the output fully unbuffered, but I think this would be suboptimal.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added fflush(stdout) before every fprintf() now

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why wouldn't you flush after each print?

Copy link

@mrdomino mrdomino May 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jart my theory was that you'd want to take advantage of the buffering. If you don't care about buffering output then absolutely, or even setbuf(stdout, NULL) at the start of the program.

ETA: the other thing is that if you have anything else doing anything to stdout it can mess with the interleaving... it just seemed simpler to me to constrain it to where you were doing the token write

}
}
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) {
// Incoming Requested Tokens
input_tokens.push_back(id);
} else {
// Outgoing Generated Tokens
output_tokens.push_back(id);
output_ss << token_str;
}
fflush(stdout);
}
fflush(stdout);
}

// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
Expand Down Expand Up @@ -908,7 +941,7 @@ int main(int argc, char ** argv) {
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
output_ss << llama_token_to_piece(ctx, token, should_show_special_tokens);
output_ss << llama_token_to_piece(ctx, token);
}

n_remain -= line_inp.size();
Expand Down
4 changes: 4 additions & 0 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17634,6 +17634,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
);
}

// Public C API wrapper: reports whether `token` is a control (non-renderable)
// token in the given model's vocabulary. Thin forwarder over the internal
// vocab-level helper so callers outside llama.cpp never touch `model->vocab`.
bool llama_token_is_control_token(const struct llama_model * model, llama_token token) {
    const auto & vocab = model->vocab;
    return llama_is_control_token(vocab, token);
}

llama_token llama_token_bos(const struct llama_model * model) {
return model->vocab.special_bos_id;
}
Expand Down
3 changes: 3 additions & 0 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,9 @@ extern "C" {
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);

// Identify if Token Id is a control token or a render-able token
LLAMA_API bool llama_token_is_control_token(const struct llama_model * model, llama_token token);

// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
Expand Down
Loading