@@ -76,21 +76,25 @@ int main(int argc, char ** argv) {
     // params.prompt = R"(// this function checks if the number n is prime
     // bool is_prime(int n) {)";
 
-    int64_t t_load_us = 0;
-
     // load the model
-    llama_context* ctx_ptr = llama_init_from_params(params);
+    llama_context* ctx_ptr = nullptr;
+    {
+        ctx_ptr = llama_init_from_params(params);
+        if (!ctx_ptr) {
+            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
     llama_context & ctx = *ctx_ptr;
-    gpt_vocab & vocab = llama_context_get_vocab(ctx);
-
-    // print system information
-    llama_print_context_info(ctx);
+    const gpt_vocab & vocab = llama_context_get_vocab(ctx);
 
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
 
     // tokenize the reverse prompt
-    std::vector<gpt_vocab::id> antiprompt_inp = llama_tokenize_text(ctx, params.prompt);
+    std::vector<gpt_vocab::id> antiprompt_inp = llama_tokenize_text(ctx, params.antiprompt);
+
 
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
@@ -126,8 +130,6 @@ int main(int argc, char ** argv) {
                "   - If you want to submit another line, end your input in '\\'.\n");
     }
 
-    bool input_noecho = false;
-
     // prompt user immediately after the starting prompt has been loaded
     if (params.interactive_start) {
         is_interacting = true;
@@ -138,39 +140,44 @@ int main(int argc, char ** argv) {
         printf(ANSI_COLOR_YELLOW);
     }
 
-    if (!llama_ingest_input(ctx, params.prompt))
+    // Prepare the context with input
+    // Send "beginning of string"
+    llama_add_bos(ctx);
+
+    // load the input
+    llama_update_input(ctx, params.prompt);
+
+    llama_print_startup_stats(ctx);
+
+    if (!llama_prepare_context(ctx))
     {
-        fprintf(stderr, "Failed to ingest prompt\n");
+        fprintf(stderr, "%s: failed to prepare context\n", __func__);
         return 1;
-    };
-
-    // display text
-    input_noecho = false;
-    const std::vector<gpt_vocab::id>& embd = llama_context_get_embedding(ctx);
-    if (!input_noecho) {
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
     }
 
-    if (!input_noecho && params.use_color) {
-        printf(ANSI_COLOR_RESET);
-    }
-
-    const std::vector<gpt_vocab::id>& last_n_tokens = llama_context_get_last_n_tokens(ctx);
-
-    while (llama_context_is_finished(ctx) != true) {
-        gpt_vocab::id model_output = 0;
-        bool response = llama_infer(ctx, model_output);
-        if (response) {
-            printf("%s", vocab.id_to_token[model_output].c_str());
-            fflush(stdout);
+    bool input_noecho = false;
+    bool is_end_of_text = false;
+    while (llama_context_is_finished(ctx) == false) {
+        std::string model_output{};
+
+        if (llama_has_unconsumed_input(ctx)) {
+            llama_ingest_all_pending_input(ctx, !input_noecho);
+            // reset color to default if there is no pending user input
+            if (!input_noecho && params.use_color) {
+                printf(ANSI_COLOR_RESET);
+            }
+        } else {
+            // Run inference if we don't have any pending input
+            llama_infer(ctx, model_output, is_end_of_text);
+            // print the single token output
+            printf("%s", model_output.c_str());
+            input_noecho = false;
         }
 
         // in interactive mode, and not currently processing queued inputs;
         // check if we should prompt the user for more
-        if (params.interactive) {
+        if (params.interactive && !llama_has_unconsumed_input(ctx)) {
+            const std::vector<gpt_vocab::id>& last_n_tokens = llama_context_get_last_n_tokens(ctx);
             // check for reverse prompt
             if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) {
                 // reverse prompt found
@@ -200,32 +207,39 @@ int main(int argc, char ** argv) {
                         buf[n_read] = '\n';
                         buf[n_read+1] = 0;
                     }
+
                     // Do not clear existing context in interactive mode
-                    llama_update_context_with_prompt(ctx, buf, false);
+                    llama_update_input(ctx, buf);
+                    input_noecho = true; // do not echo this again
                 }
 
                 is_interacting = false;
             }
         }
 
         // end of text token
-        if (embd.back() == 2) {
+        if (is_end_of_text) {
             fprintf(stderr, " [end of text]\n");
             break;
         }
     }
 
-    // report timing from context
+
+#if defined (_WIN32)
+    signal(SIGINT, SIG_DFL);
+#endif
+
+    // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();
         llama_print_end_stats(ctx);
         fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
-    llama_free_context(ctx_ptr);
+
+    llama_free_context(ctx_ptr);
 
     if (params.use_color) {
         printf(ANSI_COLOR_RESET);
     }
-
     return 0;
 }
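For reference, below is a minimal, self-contained sketch of the consume-then-infer control flow that the rewritten main loop follows. The real code uses the `llama_context` API introduced in this diff (`llama_has_unconsumed_input`, `llama_ingest_all_pending_input`, `llama_infer`, `llama_context_is_finished`); here a stubbed `FakeContext` stands in for it so the flow can be compiled and run in isolation, and every name in the stub is invented for illustration.

```cpp
#include <cstdio>
#include <deque>
#include <string>

// Stand-in for the proposed llama_context: queues input tokens and "generates"
// a fixed number of output tokens before signalling end of text.
struct FakeContext {
    std::deque<std::string> pending; // tokenized prompt / user input awaiting ingestion
    int remaining = 5;               // fake generation budget

    bool has_unconsumed_input() const { return !pending.empty(); }
    bool is_finished() const { return remaining <= 0; }

    // consume all queued input, optionally echoing it (mirrors llama_ingest_all_pending_input)
    void ingest_all_pending_input(bool echo) {
        while (!pending.empty()) {
            if (echo) { std::printf("%s", pending.front().c_str()); }
            pending.pop_front();
        }
    }

    // produce a single token; flag end of text when the budget runs out (mirrors llama_infer)
    void infer(std::string & out, bool & is_end_of_text) {
        out = " token";
        --remaining;
        is_end_of_text = (remaining <= 0);
    }
};

int main() {
    FakeContext ctx;
    ctx.pending = {" Hello", ",", " world"}; // stands in for the tokenized prompt

    bool input_noecho = false;
    bool is_end_of_text = false;

    while (!ctx.is_finished()) {
        if (ctx.has_unconsumed_input()) {
            // drain any queued prompt or user input before generating
            ctx.ingest_all_pending_input(!input_noecho);
        } else {
            // nothing left to consume: run one inference step and print the token
            std::string model_output;
            ctx.infer(model_output, is_end_of_text);
            std::printf("%s", model_output.c_str());
            input_noecho = false;
        }

        if (is_end_of_text) {
            std::printf("\n[end of text]\n");
            break;
        }
    }
    return 0;
}
```

The point mirrored here is that pending input is always drained before any inference step, so prompt text and interactive input injected mid-generation are consumed on the next loop iteration instead of being handled in a separate code path.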