@@ -76,21 +76,25 @@ int main(int argc, char ** argv) {
     // params.prompt = R"(// this function checks if the number n is prime
     // bool is_prime(int n) {)";
 
-    int64_t t_load_us = 0;
-
     // load the model
-    llama_context* ctx_ptr = llama_init_from_params(params);
+    llama_context* ctx_ptr = nullptr;
+    {
+        ctx_ptr = llama_init_from_params(params);
+        if (!ctx_ptr) {
+            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
     llama_context & ctx = *ctx_ptr;
-    gpt_vocab & vocab = llama_context_get_vocab(ctx);
-
-    // print system information
-    llama_print_context_info(ctx);
+    const gpt_vocab & vocab = llama_context_get_vocab(ctx);
 
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
 
     // tokenize the reverse prompt
-    std::vector<gpt_vocab::id> antiprompt_inp = llama_tokenize_text(ctx, params.prompt);
+    std::vector<gpt_vocab::id> antiprompt_inp = llama_tokenize_text(ctx, params.antiprompt);
+
 
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
@@ -126,8 +130,6 @@ int main(int argc, char ** argv) {
                "   - If you want to submit another line, end your input in '\\'.\n");
     }
 
-    bool input_noecho = false;
-
     // prompt user immediately after the starting prompt has been loaded
     if (params.interactive_start) {
         is_interacting = true;
@@ -138,39 +140,44 @@ int main(int argc, char ** argv) {
         printf(ANSI_COLOR_YELLOW);
     }
 
-    if (!llama_ingest_input(ctx, params.prompt))
+    // Prepare the context with input
+    // Send "beginning of string"
+    llama_add_bos(ctx);
+
+    // load the input
+    llama_update_input(ctx, params.prompt);
+
+    llama_print_startup_stats(ctx);
+
+    if (!llama_prepare_context(ctx))
     {
-        fprintf(stderr, "Failed to ingest prompt\n");
+        fprintf(stderr, "%s: failed to prepare context\n", __func__);
         return 1;
-    };
-
-    // display text
-    input_noecho = false;
-    const std::vector<gpt_vocab::id>& embd = llama_context_get_embedding(ctx);
-    if (!input_noecho) {
-        for (auto id : embd) {
-            printf("%s", vocab.id_to_token[id].c_str());
-        }
-        fflush(stdout);
     }
 
-    if (!input_noecho && params.use_color) {
-        printf(ANSI_COLOR_RESET);
-    }
-
-    const std::vector<gpt_vocab::id>& last_n_tokens = llama_context_get_last_n_tokens(ctx);
-
-    while (llama_context_is_finished(ctx) != true) {
-        gpt_vocab::id model_output = 0;
-        bool response = llama_infer(ctx, model_output);
-        if (response) {
-            printf("%s", vocab.id_to_token[model_output].c_str());
-            fflush(stdout);
+    bool input_noecho = false;
+    bool is_end_of_text = false;
+    while (llama_context_is_finished(ctx) == false) {
+        std::string model_output{};
+
+        if (llama_has_unconsumed_input(ctx)) {
+            llama_ingest_all_pending_input(ctx, !input_noecho);
+            // reset color to default if there is no pending user input
+            if (!input_noecho && params.use_color) {
+                printf(ANSI_COLOR_RESET);
+            }
+        } else {
+            // Run inference if we don't have any pending input
+            llama_infer(ctx, model_output, is_end_of_text);
+            // print the single token output
+            printf("%s", model_output.c_str());
+            input_noecho = false;
         }
 
         // in interactive mode, and not currently processing queued inputs;
         // check if we should prompt the user for more
-        if (params.interactive) {
+        if (params.interactive && !llama_has_unconsumed_input(ctx)) {
+            const std::vector<gpt_vocab::id>& last_n_tokens = llama_context_get_last_n_tokens(ctx);
             // check for reverse prompt
             if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) {
                 // reverse prompt found
@@ -200,32 +207,39 @@ int main(int argc, char ** argv) {
                         buf[n_read] = '\n';
                         buf[n_read+1] = 0;
                     }
+
                     // Do not clear existing context in interactive mode
-                    llama_update_context_with_prompt(ctx, buf, false);
+                    llama_update_input(ctx, buf);
+                    input_noecho = true; // do not echo this again
                 }
 
                 is_interacting = false;
             }
         }
 
         // end of text token
-        if (embd.back() == 2) {
+        if (is_end_of_text) {
             fprintf(stderr, " [end of text]\n");
             break;
         }
     }
 
-    // report timing from context
+
+#if defined (_WIN32)
+    signal(SIGINT, SIG_DFL);
+#endif
+
+    // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();
         llama_print_end_stats(ctx);
         fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
-    llama_free_context(ctx_ptr);
+
+    llama_free_context(ctx_ptr);
 
     if (params.use_color) {
         printf(ANSI_COLOR_RESET);
     }
-
     return 0;
 }
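For reference, below is a minimal, self-contained sketch of the consume-then-infer control flow that the rewritten main loop follows. The real code uses the `llama_context` API introduced in this diff (`llama_has_unconsumed_input`, `llama_ingest_all_pending_input`, `llama_infer`, `llama_context_is_finished`); here a stubbed `FakeContext` stands in for it so the flow can be compiled and run in isolation, and every name in the stub is invented for illustration.

```cpp
#include <cstdio>
#include <deque>
#include <string>

// Stand-in for the proposed llama_context: queues input tokens and "generates"
// a fixed number of output tokens before signalling end of text.
struct FakeContext {
    std::deque<std::string> pending; // tokenized prompt / user input awaiting ingestion
    int remaining = 5;               // fake generation budget

    bool has_unconsumed_input() const { return !pending.empty(); }
    bool is_finished() const { return remaining <= 0; }

    // consume all queued input, optionally echoing it (mirrors llama_ingest_all_pending_input)
    void ingest_all_pending_input(bool echo) {
        while (!pending.empty()) {
            if (echo) { std::printf("%s", pending.front().c_str()); }
            pending.pop_front();
        }
    }

    // produce a single token; flag end of text when the budget runs out (mirrors llama_infer)
    void infer(std::string & out, bool & is_end_of_text) {
        out = " token";
        --remaining;
        is_end_of_text = (remaining <= 0);
    }
};

int main() {
    FakeContext ctx;
    ctx.pending = {" Hello", ",", " world"}; // stands in for the tokenized prompt

    bool input_noecho = false;
    bool is_end_of_text = false;

    while (!ctx.is_finished()) {
        if (ctx.has_unconsumed_input()) {
            // drain any queued prompt or user input before generating
            ctx.ingest_all_pending_input(!input_noecho);
        } else {
            // nothing left to consume: run one inference step and print the token
            std::string model_output;
            ctx.infer(model_output, is_end_of_text);
            std::printf("%s", model_output.c_str());
            input_noecho = false;
        }

        if (is_end_of_text) {
            std::printf("\n[end of text]\n");
            break;
        }
    }
    return 0;
}
```

The point mirrored here is that pending input is always drained before any inference step, so prompt text and interactive input injected mid-generation are consumed on the next loop iteration instead of being handled in a separate code path.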