@@ -85,7 +85,7 @@ struct llama_model {
 
 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     std::vector<char> f_buf(1024*1024);
 
@@ -127,16 +127,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
         n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_mult  = %d\n", __func__, hparams.n_mult);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
-        printf("%s: n_ff    = %d\n", __func__, n_ff);
-        printf("%s: n_parts = %d\n", __func__, n_parts);
+        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd  = %d\n", __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult  = %d\n", __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head  = %d\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot   = %d\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: f16     = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: n_ff    = %d\n", __func__, n_ff);
+        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
     }
 
     // load vocab
@@ -161,7 +161,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             vocab.id_to_token[i] = word;
 
             //if (i < 30000) {
-            //    printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
+            //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
             //}
         }
     }
@@ -220,7 +220,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
 
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
     // create the ggml context
@@ -307,7 +307,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 
-        printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+        fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
     }
 
     const size_t file_offset = fin.tellg();
@@ -325,7 +325,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             fname_part += "." + std::to_string(i);
         }
 
-        printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 
         fin = std::ifstream(fname_part, std::ios::binary);
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
@@ -336,7 +336,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             int n_tensors = 0;
             size_t total_size = 0;
 
-            printf("%s: ", __func__);
+            fprintf(stderr, "%s: ", __func__);
 
             while (true) {
                 int32_t n_dims;
@@ -436,7 +436,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
                 if (0) {
                     static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                    printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
+                    fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
                 }
 
                 size_t bpe = 0;
@@ -499,16 +499,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
                     total_size += ggml_nbytes(tensor)/n_parts;
                 }
 
-                //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+                //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
                 if (++n_tensors % 8 == 0) {
-                    printf(".");
-                    fflush(stdout);
+                    fprintf(stderr, ".");
+                    fflush(stderr);
                 }
             }
 
-            printf(" done\n");
+            fprintf(stderr, " done\n");
 
-            printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+            fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
         }
 
         fin.close();
@@ -552,7 +552,7 @@ bool llama_eval(
 
     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
         const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
 
         // reallocate
         buf_size = buf_size_new;
@@ -744,7 +744,7 @@ bool llama_eval(
     if (mem_per_token == 0) {
         mem_per_token = ggml_used_mem(ctx0)/N;
     }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
+    //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0));
 
     ggml_free(ctx0);
 
@@ -780,7 +780,7 @@ int main(int argc, char ** argv) {
         params.seed = time(NULL);
     }
 
-    printf("%s: seed = %d\n", __func__, params.seed);
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.prompt.empty()) {
@@ -822,13 +822,13 @@ int main(int argc, char ** argv) {
     // tokenize the reverse prompt
     std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
 
-    printf("\n");
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    fprintf(stderr, "\n");
+    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
     for (int i = 0; i < (int) embd_inp.size(); i++) {
-        printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
     }
-    printf("\n");
+    fprintf(stderr, "\n");
     if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
         struct sigaction sigint_action;
@@ -838,19 +838,19 @@ int main(int argc, char ** argv) {
         sigaction(SIGINT, &sigint_action, NULL);
 #endif
 
-        printf("%s: interactive mode on.\n", __func__);
+        fprintf(stderr, "%s: interactive mode on.\n", __func__);
 
         if (antiprompt_inp.size()) {
-            printf("%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
-            printf("%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
+            fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
+            fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
             for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-                printf("%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
             }
-            printf("\n");
+            fprintf(stderr, "\n");
         }
     }
-    printf("sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    printf("\n\n");
+    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "\n\n");
 
     std::vector<gpt_vocab::id> embd;
 
@@ -864,7 +864,7 @@ int main(int argc, char ** argv) {
 
 
     if (params.interactive) {
-        printf("== Running in interactive mode. ==\n"
+        fprintf(stderr, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
                " - Press Ctrl+C to interject at any time.\n"
 #endif
@@ -892,7 +892,7 @@ int main(int argc, char ** argv) {
             const int64_t t_start_us = ggml_time_us();
 
             if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
+                fprintf(stderr, "Failed to predict\n");
                 return 1;
             }
 
@@ -1005,7 +1005,7 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (embd.back() == 2) {
-            printf(" [end of text]\n");
+            fprintf(stderr, " [end of text]\n");
             break;
         }
     }
@@ -1015,12 +1015,12 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_main_end_us = ggml_time_us();
 
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        fprintf(stderr, "\n\n");
+        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        fprintf(stderr, "%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        fprintf(stderr, "%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
 
     ggml_free(model.ctx);
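Not part of the commit above, but a minimal standalone sketch of the convention the patch adopts: every status, progress, and timing message goes to stderr, while stdout is reserved for the generated text, so shell redirection can cleanly separate the two. The messages and values below are illustrative only, not taken from the patch.

// illustration only: diagnostics -> stderr, payload -> stdout
#include <cstdio>

int main() {
    // status and stats, like those in llama_model_load(), go to stderr
    fprintf(stderr, "%s: loading model - please wait ...\n", __func__);
    fprintf(stderr, "%s: n_ctx = %d\n", __func__, 512);

    // generated tokens are the only thing written to stdout
    printf("Once upon a time");
    fflush(stdout);

    // end-of-text marker and timing summary stay on stderr
    fprintf(stderr, "\n[end of text]\n");
    fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, 12.34);

    return 0;
}

With this split, running `./main ... > story.txt` leaves story.txt containing only the generated text, and appending `2> llama.log` captures the diagnostics in a separate file.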