@@ -129,6 +129,15 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             if (params.n_threads <= 0) {
                 params.n_threads = std::thread::hardware_concurrency();
             }
+        } else if (arg == "-tb" || arg == "--threads-batch") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads_batch = std::stoi(argv[i]);
+            if (params.n_threads_batch <= 0) {
+                params.n_threads_batch = std::thread::hardware_concurrency();
+            }
         } else if (arg == "-p" || arg == "--prompt") {
             if (++i >= argc) {
                 invalid_param = true;
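The new flag parses like the existing -t: a non-positive value falls back to std::thread::hardware_concurrency(). Below is a minimal sketch (not part of this commit) of exercising it through gpt_params_parse(), assuming common.h from this revision; the test program itself is hypothetical.

// Hypothetical check: "-tb 8" fills params.n_threads_batch; omitting the flag
// keeps the default sentinel (-1, i.e. "same as --threads").
#include "common.h"
#include <cassert>

int main() {
    const char * args[] = {"prog", "-t", "4", "-tb", "8"};
    gpt_params params;
    assert(gpt_params_parse(5, const_cast<char **>(args), params));
    assert(params.n_threads == 4);
    assert(params.n_threads_batch == 8);
    return 0;
}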
@@ -451,12 +460,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.mul_mat_q = false;
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
-#endif // GGML_USE_CUBLAS
-        } else if (arg == "--low-vram" || arg == "-lv") {
-#ifdef GGML_USE_CUBLAS
-            params.low_vram = true;
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
@@ -630,7 +633,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        (can be specified more than once for multiple prompts).\n");
     printf("  --color               colorise output to distinguish prompt and user input from generations\n");
     printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
+    printf("  -tb N, --threads-batch N\n");
+    printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
     printf("  -p PROMPT, --prompt PROMPT\n");
     printf("                        prompt to start generation with (default: empty)\n");
     printf("  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
@@ -645,7 +650,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -f FNAME, --file FNAME\n");
     printf("                        prompt file to start generation.\n");
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-    printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
@@ -705,7 +710,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -ts SPLIT --tensor-split SPLIT\n");
     printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    printf("  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
 #ifdef GGML_USE_CUBLAS
     printf("  -nommq, --no-mul-mat-q\n");
     printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
@@ -726,6 +730,18 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("\n");
 }
 
+std::string get_system_info(const gpt_params & params) {
+    std::ostringstream os;
+
+    os << "system_info: n_threads = " << params.n_threads;
+    if (params.n_threads_batch != -1) {
+        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    }
+    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+
+    return os.str();
+}
+
 std::string gpt_random_prompt(std::mt19937 & rng) {
     const int r = rng() % 10;
     switch (r) {
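A minimal usage sketch (not part of this commit) for the helper added above, assuming it is declared in common.h next to gpt_params_parse():

#include "common.h"
#include <cstdio>

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }
    // prints e.g. "system_info: n_threads = 8 (n_threads_batch = 16) / 32 | ..."
    fprintf(stderr, "%s\n", get_system_info(params).c_str());
    return 0;
}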
@@ -749,40 +765,50 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 // Model utils
 //
 
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
-    auto lparams = llama_context_default_params();
+struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+    auto mparams = llama_model_default_params();
 
-    lparams.n_ctx   = params.n_ctx;
-    lparams.n_batch = params.n_batch;
     if (params.n_gpu_layers != -1) {
-        lparams.n_gpu_layers = params.n_gpu_layers;
+        mparams.n_gpu_layers = params.n_gpu_layers;
     }
-    lparams.main_gpu        = params.main_gpu;
-    lparams.tensor_split    = params.tensor_split;
-    lparams.low_vram        = params.low_vram;
-    lparams.mul_mat_q       = params.mul_mat_q;
-    lparams.seed            = params.seed;
-    lparams.f16_kv          = params.memory_f16;
-    lparams.use_mmap        = params.use_mmap;
-    lparams.use_mlock       = params.use_mlock;
-    lparams.logits_all      = params.logits_all;
-    lparams.embedding       = params.embedding;
-    lparams.rope_freq_base  = params.rope_freq_base;
-    lparams.rope_freq_scale = params.rope_freq_scale;
-
-    return lparams;
+    mparams.main_gpu     = params.main_gpu;
+    mparams.tensor_split = params.tensor_split;
+    mparams.use_mmap     = params.use_mmap;
+    mparams.use_mlock    = params.use_mlock;
+
+    return mparams;
+}
+
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+    auto cparams = llama_context_default_params();
+
+    cparams.n_ctx           = params.n_ctx;
+    cparams.n_batch         = params.n_batch;
+    cparams.n_threads       = params.n_threads;
+    cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.mul_mat_q       = params.mul_mat_q;
+    cparams.seed            = params.seed;
+    cparams.f16_kv          = params.memory_f16;
+    cparams.logits_all      = params.logits_all;
+    cparams.embedding       = params.embedding;
+    cparams.rope_freq_base  = params.rope_freq_base;
+    cparams.rope_freq_scale = params.rope_freq_scale;
+
+    return cparams;
 }
 
 std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
-    auto lparams = llama_context_params_from_gpt_params(params);
+    auto mparams = llama_model_params_from_gpt_params(params);
 
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
         return std::make_tuple(nullptr, nullptr);
     }
 
-    llama_context * lctx = llama_new_context_with_model(model, lparams);
+    auto cparams = llama_context_params_from_gpt_params(params);
+
+    llama_context * lctx = llama_new_context_with_model(model, cparams);
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
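A caller-side sketch (not part of this commit) of the split above: model parameters and context parameters are now built separately, and llama_init_from_gpt_params() hands back both handles. llama_free() is assumed from the same llama.h revision.

#include "common.h"
#include "llama.h"
#include <tuple>

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    llama_model *   model = nullptr;
    llama_context * ctx   = nullptr;
    // builds llama_model_params and llama_context_params internally, as in the hunk above
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // ... generation would go here ...

    llama_free(ctx);         // assumed from llama.h
    llama_free_model(model);
    return 0;
}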
@@ -815,7 +841,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         LOG("warming up the model with an empty run\n");
 
         std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0), params.n_threads);
+        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         llama_kv_cache_tokens_rm(lctx, -1, -1);
         llama_reset_timings(lctx);
     }
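A sketch (not part of this commit) of a decode call after this change: thread counts now travel with the context via llama_context_params, so llama_decode() no longer takes them per call.

#include "llama.h"
#include <vector>

// Assumes ctx was created as above and tokens holds a tokenized prompt.
static int eval_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
    // positions start at 0, sequence id 0; thread counts come from the context
    return llama_decode(ctx, llama_batch_get_one(tokens.data(), (int) tokens.size(), 0, 0));
}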
@@ -828,16 +854,23 @@ std::vector<llama_token> llama_tokenize(
 //
 
 std::vector<llama_token> llama_tokenize(
-        struct llama_context * ctx,
+        const struct llama_context * ctx,
+        const std::string & text,
+        bool add_bos) {
+    return llama_tokenize(llama_get_model(ctx), text, add_bos);
+}
+
+std::vector<llama_token> llama_tokenize(
+        const struct llama_model * model,
         const std::string & text,
         bool add_bos) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + add_bos;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
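A sketch (not part of this commit) of the two overloads above: code holding only a llama_model can tokenize before any context exists, while context-based callers are forwarded through llama_get_model(). The helper names below are hypothetical.

#include "common.h"
#include "llama.h"
#include <string>
#include <vector>

// Hypothetical helpers: tokenize a prompt from either handle.
static std::vector<llama_token> tokenize_prompt(const llama_model * model, const std::string & prompt) {
    const bool add_bos = true; // whether to prepend BOS is model/application dependent
    return llama_tokenize(model, prompt, add_bos);
}

static std::vector<llama_token> tokenize_prompt(const llama_context * ctx, const std::string & prompt) {
    return llama_tokenize(ctx, prompt, true); // forwards to the model overload via llama_get_model()
}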
@@ -847,10 +880,10 @@ std::vector<llama_token> llama_tokenize(
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -905,7 +938,7 @@ llama_token llama_sample_token(
         std::vector<llama_token_data> & candidates,
         int idx) {
     const int n_ctx   = llama_n_ctx(ctx);
-    const int n_vocab = llama_n_vocab(ctx);
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
 
     const float   temp  = params.temp;
     const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
@@ -1191,7 +1224,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 #endif // NDEBUG
 
     fprintf(stream, "model_desc: %s\n", model_desc);
-    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx));
+    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
 
 #ifdef __OPTIMIZE__
     fprintf(stream, "optimize: true\n");
@@ -1258,7 +1291,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
         fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
-    fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);