@@ -919,7 +919,8 @@ int main(int argc, char ** argv) {
                " - If you want to submit another line, end your input in '\\'.\n");
     }
 
-    int remaining_tokens = params.n_predict;
+    // we may want to slide the input window along with the context, but for now we restrict to the context length
+    int remaining_tokens = model.hparams.n_ctx - embd_inp.size();
 
     int input_consumed = 0;
     bool input_noecho = true;
@@ -935,7 +936,7 @@ int main(int argc, char ** argv) {
-    while (true) {
+    while (remaining_tokens > 0) {
         // predict
         if (embd.size() > 0) {
            const int64_t t_start_us = ggml_time_us();
@@ -980,7 +981,7 @@ int main(int argc, char ** argv) {
            input_noecho = false;
 
            // decrement remaining sampling budget
-           // --remaining_tokens;
+           --remaining_tokens;
        } else {
            // some user input remains from prompt or interaction, forward it to processing
            while (embd_inp.size() > input_consumed) {
@@ -1054,6 +1055,8 @@ int main(int argc, char ** argv) {
                embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
                embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end());
 
+               remaining_tokens -= prompt_inp.size() + line_inp.size() + response_inp.size();
+
                input_noecho = true; // do not echo this again
            }
0 commit comments