@@ -173,17 +173,13 @@ int main(int argc, char ** argv) {

     llama_model * model;
     llama_context * ctx;
-    llama_context * ctx_guidance = NULL;
+
     g_model = &model;
     g_ctx = &ctx;

     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (sparams.cfg_scale > 1.f) {
-        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
-        ctx_guidance = llama_new_context_with_model(model, lparams);
-    }

     if (model == NULL) {
         LOG_TEE("%s: error: unable to load model\n", __func__);
@@ -239,25 +235,6 @@ int main(int argc, char ** argv) {
         LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
     }

-    // Tokenize negative prompt
-    std::vector<llama_token> guidance_inp;
-    int guidance_offset = 0;
-    int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset: %s", log_tostr(guidance_offset));
-    }
-
     if ((int) embd_inp.size() > n_ctx - 4) {
         LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
@@ -285,15 +262,6 @@ int main(int argc, char ** argv) {
             LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }

-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
-        }
-
         if (params.n_keep > 0) {
             LOG_TEE("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
@@ -361,12 +329,11 @@ int main(int argc, char ** argv) {
         is_interacting = params.interactive_first;
     }

-    bool input_echo           = true;
+    bool input_echo = true;

-    int n_past             = 0;
-    int n_remain           = params.n_predict;
-    int n_consumed         = 0;
-    int n_past_guidance    = 0;
+    int n_past     = 0;
+    int n_remain   = params.n_predict;
+    int n_consumed = 0;

     std::vector<int> input_tokens;  g_input_tokens  = &input_tokens;
     std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@@ -376,7 +343,6 @@ int main(int argc, char ** argv) {
     console::set_display(console::prompt);

     std::vector<llama_token> embd;
-    std::vector<llama_token> embd_guidance;

     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);

@@ -402,7 +368,7 @@ int main(int argc, char ** argv) {
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+            if (n_past + (int) embd.size() > n_ctx) {
                 if (params.n_predict == -2) {
                     LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                     break;
@@ -419,57 +385,14 @@ int main(int argc, char ** argv) {

                 n_past -= n_discard;

-                if (ctx_guidance) {
-                    n_past_guidance -= n_discard;
-                }
-
-                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+                LOG("after swap: n_past = %d\n", n_past);

                 LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());

             }

             // evaluate tokens in batches
             // embd is typically prepared beforehand to fit within a batch, but not always
-
-            if (ctx_guidance) {
-                int input_size = 0;
-                llama_token * input_buf = NULL;
-
-                if (n_past_guidance < (int) guidance_inp.size()) {
-                    // Guidance context should have the same data with these modifications:
-                    //
-                    // * Replace the initial prompt
-                    // * Shift everything by guidance_offset
-                    embd_guidance = guidance_inp;
-                    if (embd.begin() + original_prompt_len < embd.end()) {
-                        embd_guidance.insert(
-                            embd_guidance.end(),
-                            embd.begin() + original_prompt_len,
-                            embd.end()
-                        );
-                    }
-
-                    input_buf  = embd_guidance.data();
-                    input_size = embd_guidance.size();
-
-                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
-                } else {
-                    input_buf  = embd.data();
-                    input_size = embd.size();
-                }
-
-                for (int i = 0; i < input_size; i += params.n_batch) {
-                    int n_eval = std::min(input_size - i, params.n_batch);
-                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
-                        LOG_TEE("%s : failed to eval\n", __func__);
-                        return 1;
-                    }
-
-                    n_past_guidance += n_eval;
-                }
-            }
-
             for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                 int n_eval = (int) embd.size() - i;
                 if (n_eval > params.n_batch) {
@@ -491,11 +414,9 @@ int main(int argc, char ** argv) {
         }

         embd.clear();
-        embd_guidance.clear();

         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-
-            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+            const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);

             llama_sampling_accept(ctx_sampling, ctx, id, true);

@@ -549,7 +470,6 @@ int main(int argc, char ** argv) {

         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
-
             // deal with eot token in infill mode
             if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
                 if (is_interacting && !params.interactive_first) {
@@ -610,7 +530,6 @@ int main(int argc, char ** argv) {
                     embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                     embd_inp.push_back(llama_token_middle(model));
                     embd.clear();
-                    embd_guidance.clear();
                     n_remain = params.n_predict;
                     n_past = 0;
                     n_consumed = 0;
@@ -717,7 +636,6 @@ int main(int argc, char ** argv) {
     llama_print_timings(ctx);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

-    if (ctx_guidance) { llama_free(ctx_guidance); }
     llama_free(ctx);
     llama_free_model(model);

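For reference, the classifier-free guidance (CFG) path that these hunks delete worked roughly as follows. The sketch below is condensed from the removed lines only and uses the pre-change llama.cpp common API exactly as it appears in them; params, sparams, model, ctx, embd and ctx_sampling are the surrounding variables of the old infill.cpp, and n_past_guidance is kept across generation steps in the real code rather than reset as shown here.

// Condensed sketch of the removed CFG flow (illustration only, not part of the
// example after this change).

// 1. A second context is created only when --cfg-scale > 1.
llama_context * ctx_guidance = NULL;
if (sparams.cfg_scale > 1.f) {
    struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
    ctx_guidance = llama_new_context_with_model(model, lparams);
}

// 2. The negative prompt is tokenized separately; guidance_offset (how much longer
//    it is than the original prompt) also fed the removed context-full check above.
std::vector<llama_token> guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
const int original_prompt_len = (int) original_inp.size();
const int guidance_offset     = (int) guidance_inp.size() - original_prompt_len;

// 3. Before sampling, the guidance context is brought up to date: while its prompt
//    is still being ingested it gets "negative prompt + generated tokens", afterwards
//    it gets the same new tokens (embd) as the main context.
int n_past_guidance = 0;
std::vector<llama_token> embd_guidance;
if (n_past_guidance < (int) guidance_inp.size()) {
    embd_guidance = guidance_inp;
    if ((int) embd.size() > original_prompt_len) {
        embd_guidance.insert(embd_guidance.end(), embd.begin() + original_prompt_len, embd.end());
    }
} else {
    embd_guidance = embd;
}
for (int i = 0; i < (int) embd_guidance.size(); i += params.n_batch) {
    const int n_eval = std::min((int) embd_guidance.size() - i, params.n_batch);
    if (llama_decode(ctx_guidance, llama_batch_get_one(embd_guidance.data() + i, n_eval, n_past_guidance, 0))) {
        return 1; // failed to eval
    }
    n_past_guidance += n_eval;
}

// 4. The sampler then contrasts the guidance logits with the main logits according
//    to sparams.cfg_scale.
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);

// 5. The extra context is freed on shutdown.
if (ctx_guidance) { llama_free(ctx_guidance); }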