2
2
#include " common.h"
3
3
#include " llama.h"
4
4
5
+ #include < algorithm>
5
6
#include < cmath>
6
7
#include < cstdio>
7
8
#include < cstring>
@@ -321,12 +322,17 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
321
322
const int n_batch = params.n_batch ;
322
323
323
324
llama_batch batch = llama_batch_get_one (NULL , 0 , 0 , 0 );
325
+
326
+ const int32_t n_layers = 32 ;
327
+ const int test_count = 15 ;
324
328
std::vector<int32_t > layers;
325
- const int32_t n_layers = 26 ;
326
329
layers.resize (n_layers + 1 );
327
330
std::iota (layers.begin (), layers.end (), 0 );
328
331
batch.run_layers = layers.data ();
329
- int32_t skip_layer = 0 ;
332
+ int32_t skip_layer = -1 ;
333
+ std::vector<int32_t > skips;
334
+ int32_t curr_best_layer = -1 ;
335
+ double curr_best_ppl = -1 , ref_ppl = -1 ;
330
336
331
337
int count = 0 ;
332
338
double nll = 0.0 ;
@@ -337,22 +343,44 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
337
343
std::vector<std::thread> workers (std::thread::hardware_concurrency () - 1 );
338
344
339
345
for (int i = 0 ; i < n_chunk; ++i) {
340
- if (i > 0 && i % 20 == 0 ) {
341
- if (skip_layer >= n_layers) break ;
346
+ if (i > 0 && i % test_count == 0 ) {
347
+ for (int32_t new_sl = skip_layer + 1 ; new_sl <= n_layers; new_sl++) {
348
+ if (std::find (skips.begin (), skips.end (), new_sl) != skips.end ()) continue ;
349
+ skip_layer = new_sl;
350
+ break ;
351
+ }
352
+ if (skip_layer >= n_layers) {
353
+ if (curr_best_layer == -1 ) break ;
354
+ printf (" \n\n ADD SKIP %3d - ppl vs ref %.4f" , curr_best_layer, curr_best_ppl - ref_ppl);
355
+ if (curr_best_ppl >= ref_ppl * 5 ) break ;
356
+ skips.push_back (curr_best_layer);
357
+ curr_best_layer = -1 ;
358
+ curr_best_ppl = -1 ;
359
+ skip_layer = -1 ;
360
+ for (int32_t new_sl = skip_layer + 1 ; new_sl <= n_layers; new_sl++) {
361
+ if (std::find (skips.begin (), skips.end (), new_sl) != skips.end ()) continue ;
362
+ skip_layer = new_sl;
363
+ break ;
364
+ }
365
+ if (skip_layer == -1 || skip_layer == n_layers) break ;
366
+ }
342
367
i = 0 ;
343
368
count = 0 ;
344
369
nll = 0 ;
345
370
nll2 = 0 ;
346
371
logit_history.clear ();
347
372
prob_history.clear ();
348
373
349
- for (int32_t i = 0 , ic = 0 ; i < n_layers; i++) {
350
- if (i == skip_layer) continue ;
374
+ int32_t ic = 0 ;
375
+ for (int32_t i = 0 ; i < n_layers; i++) {
376
+ if (i == skip_layer || std::find (skips.begin (), skips.end (), i) != skips.end ()) continue ;
351
377
layers[ic++] = i;
352
378
}
353
- layers[n_layers - 1 ] = -1 ; // we skipped 1
354
- printf (" \n SKIPPING: %d\n " , skip_layer);
355
- skip_layer++;
379
+ if (ic == 0 ) break ;
380
+ layers[ic] = -1 ;
381
+ printf (" \n SKIP %3d + [" , skip_layer);
382
+ for (const auto l : skips) printf (" %d," , l);
383
+ printf (" ] - len: %3zu, best:(%3d: %.3f)\n " , skips.size () + 1 , curr_best_layer, curr_best_ppl != -1 ? curr_best_ppl - ref_ppl : 0 );
356
384
}
357
385
const int start = i * n_ctx;
358
386
const int end = start + n_ctx;
@@ -396,7 +424,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
396
424
397
425
const auto t_end = std::chrono::high_resolution_clock::now ();
398
426
399
- if (i == 0 && skip_layer == 0 ) {
427
+ if (i == 0 && skip_layer < 0 && skips. empty () ) {
400
428
const float t_total = std::chrono::duration<float >(t_end - t_start).count ();
401
429
fprintf (stderr, " %s: %.2f seconds per pass - ETA " , __func__, t_total);
402
430
int total_seconds = (int )(t_total * n_chunk);
@@ -425,15 +453,24 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
425
453
count += n_ctx - first - 1 ;
426
454
427
455
// perplexity is e^(average negative log-likelihood)
428
- if (params.ppl_output_type == 0 ) {
429
- printf (" [%d]%.4lf," , i + 1 , std::exp (nll / count));
430
- } else {
431
- double av = nll/count;
432
- double av2 = nll2/count - av*av;
433
- if (av2 > 0 ) av2 = sqrt (av2/(count-1 ));
434
- printf (" %8d %.4lf %4lf %4lf\n " , i*n_ctx, std::exp (nll / count), av, av2);
456
+ // if (params.ppl_output_type == 0) {
457
+ // printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
458
+ // } else {
459
+ // double av = nll/count;
460
+ // double av2 = nll2/count - av*av;
461
+ // if (av2 > 0) av2 = sqrt(av2/(count-1));
462
+ // printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
463
+ // }
464
+ // fflush(stdout);
465
+ if (skip_layer >= 0 && i + 1 == test_count) {
466
+ double ppl = std::exp (nll / count);
467
+ if (curr_best_layer == -1 || ppl < curr_best_ppl) {
468
+ curr_best_layer = skip_layer;
469
+ curr_best_ppl = ppl;
470
+ }
471
+ } else if (skip_layer < 0 ) {
472
+ ref_ppl = std::exp (nll / count);
435
473
}
436
- fflush (stdout);
437
474
}
438
475
printf (" \n " );
439
476
0 commit comments