Iteratively skip the layer with the least impact

KerfuffleV2 · KerfuffleV2 · commit 599ccdaaefd7 · 2023-10-17T11:45:48.000-06:00
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
@@ -2,6 +2,7 @@
 #include "common.h"
 #include "llama.h"
 
+#include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -321,12 +322,17 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     const int n_batch = params.n_batch;
 
     llama_batch batch = llama_batch_get_one(NULL, 0, 0, 0);
+
+    const int32_t n_layers = 32;
+    const int test_count = 15;
     std::vector<int32_t> layers;
-    const int32_t n_layers = 26;
     layers.resize(n_layers + 1);
     std::iota(layers.begin(), layers.end(), 0);
     batch.run_layers = layers.data();
-    int32_t skip_layer = 0;
+    int32_t skip_layer = -1;
+    std::vector<int32_t> skips;
+    int32_t curr_best_layer = -1;
+    double curr_best_ppl = -1, ref_ppl = -1;
 
     int count = 0;
     double nll = 0.0;
@@ -337,22 +343,44 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
     for (int i = 0; i < n_chunk; ++i) {
-        if (i > 0 && i % 20 == 0) {
-            if (skip_layer >= n_layers) break;
+        if (i > 0 && i % test_count == 0) {
+            for (int32_t new_sl = skip_layer + 1; new_sl <= n_layers; new_sl++) {
+                if (std::find(skips.begin(), skips.end(), new_sl) != skips.end()) continue;
+                skip_layer = new_sl;
+                break;
+            }
+            if (skip_layer >= n_layers) {
+                if (curr_best_layer == -1) break;
+                printf("\n\nADD SKIP %3d - ppl vs ref %.4f", curr_best_layer, curr_best_ppl - ref_ppl);
+                if (curr_best_ppl >= ref_ppl * 5) break;
+                skips.push_back(curr_best_layer);
+                curr_best_layer = -1;
+                curr_best_ppl = -1;
+                skip_layer = -1;
+                for (int32_t new_sl = skip_layer + 1; new_sl <= n_layers; new_sl++) {
+                    if (std::find(skips.begin(), skips.end(), new_sl) != skips.end()) continue;
+                    skip_layer = new_sl;
+                    break;
+                }
+                if (skip_layer == -1 || skip_layer == n_layers) break;
+            }
             i = 0;
             count = 0;
             nll = 0;
             nll2 = 0;
             logit_history.clear();
             prob_history.clear();
 
-            for (int32_t i = 0, ic = 0; i < n_layers; i++) {
-                if (i == skip_layer) continue;
+            int32_t ic = 0;
+            for (int32_t i = 0; i < n_layers; i++) {
+                if (i == skip_layer || std::find(skips.begin(), skips.end(), i) != skips.end()) continue;
                 layers[ic++] = i;
             }
-            layers[n_layers - 1] = -1; // we skipped 1
-            printf("\nSKIPPING: %d\n", skip_layer);
-            skip_layer++;
+            if (ic == 0) break;
+            layers[ic] = -1;
+            printf("\nSKIP %3d + [", skip_layer);
+            for (const auto l : skips) printf("%d,", l);
+            printf("] - len: %3zu, best:(%3d: %.3f)\n", skips.size() + 1, curr_best_layer, curr_best_ppl != -1 ? curr_best_ppl - ref_ppl : 0);
         }
         const int start =     i * n_ctx;
         const int end   = start + n_ctx;
@@ -396,7 +424,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 
         const auto t_end = std::chrono::high_resolution_clock::now();
 
-        if (i == 0 && skip_layer == 0) {
+        if (i == 0 && skip_layer < 0 && skips.empty()) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
             fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
@@ -425,15 +453,24 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         count += n_ctx - first - 1;
 
         // perplexity is e^(average negative log-likelihood)
-        if (params.ppl_output_type == 0) {
-            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        } else {
-            double av = nll/count;
-            double av2 = nll2/count - av*av;
-            if (av2 > 0) av2 = sqrt(av2/(count-1));
-            printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+        // if (params.ppl_output_type == 0) {
+        //     printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        // } else {
+        //     double av = nll/count;
+        //     double av2 = nll2/count - av*av;
+        //     if (av2 > 0) av2 = sqrt(av2/(count-1));
+        //     printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+        // }
+        // fflush(stdout);
+        if (skip_layer >= 0 && i + 1 == test_count) {
+            double ppl = std::exp(nll / count);
+            if (curr_best_layer == -1 || ppl < curr_best_ppl) {
+                curr_best_layer = skip_layer;
+                curr_best_ppl = ppl;
+            }
+        } else if (skip_layer < 0) {
+            ref_ppl = std::exp(nll / count);
         }
-        fflush(stdout);
     }
     printf("\n");