@@ -2975,8 +2975,6 @@ static struct ggml_cgraph * llm_build_llama(
 
     const auto & kv_self = lctx.kv_self;
 
-    int32_t * run_layer = batch.run_layers;
-
     GGML_ASSERT(!!kv_self.ctx);
 
     const int64_t n_embd = hparams.n_embd;
@@ -3132,12 +3130,27 @@ static struct ggml_cgraph * llm_build_llama(
         }
     }
 
-    for (int il_ = 0; il_ < n_layer; ++il_) {
-        int il = il_;
+    int32_t * run_layer = batch.run_layers;
+    bool run_attn = false, run_mlp = false;
+    cur = inpL;
+
+    for (int il = 0; il < n_layer; ++il) {
+        run_attn = run_mlp = true;
         if (run_layer != NULL) {
-            il = *run_layer++;
-            if (il < 0) break;
+            if (*run_layer >= 0) {
+                run_attn = (*run_layer & 1) == 0;
+                run_mlp  = (*run_layer & 2) == 0;
+                run_layer++;
+            } else {
+                run_layer = NULL;
+            }
+        } else if (ggml_allocr_is_measure(lctx.alloc) && il == n_layer - 1) {
+            // No idea why this is needed, but otherwise we run out of space
+            // when skipping attn or mlp (but not both) on the last layer
+            run_mlp = false;
         }
+        if (!run_attn && !run_mlp) continue;
+
         ggml_format_name(inpL, "layer_inp_%d", il);
 
         offload_func_t offload_func = llama_nop;
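The new loop consumes one `int32_t` control word per layer from `batch.run_layers`: bit 0 set skips attention, bit 1 set skips the feed-forward (MLP) block, and a negative entry ends the list, after which all remaining layers run in full. A hypothetical skip plan under that encoding (the array name and values here are illustrative only, not part of the patch):

```cpp
// Illustrative skip plan: one control word per layer.
//   bit 0 set -> skip attention    bit 1 set -> skip MLP
//   negative  -> stop reading the list; later layers run fully
static int32_t run_layers_plan[] = {
    0,  // layer 0: run attention and MLP
    1,  // layer 1: skip attention, run MLP
    2,  // layer 2: run attention, skip MLP
    3,  // layer 3: skip both (the loop `continue`s past it)
    -1, // sentinel: layers 4..n_layer-1 run in full
};
```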
@@ -3148,10 +3161,11 @@ static struct ggml_cgraph * llm_build_llama(
         }
 #endif // GGML_USE_CUBLAS
 
-        struct ggml_tensor * inpSA = inpL;
+        struct ggml_tensor * inpFF = nullptr;
 
-        // norm
-        {
+        // self-attention
+        if (run_attn) {
+            // norm
             cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
             offload_func(cur);
             ggml_set_name(cur, "rms_norm_0");
@@ -3160,10 +3174,7 @@ static struct ggml_cgraph * llm_build_llama(
             cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
             offload_func(cur);
             ggml_set_name(cur, "attention_norm_0");
-        }
 
-        // self-attention
-        {
             // compute Q and K and RoPE them
             struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
             offload_func_kq(tmpk);
@@ -3280,25 +3291,25 @@ static struct ggml_cgraph * llm_build_llama(
                     cur);
             offload_func(cur);
             ggml_set_name(cur, "result_wo");
-        }
 
-        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
-        offload_func(inpFF);
-        ggml_set_name(inpFF, "inpFF");
+            inpFF = ggml_add(ctx0, cur, inpL);
+            offload_func(inpFF);
+            ggml_set_name(inpFF, "inpFF");
+        } else {
+            inpFF = inpL;
+        }
 
         // feed-forward network
-        {
+        if (run_mlp) {
             // norm
-            {
-                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
-                offload_func(cur);
-                ggml_set_name(cur, "rms_norm_1");
+            cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_1");
 
-                // cur = cur*ffn_norm(broadcasted)
-                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
-                offload_func(cur);
-                ggml_set_name(cur, "ffn_norm");
-            }
+            // cur = cur*ffn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "ffn_norm");
 
             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
                     model.layers[il].w3,
@@ -3326,18 +3337,18 @@ static struct ggml_cgraph * llm_build_llama(
                     cur);
             offload_func(cur);
             ggml_set_name(cur, "result_w2");
-        }
 
-        cur = ggml_add(ctx0, cur, inpFF);
-        offload_func(cur);
-        ggml_set_name(cur, "inpFF_+_result_w2");
+            cur = ggml_add(ctx0, cur, inpFF);
+            offload_func(cur);
+            ggml_set_name(cur, "inpFF_+_result_w2");
+        } else {
+            cur = inpFF;
+        }
 
         // input for next layer
         inpL = cur;
     }
 
-    cur = inpL;
-
     // norm
     {
         cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
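Taken together, the hunks above reshape each layer so that a skipped sub-block degenerates to its residual bypass: with attention skipped, `inpFF` is just `inpL`; with the MLP skipped, `cur` is just `inpFF`. Hoisting `cur = inpL;` above the loop keeps `cur` defined even when every layer is skipped. A toy scalar analogue of the resulting control flow, with floats standing in for ggml tensors and `attn`/`mlp` as arbitrary stand-ins (not llama.cpp functions):

```cpp
#include <cstdint>
#include <cstddef>

static float attn(float x) { return 0.50f * x; } // stand-in for the attention block
static float mlp (float x) { return 0.25f * x; } // stand-in for the feed-forward block

// Mirrors the per-layer flow after this patch, with floats in place of
// ggml tensors. cur starts as the raw input, so even a fully skipped
// model returns something well-defined.
static float run_model(float x0, int n_layer, const int32_t * run_layer) {
    float inpL = x0;
    float cur  = inpL;
    for (int il = 0; il < n_layer; ++il) {
        bool run_attn = true, run_mlp = true;
        if (run_layer != nullptr) {
            if (*run_layer >= 0) {
                run_attn = (*run_layer & 1) == 0;
                run_mlp  = (*run_layer & 2) == 0;
                run_layer++;
            } else {
                run_layer = nullptr; // sentinel: remaining layers run fully
            }
        }
        if (!run_attn && !run_mlp) continue;                  // layer is a no-op
        float inpFF = run_attn ? inpL  + attn(inpL)  : inpL;  // residual add or bypass
        cur         = run_mlp  ? inpFF + mlp(inpFF)  : inpFF; // residual add or bypass
        inpL        = cur;                                    // input for next layer
    }
    return cur;
}
```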
@@ -9351,7 +9362,6 @@ void llama_batch_free(struct llama_batch batch) {
     if (batch.pos)    free(batch.pos);
     if (batch.seq_id) free(batch.seq_id);
     if (batch.logits) free(batch.logits);
-    if (batch.run_layers) free(batch.run_layers);
 }
 
 int llama_decode(
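With the `free(batch.run_layers)` above removed, the skip plan appears to become caller-owned. Under that assumption, and assuming this patch adds the `int32_t * run_layers` field to `struct llama_batch`, a hypothetical helper could attach static storage directly:

```cpp
#include <cstdint>
#include "llama.h"

// Hypothetical helper, assuming run_layers is a caller-owned field of
// struct llama_batch after this change: llama_batch_free() no longer
// frees it, so pointing it at static storage is safe.
static void set_skip_plan(struct llama_batch * batch) {
    static int32_t plan[] = { 0, 1, 2, 3, -1 }; // encoding as in the layer loop above
    batch->run_layers = plan;
}
```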