Integrate SparseK attention mask support and cleanup

Co-authored-by: Yael <yaelshuker100@gmail.com>

Gitty Burstein · Gitty Burstein · commit 7c5f85a4bd5e · 2025-11-05T13:23:51.000+02:00
Co-authored-by: Gitty <g0534163997@gmail.com>
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
@@ -1300,6 +1300,75 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
 
                     data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
                 }
+                // ===== SparseK (minimal diff): local window + stride =====
+                // Prunes the already-built mask rows: a column stays open only if it lies in the
+                // local window or on the stride grid of its row; pruning never re-opens a column.
+                // NOTE(review): row index i and column index j are compared as if both were
+                // positions - confirm against the KV cell layout. This block also appears to sit
+                // inside the enclosing per-row loop (full pass per row) - TODO hoist after the loops.
+                // Control parameters via environment variables (no CLI / API changes);
+                // strtol replaces atoi because atoi is undefined for out-of-range input.
+                auto env_i = [](const char * name, int def) -> int {
+                    const char * s = std::getenv(name);
+                    if (s == nullptr) { return def; }
+                    return int(std::min<long>(std::max<long>(0L, std::strtol(s, nullptr, 10)), 1L << 30));
+                };
+                auto env_b = [](const char * name, bool def) -> bool {
+                    const char * s = std::getenv(name);
+                    return s != nullptr ? std::strtol(s, nullptr, 10) != 0 : def;
+                };
+
+                // Enable SparseK (0=off, 1=on) + parameters
+                const bool enable_sparsek  = env_b("LLAMA_SPARSEK_ENABLE", false);
+                const int  win_local       = env_i("LLAMA_SPARSEK_WIN",    64);   // half-window around i
+                const int  stride_g        = env_i("LLAMA_SPARSEK_STRIDE", 128);  // global stride step
+                const bool en_local        = env_b("LLAMA_SPARSEK_ENABLE_LOCAL",  true);
+                const bool en_stride       = env_b("LLAMA_SPARSEK_ENABLE_STRIDE", true);
+
+                // Everything outside the SparseK policy is forced to -INF (n_kv == 0: nothing to do).
+                if (enable_sparsek && (en_local || en_stride) && n_kv > 0) {
+                    const int64_t last = int64_t(n_kv) - 1;
+                    std::vector<uint8_t> allow; // 1 = keep, 0 = prune; one buffer reused for all rows
+                    for (uint32_t s = 0; s < n_stream; ++s) {
+                        for (uint32_t ii = 0; ii < n_tps; ++ii) {
+                            const int64_t i = int64_t(s)*int64_t(n_tps) + int64_t(ii);
+                            // Row base index in the flat mask tensor
+                            const uint64_t idst = n_kv*(/*h=*/0*n_stream*n_tps_pad + s*n_tps_pad + ii);
+                            float * row = data + idst;
+                            allow.assign(size_t(n_kv), 0);
+
+                            // 1) Local window [i - win_local, i + win_local], clipped to the row
+                            if (en_local && win_local > 0) {
+                                const int64_t j0 = std::max<int64_t>(0,    i - win_local);
+                                const int64_t j1 = std::min<int64_t>(last, i + win_local);
+                                for (int64_t j = j0; j <= j1; ++j) { allow[j] = 1; }
+                            }
+
+                            // 2) Global stride: backward only for causal; both directions if non-causal
+                            if (en_stride && stride_g > 0) {
+                                // i may exceed the row length, so the backward walk checks the upper bound
+                                for (int64_t j = i; j >= 0; j -= stride_g) { if (j <= last) { allow[j] = 1; } }
+                                if (!causal_attn) {
+                                    for (int64_t j = i; j <= last; j += stride_g) { allow[j] = 1; }
+                                }
+                            }
+
+                            // 3) Prune only: open columns outside "allow" become -INF, columns that are
+                            //    already -INF are left alone. Track the last open column for the fallback.
+                            bool    any_kept = false;
+                            int64_t j_open   = -1;
+                            float   v_open   = 0.0f;
+                            for (int64_t j = 0; j <= last; ++j) {
+                                if (std::isinf(row[j]) && row[j] < 0.0f) { continue; }
+                                j_open = j; v_open = row[j];
+                                if (allow[j]) { any_kept = true; } else { row[j] = -INFINITY; }
+                            }
+                            // Safety: a row that had open columns must keep one (avoid NaN in softmax)
+                            if (!any_kept && j_open >= 0) { row[j_open] = v_open; }
+                        }
+                    }
+                }
+                // ===== end SparseK minimal =====
             }
         }
     }