Commit df59fa2

Gitty Burstein (GittyBurstein) and yael-works committed
SparseK: static mask integration in graph build (non-dynamic proof-of-concept)
Co-authored-by: Gitty Burstein <[email protected]>
Co-authored-by: Yael Shuker <[email protected]>
1 parent a2f79cc commit df59fa2

File tree: 2 files changed (+238 −7 lines)


src/llama-graph.cpp

Lines changed: 181 additions & 7 deletions
@@ -13,6 +13,15 @@
 #include <cmath>
 #include <cstring>
 
+#include <algorithm> // std::fill, std::partial_sort, std::max
+#include <vector>
+#include <utility>
+#include <cstdlib>   // getenv, atoi
+
+// forward declaration for debug printing of KQ masks
+static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv,
+                       int64_t n_swa, llama_swa_type swa_type);
+
 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     if (ubatch->token) {
         const int64_t n_tokens = ubatch->n_tokens;
@@ -261,7 +270,8 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv,
+                       int64_t n_swa, llama_swa_type swa_type) {
     LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
     const char * swa_type_str = "unknown";
 
@@ -296,6 +306,111 @@ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64
     }
 }
 
+// --- Implementations for llm_graph_input_sparsek_mask (declared in header) ---
+
+llm_graph_input_sparsek_mask::llm_graph_input_sparsek_mask(
+        const llama_hparams & hp,
+        const llama_cparams & cp,
+        const llama_ubatch & ub,
+        const llama_kv_cache_context * mctx_)
+    : hparams(hp), cparams(cp), ubatch(ub), mctx(mctx_) {
+    enabled     = getenv("LLAMA_SPARSEK") != nullptr;
+    win_local   = std::max(0, getenv("LLAMA_SPARSEK_WIN")    ? atoi(getenv("LLAMA_SPARSEK_WIN"))    : 0);
+    stride_glob = std::max(0, getenv("LLAMA_SPARSEK_STRIDE") ? atoi(getenv("LLAMA_SPARSEK_STRIDE")) : 0);
+    topk_static = std::max(0, getenv("LLAMA_SPARSEK_TOPK")   ? atoi(getenv("LLAMA_SPARSEK_TOPK"))   : 0);
+
+    env_enable_snap = enabled ? 1 : 0;
+    env_win_snap    = win_local;
+    env_stride_snap = stride_glob;
+    env_topk_snap   = topk_static;
+}
+
+void llm_graph_input_sparsek_mask::set_input(const llama_ubatch * ) {
+    if (!enabled || !allow) return;
+
+    GGML_ASSERT(ggml_backend_buffer_is_host(allow->buffer));
+    float * data = (float *) allow->data;
+
+    const int64_t n_stream = allow->ne[3];
+    const int64_t n_rows   = allow->ne[1];
+    const int64_t n_kv     = allow->ne[0];
+
+    std::fill(data, data + ggml_nelements(allow), -INFINITY);
+    GGML_ASSERT(ubatch.pos);
+    GGML_ASSERT(ubatch.n_tokens % n_stream == 0);
+
+    const int64_t n_tps = ubatch.n_tokens / n_stream;
+    for (int64_t s = 0; s < n_stream; ++s) {
+        for (int64_t ii = 0; ii < n_tps; ++ii) {
+            const int64_t i   = s*n_tps + ii;
+            const int64_t row = ii;
+            const int64_t p1  = ubatch.pos[i];
+            float * row_ptr = data + (s*n_rows + row)*n_kv;
+
+            if (win_local > 0) {
+                const int64_t lo = std::max<int64_t>(0, p1 - win_local);
+                const int64_t hi = std::min<int64_t>(n_kv - 1, p1 + win_local);
+                for (int64_t j = lo; j <= hi; ++j) row_ptr[j] = 0.0f;
+            }
+
+            if (stride_glob > 0) {
+                for (int64_t j = 0; j < n_kv; j += stride_glob) row_ptr[j] = 0.0f;
+            }
+
+            if (topk_static > 0) {
+                const int64_t R   = std::min<int64_t>(n_kv - 1, win_local > 0 ? win_local*4 : 1024);
+                const int64_t lo2 = std::max<int64_t>(0, p1 - R);
+                const int64_t hi2 = std::min<int64_t>(n_kv - 1, p1 + R);
+
+                std::vector<std::pair<int64_t,int64_t>> cand;
+                cand.reserve(hi2 - lo2 + 1);
+                for (int64_t j = lo2; j <= hi2; ++j)
+                    cand.emplace_back(j, std::llabs((long long)p1 - (long long)j));
+
+                const size_t K = std::min<size_t>(topk_static, cand.size());
+                std::partial_sort(cand.begin(), cand.begin() + K, cand.end(),
+                                  [](auto &a, auto &b){ return a.second < b.second; });
+                for (size_t k = 0; k < K; ++k) row_ptr[cand[k].first] = 0.0f;
+                if (allow) {
+                    last_ne0 = allow->ne[0]; // n_kv
+                    last_ne1 = allow->ne[1]; // n_rows
+                    last_ne3 = allow->ne[3]; // n_stream
+                }
+                env_enable_snap = enabled ? 1 : 0;
+                env_win_snap    = win_local;
+                env_stride_snap = stride_glob;
+                env_topk_snap   = topk_static;
+            }
+        }
+    }
+}
+
+bool llm_graph_input_sparsek_mask::can_reuse(const llm_graph_params & params) {
+    GGML_UNUSED(params);
+
+    if (!allow) return false;
+
+    int cur_enable = getenv("LLAMA_SPARSEK")        ? 1 : 0;
+    int cur_win    = getenv("LLAMA_SPARSEK_WIN")    ? atoi(getenv("LLAMA_SPARSEK_WIN"))    : 0;
+    int cur_stride = getenv("LLAMA_SPARSEK_STRIDE") ? atoi(getenv("LLAMA_SPARSEK_STRIDE")) : 0;
+    int cur_topk   = getenv("LLAMA_SPARSEK_TOPK")   ? atoi(getenv("LLAMA_SPARSEK_TOPK"))   : 0;
+
+    if (cur_enable != env_enable_snap ||
+        cur_win    != env_win_snap    ||
+        cur_stride != env_stride_snap ||
+        cur_topk   != env_topk_snap) {
+        return false;
+    }
+
+    if (allow->ne[0] != last_ne0 ||
+        allow->ne[1] != last_ne1 ||
+        allow->ne[3] != last_ne3) {
+        return false;
+    }
+
+    return true;
+}
+
 void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv     = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;
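For readers who want to poke at the policy outside a full graph build, here is a minimal standalone sketch of the same per-row logic that set_input() writes into the allow tensor above: a local window around the token position, a global stride pattern, and a distance-based top-K fallback, with 0.0f meaning allowed and -INFINITY meaning blocked. It is not part of this commit; the helper name sparsek_fill_row is made up for the example.

```cpp
// Standalone sketch of the static SparseK row policy (mirrors set_input() above).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <utility>
#include <vector>

static void sparsek_fill_row(float * row, int64_t n_kv, int64_t pos,
                             int win_local, int stride_glob, int topk_static) {
    std::fill(row, row + n_kv, -INFINITY);              // start fully blocked

    if (win_local > 0) {                                // local window: |j - pos| <= win_local
        const int64_t lo = std::max<int64_t>(0, pos - win_local);
        const int64_t hi = std::min<int64_t>(n_kv - 1, pos + win_local);
        for (int64_t j = lo; j <= hi; ++j) row[j] = 0.0f;
    }
    if (stride_glob > 0) {                              // global strided positions
        for (int64_t j = 0; j < n_kv; j += stride_glob) row[j] = 0.0f;
    }
    if (topk_static > 0) {                              // K nearest positions within a search radius
        const int64_t R  = std::min<int64_t>(n_kv - 1, win_local > 0 ? win_local*4 : 1024);
        const int64_t lo = std::max<int64_t>(0, pos - R);
        const int64_t hi = std::min<int64_t>(n_kv - 1, pos + R);
        std::vector<std::pair<int64_t,int64_t>> cand;   // (index, distance to pos)
        cand.reserve(hi - lo + 1);
        for (int64_t j = lo; j <= hi; ++j)
            cand.emplace_back(j, std::llabs((long long)(pos - j)));
        const size_t K = std::min<size_t>(topk_static, cand.size());
        std::partial_sort(cand.begin(), cand.begin() + K, cand.end(),
                          [](const auto & a, const auto & b) { return a.second < b.second; });
        for (size_t k = 0; k < K; ++k) row[cand[k].first] = 0.0f;
    }
}
```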
@@ -600,6 +715,37 @@ void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
     }
 }
 
+// ===[ SPARSEK: graph-time mask fusion ]=======================================
+// If disabled, returns base_mask. Otherwise builds an "allow" mask input node
+// and returns base_mask + allow (logical union with 0.0f / -INFINITY encoding)
+// so that blocked (-INF) entries remain blocked and allowed (0.0f) keep base.
+ggml_tensor * llm_graph_context::maybe_apply_sparsek_mask(ggml_tensor * base_mask,
+                                                          int64_t n_kv,
+                                                          int64_t n_rows,
+                                                          int64_t n_stream) const {
+    const bool enabled = getenv("LLAMA_SPARSEK") != nullptr;
+    if (!enabled) return base_mask;
+
+    auto inp = std::make_unique<llm_graph_input_sparsek_mask>(hparams, cparams, ubatch,
+                                                              static_cast<const llama_kv_cache_context *>(mctx));
+
+    auto & allow = inp->allow;
+    allow = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_rows, 1, n_stream);
+    ggml_set_input(allow);
+    res->add_input(std::move(inp));
+
+    ggml_build_forward_expand(gf, allow);
+
+    ggml_tensor * allow_aligned = allow;
+    if (base_mask->type != GGML_TYPE_F32) {
+        allow_aligned = ggml_cast(ctx0, allow, base_mask->type);
+    }
+
+    // Merge by logical union: allowed=0.0f, blocked=-INF
+    ggml_tensor * merged = ggml_add(ctx0, base_mask, allow_aligned);
+    return merged;
+}
+
 ggml_tensor * llm_graph_context::build_cvec(
          ggml_tensor * cur,
          int il) const {
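The merge at the end of maybe_apply_sparsek_mask() is a plain element-wise add, which works because of the 0.0f / -INFINITY encoding: a KV position stays attendable only when both the base KQ mask and the SparseK allow mask are 0.0f, and anything blocked by either mask ends up at -INF and is removed by the softmax. A toy float example of the same arithmetic (not ggml code, purely illustrative):

```cpp
// Illustration of the additive mask merge with the 0.0f / -INFINITY encoding.
#include <cmath>
#include <cstdio>

int main() {
    const float base[4]  = {0.0f, -INFINITY, 0.0f,      -INFINITY}; // e.g. causal KQ mask
    const float allow[4] = {0.0f, 0.0f,      -INFINITY, -INFINITY}; // SparseK allow-mask
    for (int j = 0; j < 4; ++j) {
        const float merged = base[j] + allow[j];  // same op as ggml_add on the mask tensors
        std::printf("j=%d base=%6.1f allow=%6.1f merged=%6.1f\n", j, base[j], allow[j], merged);
    }
    // Only j=0 stays at 0.0 (attended); every position blocked by either mask stays -inf.
}
```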
@@ -1513,7 +1659,14 @@ ggml_tensor * llm_graph_context::build_attn(
 
     const bool is_swa = hparams.is_swa(il);
 
-    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
+    const auto & kq_mask_base = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
+
+    // no-cache: n_kv == n_tokens, stream = 1
+    const int64_t n_kv     = ubatch.n_tokens;
+    const int64_t n_stream = 1;
+    const int64_t n_rows   = GGML_PAD(ubatch.n_tokens, GGML_KQ_MASK_PAD);
+
+    ggml_tensor * kq_mask = maybe_apply_sparsek_mask(kq_mask_base, n_kv, n_rows, n_stream);
 
     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
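The mask height n_rows in these call sites is padded with GGML_PAD, which rounds its first argument up to the next multiple of the second, so the allow tensor has the same padded row count as the base KQ mask. A toy calculation of that arithmetic (the pad value 64 and the token count are made-up example numbers, not taken from the commit):

```cpp
// Toy illustration of the row padding used for n_rows above.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_tokens = 100;  // assumed example ubatch size
    const int64_t pad      = 64;   // assumed pad value for illustration
    const int64_t n_rows   = (n_tokens + pad - 1) / pad * pad;  // round up to a multiple of pad
    std::printf("n_rows = %lld\n", (long long) n_rows);         // prints 128
}
```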
@@ -1590,14 +1743,13 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_tensor * v_mla,
         float kq_scale,
         int il) const {
+    const llama_kv_cache_context * mctx_cur = inp->mctx; // define once at top
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
     ggml_build_forward_expand(gf, q_cur);
     ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);
 
-    const auto * mctx_cur = inp->mctx;
-
     // store to KV cache
     {
         const auto & k_idxs = inp->get_k_idxs();
@@ -1607,7 +1759,13 @@ ggml_tensor * llm_graph_context::build_attn(
         ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, v_idxs, il));
     }
 
-    const auto & kq_mask = inp->get_kq_mask();
+    const auto & kq_mask_base = inp->get_kq_mask();
+
+    const int64_t n_kv     = mctx_cur->get_n_kv();
+    const int64_t n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+    const int64_t n_rows   = GGML_PAD(ubatch.n_tokens / n_stream, GGML_KQ_MASK_PAD);
+
+    ggml_tensor * kq_mask = maybe_apply_sparsek_mask(kq_mask_base, n_kv, n_rows, n_stream);
 
     ggml_tensor * q = q_cur;
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
@@ -1675,12 +1833,20 @@ ggml_tensor * llm_graph_context::build_attn(
     }
 
     const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
+
+    // --- SparseK: graph-time mask fusion for KV_ISWA ---
+    const int64_t n_kv     = mctx_cur->get_n_kv();
+    const int64_t n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
+    const int64_t n_rows   = GGML_PAD(ubatch.n_tokens / n_stream, GGML_KQ_MASK_PAD);
+
+    ggml_tensor * kq_mask_aug = maybe_apply_sparsek_mask((ggml_tensor *)kq_mask,
+                                                         n_kv, n_rows, n_stream);
 
     ggml_tensor * q = q_cur;
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_aug, sinks, v_mla, kq_scale, il);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1731,11 +1897,19 @@ ggml_tensor * llm_graph_context::build_attn(
 
     const auto & kq_mask = inp->get_kq_mask_cross();
 
+    // --- SparseK: graph-time mask fusion for Cross-Attention ---
+    const int64_t n_kv     = k_cur->ne[0]; // or cross->n_enc,
+    const int64_t n_stream = 1;
+    const int64_t n_rows   = GGML_PAD(ubatch.n_tokens, GGML_KQ_MASK_PAD);
+
+    ggml_tensor * kq_mask_aug = maybe_apply_sparsek_mask((ggml_tensor *)kq_mask,
+                                                         n_kv, n_rows, n_stream);
+
     ggml_tensor * q = q_cur;
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_aug, sinks, v_mla, kq_scale, il);
     cb(cur, "kqv_out", il);
 
     if (wo) {

src/llama-graph.h

Lines changed: 57 additions & 0 deletions
@@ -247,6 +247,52 @@ class llm_graph_input_cross_embd : public llm_graph_input_i {
     const llama_cross * cross;
 };
 
+// ===[ SPARSEK INPUT NODE - DECLARATION ]======================================
+// Provides an "allow-mask" tensor that encodes the SparseK policy
+// (0.0f = allowed, -INFINITY = blocked). The shape must match the KQ mask:
+// [ne0 = n_kv, ne1 = pad(n_tokens_per_stream), ne2 = 1, ne3 = n_stream].
+class llm_graph_input_sparsek_mask : public llm_graph_input_i {
+public:
+    llm_graph_input_sparsek_mask(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_ubatch & ubatch,
+            const llama_kv_cache_context * mctx);
+
+    ~llm_graph_input_sparsek_mask() override = default;
+
+    // Populates the "allow" tensor from ubatch positions based on ENV-driven SparseK policy.
+    // Note: definition is in the .cpp (set_input allocates/fills host-side values).
+    void set_input(const llama_ubatch * ubatch) override;
+
+    // SparseK mask can be reused while the shape/involved streams are unchanged.
+    bool can_reuse(const llm_graph_params & params) override;
+
+    // F32 [n_kv, pad(n_tokens_per_stream), 1, n_stream]
+    ggml_tensor * allow = nullptr;
+
+    // References used to compute the mask
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+    const llama_ubatch  & ubatch;
+    const llama_kv_cache_context * mctx;
+
+    // ENV-driven controls (read in the .cpp)
+    int win_local   = 0;
+    int stride_glob = 0;
+    int topk_static = 0;
+    bool enabled    = false;
+
+    int64_t last_ne0 = -1; // n_kv
+    int64_t last_ne1 = -1; // n_rows (pad(n_tokens_per_stream))
+    int64_t last_ne3 = -1; // n_stream
+
+    int env_enable_snap = 0;
+    int env_win_snap    = 0;
+    int env_stride_snap = 0;
+    int env_topk_snap   = 0;
+};
+
 class llm_graph_input_attn_no_cache : public llm_graph_input_i {
 public:
     llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
@@ -595,6 +641,17 @@ struct llm_graph_context {
     // common
     //
 
+    // Merges a graph-built SparseK "allow" mask into the base KQ mask.
+    // If SparseK is disabled (by ENV), this returns base_mask as-is.
+    // Shapes:
+    //   base_mask : [n_kv, n_rows, 1, n_stream] or [n_tokens, n_rows, 1, n_stream] (no-cache)
+    //   allow     : [n_kv, n_rows, 1, n_stream]
+    // Returned tensor has the same shape as base_mask.
+    ggml_tensor * maybe_apply_sparsek_mask(ggml_tensor * base_mask,
+                                           int64_t n_kv,
+                                           int64_t n_rows,
+                                           int64_t n_stream) const;
+
     ggml_tensor * build_cvec(
             ggml_tensor * cur,
             int il) const;
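Since the whole proof-of-concept is gated on environment variables, read both in the llm_graph_input_sparsek_mask constructor and in maybe_apply_sparsek_mask(), the quickest way to experiment with it is to set those variables before the graph is built. A rough sketch, assuming a POSIX setenv(); the variable names are the ones this commit reads, while the numeric values are arbitrary examples:

```cpp
// Illustration only: enable the SparseK proof-of-concept via its environment knobs.
#include <cstdlib>

static void enable_sparsek_poc() {
    setenv("LLAMA_SPARSEK",        "1",   1); // presence of the variable enables mask fusion
    setenv("LLAMA_SPARSEK_WIN",    "64",  1); // local window half-width (0 disables this rule)
    setenv("LLAMA_SPARSEK_STRIDE", "128", 1); // keep every 128th KV position globally
    setenv("LLAMA_SPARSEK_TOPK",   "32",  1); // additionally keep the 32 nearest positions
}
```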
