@@ -94,7 +94,6 @@ def forward(
    ) -> torch.Tensor:
        bsz, q_len, _ = hidden_states.size()
        query = self.q_proj_swiftkv(hidden_states)
-
        # Reshape the query, key, and value tensors.
        query_states = query.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
@@ -107,10 +106,9 @@ def forward(
                    "with a layer index."
                )
            kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+            cache_kwargs = {"position_ids": position_ids, "batch_index": batch_index}
+            key_states, value_states = past_key_value.read_only(self.layer_idx, cache_kwargs=cache_kwargs)

-            key_states, value_states = past_key_value.read_only(
-                self.layer_idx, position_ids=position_ids, batch_index=batch_index
-            )
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        position_ids = position_ids[torch.arange(bsz), position_ids.to(torch.int32).argmax(1)].unsqueeze(1)
        query_states, _ = qeff_apply_rotary_pos_emb(
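The hunk above folds `position_ids` and `batch_index` into a single `cache_kwargs` dict passed to the cache's `read_only` call, in the spirit of the `cache_kwargs` convention used by `transformers`-style caches. Below is a minimal sketch of that calling convention; `ToyCache` and its `read_only` body are hypothetical stand-ins, not QEfficient's actual cache implementation.

```python
import torch


class ToyCache:
    """Hypothetical stand-in cache, only to illustrate the cache_kwargs calling convention."""

    def __init__(self, keys, values):
        # One key/value tensor per layer, shaped [batch, heads, seq_len, head_dim].
        self.keys, self.values = keys, values

    def read_only(self, layer_idx, cache_kwargs=None):
        # A real implementation would use cache_kwargs["position_ids"] / cache_kwargs["batch_index"]
        # to gather the relevant cache rows; this toy version just returns the stored tensors.
        return self.keys[layer_idx], self.values[layer_idx]


cache = ToyCache([torch.randn(1, 8, 16, 64)], [torch.randn(1, 8, 16, 64)])
cache_kwargs = {"position_ids": torch.arange(16).unsqueeze(0), "batch_index": None}
key_states, value_states = cache.read_only(0, cache_kwargs=cache_kwargs)
print(key_states.shape, value_states.shape)
```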
@@ -121,10 +119,8 @@ def forward(
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
-
        if attention_mask is not None:  # no matter the length, we just slice it
            attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights)
-
        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        # attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
@@ -148,7 +144,6 @@ def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_key_value_heads = config.num_key_value_heads
-
        self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -343,7 +338,6 @@ def forward(

        bsz, q_len, _ = hidden_states.size()
        swiftkv_hidden_states = self.norm_swiftkv(hidden_states)
-
        ####################################
        ## THE MAGIC OF SWIFT KV BEGINS HERE
        ####################################
@@ -374,24 +368,30 @@ def forward(
        last_pos_id = position_ids.to(torch.int32).argmax(1, keepdim=True)
        orig_hidden_states = hidden_states

-        hidden_states = orig_hidden_states[torch.arange(bsz), last_pos_id, :]
-
-        causal_mask = causal_mask[torch.arange(bsz), :, last_pos_id, :]
+        # Extract only the last valid position id to be processed by the self-attn of the remaining half of the layers, as the KV cache is already filled.
+        if batch_index is not None:
+            hidden_states = orig_hidden_states[batch_index, last_pos_id, :]
+            causal_mask = causal_mask[batch_index, :, last_pos_id, :]
+        else:
+            hidden_states = orig_hidden_states[torch.arange(bsz), last_pos_id, :]
+            causal_mask = causal_mask[torch.arange(bsz), :, last_pos_id, :]

        hidden_states, next_decoder_cache = self._run_swiftkv_layers(
            hidden_states, position_ids, past_key_values, causal_mask, batch_index
        )
-
-        orig_hidden_states[torch.arange(bsz), last_pos_id, :] = hidden_states
+        # We could write the processed hidden_states back into orig_hidden_states here, but it is not needed: for next-token prediction
+        # we only need the hidden_states at the last valid position indices.
+        # Here the shape of hidden_states is [batch_size, 1, hidden_dim] instead of [batch_size, seq_len, hidden_dim].
+        # This avoids unnecessary data movement on device.
        ####################################
        ## THE MAGIC OF SWIFT KV ENDS HERE
        ####################################

        next_cache = next_decoder_cache.to_legacy_cache()
-        return orig_hidden_states, next_cache
+        return hidden_states, next_cache


-class LlamaSwiftKVForCausalLM(PreTrainedModel):
+class LlamaSwiftKVForCausalLM(PreTrainedModel):  #
    config_class = LlamaSwiftKVConfig

    def __init__(self, config: LlamaSwiftKVConfig):
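The change above runs only the last valid token of each sequence through the remaining (SwiftKV) half of the layers and returns that single-position tensor directly, instead of scattering it back into `orig_hidden_states`. A minimal sketch of the gather, assuming a `-1` padding convention for `position_ids` and illustrative sizes; it uses the `.view(-1, 1)` batch-index form (as in the removed `LlamaSwiftKVForCausalLM` lines below) so the advanced indexing broadcasts to `[batch_size, 1, hidden_dim]`.

```python
import torch

bsz, seq_len, hidden = 2, 8, 16                  # illustrative sizes, not from the PR
hidden_states = torch.randn(bsz, seq_len, hidden)

# position_ids grow monotonically over valid tokens (padding assumed to be -1 here),
# so the argmax index is the last valid position of each sequence.
position_ids = torch.tensor([[0, 1, 2, 3, 4, -1, -1, -1],
                             [0, 1, 2, 3, 4,  5,  6,  7]])
last_pos_id = position_ids.to(torch.int32).argmax(1, keepdim=True)  # [bsz, 1]

# Gather only the last valid position per sequence: the result is [bsz, 1, hidden],
# so the SwiftKV half of the layers attends for a single query token per sequence.
batch_idx = torch.arange(bsz).view(-1, 1)        # [bsz, 1]
last_hidden = hidden_states[batch_idx, last_pos_id]
assert last_hidden.shape == (bsz, 1, hidden)
```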
@@ -412,8 +412,6 @@ def forward(
        batch_index: Optional[torch.LongTensor] = None,
    ):
        hidden_states, output_past_key_values = self.model(input_ids, position_ids, past_key_values, batch_index)
-        logit_index = position_ids.to(torch.int32).argmax(1, keepdim=True)
-        hidden_states = hidden_states[torch.arange(position_ids.shape[0]).view(-1, 1), logit_index]
        logits = self.lm_head(hidden_states)
        return CausalLMOutputWithPast(
            loss=None,
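Since the model now returns hidden states of shape `[batch_size, 1, hidden_dim]` (only the last valid position), the `logit_index` gather removed above is redundant and `lm_head` is applied directly. A rough shape-flow sketch with made-up sizes and a plain `nn.Linear` standing in for the real `lm_head`:

```python
import torch
import torch.nn as nn

bsz, hidden, vocab = 2, 16, 32                  # illustrative sizes
lm_head = nn.Linear(hidden, vocab, bias=False)  # stand-in for the model's lm_head

last_hidden = torch.randn(bsz, 1, hidden)       # what the model's forward now returns
logits = lm_head(last_hidden)                   # [bsz, 1, vocab]: one next-token distribution per sequence
assert logits.shape == (bsz, 1, vocab)
```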