Commit caafc81

attention_type

1 parent de416f9

File tree: 7 files changed (+110 lines, -61 lines)


torchtitan/models/attention.py

Lines changed: 25 additions & 10 deletions
@@ -55,18 +55,32 @@ class VarlenAttentionWrapper(torch.nn.Module):
 
     def forward(
         self,
-        q: torch.Tensor,
-        k: torch.Tensor,
-        v: torch.Tensor,
-        cu_seq_q: torch.Tensor,
-        cu_seq_k: torch.Tensor,
-        max_q: int,
-        max_k: int,
+        xq: torch.Tensor,
+        xk: torch.Tensor,
+        xv: torch.Tensor,
+        head_dim: torch.Tensor,
+        attention_masks: VarlenMetadata,
         is_causal: bool = True,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        cu_seq_q = attention_masks.cu_seq_q
+        cu_seq_k = attention_masks.cu_seq_k
+        max_q = attention_masks.max_q
+        max_k = attention_masks.max_k
+
+        n_local_heads = xq.shape[1]
+        xq_packed = xq.transpose(1, 2).contiguous().view(-1, n_local_heads, head_dim)
+        xk_packed = xk.transpose(1, 2).contiguous().view(-1, n_local_heads, head_dim)
+        xv_packed = xv.transpose(1, 2).contiguous().view(-1, n_local_heads, head_dim)
 
         return VarlenAttentionWrapper._compiled_varlen_attn(
-            q, k, v, cu_seq_q, cu_seq_k, max_q, max_k, is_causal=True
+            xq_packed,
+            xk_packed,
+            xv_packed,
+            cu_seq_q,
+            cu_seq_k,
+            max_q,
+            max_k,
+            is_causal=True,
         )
 
 
@@ -104,7 +118,6 @@ def forward(
         # `FlexAttentionWrapper._compiled_flex_attn` is correct.
         # 3. Used `return_lse` instead of `return_aux` because of easier TP module notation
         # to convert `lse` to be DTensor.
-
         return FlexAttentionWrapper._compiled_flex_attn(
             q,
             k,
@@ -266,7 +279,9 @@ def create_attention_mask(*args, **kwargs):
     return _compiled_create_block_mask(*args, **kwargs)
 
 
-def create_varlen_cu_seqs(input_batch: torch.Tensor, eos_id: int) -> VarlenMetadata:
+def create_varlen_metadata_for_document(
+    input_batch: torch.Tensor, eos_id: int
+) -> VarlenMetadata:
     """
     Creates cumulative sequence length indices needed for variable length attention
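
For context on the varlen path: the wrapper now receives q/k/v in [bs, n_heads, seq_len, head_dim] layout plus a VarlenMetadata bundle, flattens the tensors into packed [total_tokens, n_heads, head_dim] views, and hands the cumulative sequence lengths to the compiled kernel. Below is a minimal, standalone sketch of that packing step and of how EOS boundaries become cumulative lengths; the VarlenMetadata dataclass and helper names here are stand-ins for illustration, not the torchtitan implementations.

from dataclasses import dataclass

import torch


@dataclass
class VarlenMetadata:  # stand-in mirroring the fields used in the diff above
    cu_seq_q: torch.Tensor
    cu_seq_k: torch.Tensor
    max_q: int
    max_k: int


def cu_seqlens_from_eos(input_batch: torch.Tensor, eos_id: int) -> VarlenMetadata:
    # Treat the batch as one packed token stream; every EOS token closes a document.
    tokens = input_batch.flatten()
    ends = (tokens == eos_id).nonzero(as_tuple=True)[0] + 1
    cu = torch.cat(
        [
            torch.zeros(1, dtype=torch.int32, device=tokens.device),
            ends.to(torch.int32),
            torch.tensor([tokens.numel()], dtype=torch.int32, device=tokens.device),
        ]
    )
    cu = torch.unique_consecutive(cu)  # dedupe if the stream already ends with EOS
    max_len = int((cu[1:] - cu[:-1]).max())
    return VarlenMetadata(cu_seq_q=cu, cu_seq_k=cu, max_q=max_len, max_k=max_len)


def pack_heads(x: torch.Tensor, head_dim: int) -> torch.Tensor:
    # [bs, n_heads, seq_len, head_dim] -> [bs * seq_len, n_heads, head_dim]
    n_local_heads = x.shape[1]
    return x.transpose(1, 2).contiguous().view(-1, n_local_heads, head_dim)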

torchtitan/models/llama3/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -36,7 +36,7 @@
         n_heads=16,
         vocab_size=2048,
         rope_theta=500000,
-        use_flex_attn=True,
+        attention_type="flex",
         attn_mask_type="block_causal",
     ),
     "debugmodel_varlen_attn": TransformerModelArgs(
@@ -45,7 +45,7 @@
         n_heads=16,
         vocab_size=2048,
         rope_theta=500000,
-        use_varlen_attn=True,
+        attention_type="varlen",
     ),
     "8B": TransformerModelArgs(
         dim=4096,
@@ -64,7 +64,7 @@
         ffn_dim_multiplier=1.3,
         multiple_of=1024,
         rope_theta=500000,
-        use_flex_attn=True,
+        attention_type="flex",
         attn_mask_type="block_causal",
     ),
     "8B_varlen": TransformerModelArgs(
@@ -75,7 +75,7 @@
         ffn_dim_multiplier=1.3,
         multiple_of=1024,
         rope_theta=500000,
-        use_varlen_attn=True,
+        attention_type="varlen",
     ),
     "70B": TransformerModelArgs(
         dim=8192,
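
These hunks show the config migration: the mutually exclusive booleans collapse into a single selector field. A hypothetical before/after sketch, assuming TransformerModelArgs is importable from the llama3 package as laid out in this tree and that unspecified fields keep their defaults:

from torchtitan.models.llama3 import TransformerModelArgs

# Before: use_flex_attn=True / use_varlen_attn=True picked the backend.
# After: a single attention_type literal does.
flex_args = TransformerModelArgs(
    n_heads=16,
    vocab_size=2048,
    rope_theta=500000,
    attention_type="flex",
    attn_mask_type="block_causal",
)
varlen_args = TransformerModelArgs(
    n_heads=16,
    vocab_size=2048,
    rope_theta=500000,
    attention_type="varlen",
)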

torchtitan/models/llama3/infra/parallelize.py

Lines changed: 3 additions & 1 deletion
@@ -67,7 +67,9 @@ def parallelize_llama(
         ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}).
         """
 
-    use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
+    # use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
+    attn_type = getattr(model.model_args, "attention_type", False)
+    use_flex_attn = attn_type == "flex"
     if job_config.parallelism.context_parallel_degree > 1 and use_flex_attn:
         raise NotImplementedError("CP support for FlexAttention is still in progress.")
 

torchtitan/models/llama3/model/args.py

Lines changed: 8 additions & 4 deletions
@@ -9,8 +9,9 @@
 
 from dataclasses import dataclass, field
 
-from torch import nn
+from typing import Literal
 
+from torch import nn
 from torchtitan.config import JobConfig
 from torchtitan.models.utils import get_dense_model_nparams_and_flops
 from torchtitan.protocols.model import BaseModelArgs
@@ -43,8 +44,8 @@ class TransformerModelArgs(BaseModelArgs):
     # `False`, each uses the total number of transformer blocks
     depth_init: bool = True
 
-    use_flex_attn: bool = False
-    use_varlen_attn: bool = False
+    attention_type: Literal["flex", "varlen"] = None
+    # use_flex_attn: bool = True
     attn_mask_type: str = "causal"
     eos_id: int = 0
 
@@ -56,7 +57,10 @@ def update_from_config(self, job_config: JobConfig, **kwargs) -> None:
             )
         self.max_seq_len = seq_len
 
-        if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn:
+        if (
+            job_config.parallelism.context_parallel_degree > 1
+            and self.attention_type == "flex"
+        ):
             raise NotImplementedError(
                 "CP support for FlexAttention is still in progress."
             )

torchtitan/models/llama3/model/model.py

Lines changed: 45 additions & 39 deletions
@@ -16,12 +16,13 @@
 from torchtitan.components.tokenizer import BaseTokenizer
 from torchtitan.models.attention import (
     create_attention_mask,
-    create_varlen_cu_seqs,
+    create_varlen_metadata_for_document,
     FlexAttentionWrapper,
     get_causal_mask_mod,
     get_document_mask_mod,
     ScaledDotProductAttentionWrapper,
     VarlenAttentionWrapper,
+    VarlenMetadata,
 )
 from torchtitan.protocols.model import AttentionMasksType
 from torchtitan.protocols.train_spec import ModelProtocol
@@ -193,8 +194,8 @@ def __init__(self, model_args: TransformerModelArgs):
             model_args.n_heads * self.head_dim, model_args.dim, bias=False
         )
 
-        self.use_flex_attn = model_args.use_flex_attn
-        self.use_varlen_attn = model_args.use_varlen_attn
+        self.use_flex_attn = model_args.attention_type == "flex"
+        self.use_varlen_attn = model_args.attention_type == "varlen"
         if self.use_flex_attn:
             self.inner_attention = FlexAttentionWrapper()
         elif self.use_varlen_attn:
@@ -212,7 +213,6 @@ def forward(
         x: torch.Tensor,
         freqs_cis: torch.Tensor,
         attention_masks: AttentionMasksType | None,
-        **kwargs,
     ):
         """
         Forward pass of the attention module.
@@ -250,30 +250,13 @@ def forward(
             assert isinstance(attention_masks, BlockMask), attention_masks
             output = self.inner_attention(xq, xk, xv, block_mask=attention_masks)
         elif self.use_varlen_attn:
-            cu_seq_q = attention_masks.cu_seq_q
-            cu_seq_k = attention_masks.cu_seq_k
-            max_q = attention_masks.max_q
-            max_k = attention_masks.max_k
-
-            n_local_heads = xq.shape[1]
-            xq_packed = (
-                xq.transpose(1, 2).contiguous().view(-1, n_local_heads, self.head_dim)
-            )
-            xk_packed = (
-                xk.transpose(1, 2).contiguous().view(-1, n_local_heads, self.head_dim)
-            )
-            xv_packed = (
-                xv.transpose(1, 2).contiguous().view(-1, n_local_heads, self.head_dim)
-            )
-
+            assert isinstance(attention_masks, VarlenMetadata), attention_masks
             output = self.inner_attention(
-                xq_packed,
-                xk_packed,
-                xv_packed,
-                cu_seq_q,
-                cu_seq_k,
-                max_q,
-                max_k,
+                xq,
+                xk,
+                xv,
+                self.head_dim,
+                attention_masks,
                 is_causal=True,
             )
         else:
@@ -375,7 +358,6 @@ def forward(
         x: torch.Tensor,
         freqs_cis: torch.Tensor,
         attention_masks: AttentionMasksType | None,
-        **kwargs,
     ):
         """
         Perform a forward pass through the TransformerBlock.
@@ -388,9 +370,7 @@ def forward(
             torch.Tensor: Output tensor after applying attention and feedforward layers.
 
         """
-        h = x + self.attention(
-            self.attention_norm(x), freqs_cis, attention_masks, **kwargs
-        )
+        h = x + self.attention(self.attention_norm(x), freqs_cis, attention_masks)
         out = h + self.feed_forward(self.ffn_norm(h))
         return out
 
@@ -485,34 +465,61 @@ def _precompute_freqs_cis(self) -> torch.Tensor:
             self.model_args.rope_scaling_args,
         )
 
-    def get_attention_masks(
+    def _get_flex_attention_masks(
         self,
         input_batch: torch.Tensor,
-        tokenizer: BaseTokenizer,
+        eos_id: int,
         extra_inputs: dict[str, torch.Tensor] | None = None,
     ) -> AttentionMasksType:
         mask_mods = [get_causal_mask_mod()]
-        if self.model_args.use_varlen_attn:
-            return create_varlen_cu_seqs(input_batch, tokenizer.eos_id)
+
         match self.model_args.attn_mask_type:
             case "causal":
                 B = 1
             case "block_causal":
                 B = input_batch.shape[0]
-                mask_mods.append(get_document_mask_mod(input_batch, tokenizer.eos_id))
+                mask_mods.append(get_document_mask_mod(input_batch, eos_id))
             case _:
                 raise ValueError(
                     f"Unknown attention mask type: {self.model_args.attn_mask_type}"
                 )
+
         return create_attention_mask(
             and_masks(*mask_mods), B, None, input_batch.shape[1], input_batch.shape[1]
         )
 
+    def _get_varlen_attention_masks(
+        self,
+        input_batch: torch.Tensor,
+        eos_id: int,
+        extra_inputs: dict[str, torch.Tensor] | None = None,
+    ) -> AttentionMasksType:
+        return create_varlen_metadata_for_document(input_batch, eos_id)
+
+    def get_attention_masks(
+        self,
+        input_batch: torch.Tensor,
+        tokenizer: BaseTokenizer,
+        extra_inputs: dict[str, torch.Tensor] | None = None,
+    ) -> AttentionMasksType:
+        match self.model_args.attention_type:
+            case "flex":
+                return self._get_flex_attention_masks(
+                    input_batch, tokenizer.eos_id, extra_inputs
+                )
+            case "varlen":
+                return self._get_varlen_attention_masks(
+                    input_batch, tokenizer.eos_id, extra_inputs
+                )
+            case _:
+                raise NotImplementedError(
+                    "Only varlen and flex attn masks are supported"
+                )
+
     def forward(
         self,
         tokens: torch.Tensor,
         attention_masks: AttentionMasksType | None = None,
-        **kwargs,
     ):
         """
         Perform a forward pass through the Transformer model.
@@ -531,8 +538,7 @@ def forward(
         h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens
 
         for layer in self.layers.values():
-            h = layer(h, self.freqs_cis, attention_masks=attention_masks, **kwargs)
-
+            h = layer(h, self.freqs_cis, attention_masks=attention_masks)
         h = self.norm(h) if self.norm else h
         output = self.output(h) if self.output else h
         return output
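
The flex branch above assembles its BlockMask from a causal mask_mod, plus a document mask_mod when attn_mask_type is "block_causal". Here is a self-contained sketch of the same idea built directly with PyTorch's flex-attention utilities; torchtitan's get_causal_mask_mod, get_document_mask_mod, and create_attention_mask wrappers are replaced by inline equivalents for illustration only.

import torch
from torch.nn.attention.flex_attention import and_masks, create_block_mask

eos_id = 0
# One packed row holding documents [5 7 0], [9 3 0], and a trailing [2 4].
input_batch = torch.tensor([[5, 7, 0, 9, 3, 0, 2, 4]])

# Assign a document id to every position; each EOS stays in the document it closes.
is_eos = (input_batch[0] == eos_id).to(torch.int64)
doc_id = is_eos.cumsum(dim=-1) - is_eos


def causal(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx


def document(b, h, q_idx, kv_idx):
    return doc_id[q_idx] == doc_id[kv_idx]


B, S = input_batch.shape  # "block_causal" uses the real batch size; "causal" uses B=1
block_mask = create_block_mask(and_masks(causal, document), B, None, S, S, device="cpu")
print(block_mask)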

torchtitan/protocols/model.py

Lines changed: 18 additions & 0 deletions
@@ -71,3 +71,21 @@ def get_attention_masks(
         raise NotImplementedError(
             "This model does not support attention masking/Flex Attention."
         )
+
+    def _get_varlen_attention_masks(
+        self,
+        input_batch: torch.Tensor,
+        eos_id: int,
+        extra_inputs: dict[str, torch.Tensor] | None = None,
+    ) -> AttentionMasksType:
+        raise NotImplementedError(
+            "This model does not support variable length attention."
+        )
+
+    def _get_flex_attention_masks(
+        self,
+        input_batch: torch.Tensor,
+        eos_id: int,
+        extra_inputs: dict[str, torch.Tensor] | None = None,
+    ) -> AttentionMasksType:
+        raise NotImplementedError("This model does not support flex attention.")

torchtitan/train.py

Lines changed: 7 additions & 3 deletions
@@ -454,9 +454,13 @@ def post_dataloading_process(
         # extra_kwargs are.
         extra_kwargs: dict[str, Any] = {}
 
-        if getattr(self.model_args, "use_flex_attn", False) or getattr(
-            self.model_args, "use_varlen_attn", False
-        ):
+        attn_type = getattr(self.model_args, "attention_type", False)
+        use_varlen_attn = attn_type == "varlen"
+        use_flex_attn = (
+            getattr(self.model_args, "use_flex_attn", False) or attn_type == "flex"
+        )
+
+        if use_flex_attn or use_varlen_attn:
             extra_kwargs["attention_masks"] = self.model_parts[0].get_attention_masks(
                 input_batch=inputs,
                 tokenizer=self.tokenizer,
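
This guard keeps reading the legacy use_flex_attn flag while preferring the new attention_type selector, so model args that have not migrated still take the mask-building path. A small standalone sketch of the same logic against a stand-in model_args object (SimpleNamespace here is illustrative, not what train.py uses):

from types import SimpleNamespace

# Stand-in for self.model_args; older model args may only expose use_flex_attn.
model_args = SimpleNamespace(attention_type="varlen")

attn_type = getattr(model_args, "attention_type", False)
use_varlen_attn = attn_type == "varlen"
use_flex_attn = getattr(model_args, "use_flex_attn", False) or attn_type == "flex"

if use_flex_attn or use_varlen_attn:
    print("get_attention_masks() will be called for this batch")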
