@@ -194,14 +194,14 @@ def __init__(self, model_args: TransformerModelArgs):
             model_args.n_heads * self.head_dim, model_args.dim, bias=False
         )
 
-        self.use_flex_attn = model_args.attention_type == "flex"
-        self.use_varlen_attn = model_args.attention_type == "varlen"
-        if self.use_flex_attn:
-            self.inner_attention = FlexAttentionWrapper()
-        elif self.use_varlen_attn:
-            self.inner_attention = VarlenAttentionWrapper()
-        else:
-            self.inner_attention = ScaledDotProductAttentionWrapper()
+        self.attn_type = model_args.attention_type
+        match self.attn_type:
+            case "flex":
+                self.inner_attention = FlexAttentionWrapper()
+            case "varlen":
+                self.inner_attention = VarlenAttentionWrapper()
+            case _:
+                self.inner_attention = ScaledDotProductAttentionWrapper()
 
     def init_weights(self, init_std: float):
         for linear in (self.wq, self.wk, self.wv):
@@ -246,22 +246,23 @@ def forward(
         xk = keys.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
         xv = values.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
 
-        if self.use_flex_attn:
-            assert isinstance(attention_masks, BlockMask), attention_masks
-            output = self.inner_attention(xq, xk, xv, block_mask=attention_masks)
-        elif self.use_varlen_attn:
-            assert isinstance(attention_masks, VarlenMetadata), attention_masks
-            output = self.inner_attention(
-                xq,
-                xk,
-                xv,
-                self.head_dim,
-                attention_masks,
-                is_causal=True,
-            )
-        else:
-            assert attention_masks is None
-            output = self.inner_attention(xq, xk, xv)
+        match self.attn_type:
+            case "flex":
+                assert isinstance(attention_masks, BlockMask), attention_masks
+                output = self.inner_attention(xq, xk, xv, block_mask=attention_masks)
+            case "varlen":
+                assert isinstance(attention_masks, VarlenMetadata), attention_masks
+                output = self.inner_attention(
+                    xq,
+                    xk,
+                    xv,
+                    self.head_dim,
+                    attention_masks,
+                    is_causal=True,
+                )
+            case _:
+                assert attention_masks is None
+                output = self.inner_attention(xq, xk, xv)
 
         output = output.transpose(
             1, 2
@@ -468,7 +469,7 @@ def _precompute_freqs_cis(self) -> torch.Tensor:
     def _get_flex_attention_masks(
         self,
         input_batch: torch.Tensor,
-        eos_id: int,
+        tokenizer: BaseTokenizer,
         extra_inputs: dict[str, torch.Tensor] | None = None,
     ) -> AttentionMasksType:
         mask_mods = [get_causal_mask_mod()]
@@ -478,7 +479,7 @@ def _get_flex_attention_masks(
                 B = 1
             case "block_causal":
                 B = input_batch.shape[0]
-                mask_mods.append(get_document_mask_mod(input_batch, eos_id))
+                mask_mods.append(get_document_mask_mod(input_batch, tokenizer.eos_id))
             case _:
                 raise ValueError(
                     f"Unknown attention mask type: {self.model_args.attn_mask_type}"
@@ -505,7 +506,7 @@ def get_attention_masks(
         match self.model_args.attention_type:
             case "flex":
                 return self._get_flex_attention_masks(
-                    input_batch, tokenizer.eos_id, extra_inputs
+                    input_batch, tokenizer, extra_inputs
                 )
             case "varlen":
                 return self._get_varlen_attention_masks(