Commit ab033dd

remove use_flex for all other models
1 parent 4d80f4e commit ab033dd

File tree

26 files changed

+116
-143
lines changed

26 files changed

+116
-143
lines changed

torchtitan/distributed/activation_checkpoint.py

Lines changed: 5 additions & 5 deletions
@@ -236,7 +236,7 @@ def _apply_ac_to_transformer_block(
     *,
     base_fqn: str | None = None,
     model_compile_enabled: bool = False,
-    attn_type: str = "sdpa",
+    use_flex_attn: bool = False,
     op_sac_save_list: set[torch._ops.OpOverload] | None = None,
 ) -> nn.Module:
     valid_ac_modes = ("full", "selective")
@@ -259,7 +259,7 @@ def _apply_ac_to_transformer_block(

     if use_op_sac:
         op_sac_save_list = op_sac_save_list or set()
-        if attn_type == "flex":
+        if use_flex_attn:
             """
             For Flex Attention, we need to apply SAC carefully to avoid invalidating
             torch.compile. Any torch.compile inside the SAC region will be ignored,
@@ -288,7 +288,7 @@ def apply_ac(
     ac_config: ACConfig,
     *,
     model_compile_enabled: bool = False,
-    attn_type: str = "sdpa",
+    use_flex_attn: bool = False,
     op_sac_save_list: set[torch._ops.OpOverload] | None = None,
     base_folder: str = "",
 ) -> None:
@@ -302,7 +302,7 @@ def apply_ac(
         model (nn.Module): The model to apply activation checkpointing to.
         ac_config (ACConfig): The activation checkpointing config.
         model_compile_enabled (bool): Whether torch.compile is enabled for the model.
-        attn_type (str): Attention type (one of [sdpa, varlen, flex])
+        use_flex_attn (bool): Whether flex attention is enabled for the model.
         op_sac_save_list (set[torch._ops.OpOverload]): The list of ops to save instead
             of recomputing.
     Returns:
@@ -326,7 +326,7 @@ def apply_ac(
             ac_config,
             base_fqn=f"layers.{layer_id}",
             model_compile_enabled=model_compile_enabled,
-            attn_type=attn_type,
+            use_flex_attn=use_flex_attn,
             op_sac_save_list=op_sac_save_list,
         )
         model.layers.register_module(layer_id, transformer_block)
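
Taken together with the parallelize changes below, the pattern is: model args now expose a string attn_type, while apply_ac() keeps a boolean use_flex_attn, so callers derive one from the other. A minimal sketch of that calling convention, assuming model, job_config, model_compile_enabled, and op_sac_save_list are already in scope and the ACConfig lives at job_config.activation_checkpoint, as in the diffs below:

from torchtitan.distributed.activation_checkpoint import apply_ac

# Model args carry the string; the AC helper takes the boolean.
attn_type = getattr(model.model_args, "attn_type", "sdpa")
use_flex_attn = attn_type == "flex"

if job_config.activation_checkpoint.mode != "none":
    apply_ac(
        model,
        job_config.activation_checkpoint,
        model_compile_enabled=model_compile_enabled,
        use_flex_attn=use_flex_attn,
        op_sac_save_list=op_sac_save_list,
    )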

torchtitan/experiments/forge/example_train.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def forward_backward_step(
         inputs = input_dict["input"]
         extra_kwargs = {}

-        if getattr(self.model_args, "use_flex_attn", False):
+        if getattr(self.model_args, "attn_type", "sdpa") == "flex":
             extra_kwargs["attention_masks"] = model_parts[0].get_attention_masks(
                 input_batch=inputs,
                 tokenizer=self.tokenizer,

torchtitan/experiments/gpt_oss/infra/parallelize.py

Lines changed: 2 additions & 4 deletions
@@ -62,10 +62,6 @@ def parallelize_gptoss(
         ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}).
     """

-    use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
-    if job_config.parallelism.context_parallel_degree > 1 and use_flex_attn:
-        raise NotImplementedError("CP support for FlexAttention is still in progress.")
-
     if parallel_dims.tp_enabled:
         if (
             job_config.parallelism.enable_async_tensor_parallel
@@ -111,6 +107,8 @@ def parallelize_gptoss(
         job_config.compile.enable and "model" in job_config.compile.components
     )

+    attn_type = getattr(model.model_args, "attn_type", "sdpa")
+    use_flex_attn = attn_type == "flex"
     if job_config.activation_checkpoint.mode != "none":
         apply_ac(
             model,

torchtitan/experiments/gpt_oss/model/args.py

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@ class GptOssModelArgs(BaseModelArgs):
         n_kv_heads (int): Number of key-value heads.
         sliding_window_size (int): Size of the sliding attention window.
         attn_mask_type (str): Type of basic attention mask.
-        use_flex_attn (bool): Whether to use FlexAttention. Only supports True.
+        attn_type (str): Attention type. Only supports "flex".
         original_seq_len (int): Original sequence length.
         rope_theta (float): Base for rotary positional encoding.
         rope_factor (float): Scaling factor for extended sequence lengths.
@@ -64,7 +64,7 @@ class GptOssModelArgs(BaseModelArgs):
     n_kv_heads: int = 8
     sliding_window_size: int = 128
     attn_mask_type: str = "causal"
-    use_flex_attn: bool = True # NOTE: gpt-oss only support FlexAttention
+    attn_type: str = "flex" # NOTE: gpt-oss only supports FlexAttention
     # yarn
     original_seq_len: int = 4096
     rope_theta: float = 150000.0
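
Because gpt-oss only supports FlexAttention, the derived boolean is always true for this model. A small illustrative sketch, assuming GptOssModelArgs can be constructed from its defaults as the field list above suggests:

from torchtitan.experiments.gpt_oss.model.args import GptOssModelArgs

args = GptOssModelArgs()                  # attn_type defaults to "flex"
use_flex_attn = args.attn_type == "flex"  # same derivation used in parallelize_gptoss above
assert use_flex_attn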

torchtitan/experiments/simple_fsdp/deepseek_v3/parallelize.py

Lines changed: 2 additions & 4 deletions
@@ -67,9 +67,9 @@ def parallelize_deepseekv3(

     if (
         job_config.parallelism.context_parallel_degree > 1
-        and model.model_args.use_flex_attn
+        and model.model_args.attn_type != "sdpa"
     ):
-        raise NotImplementedError("CP support for FlexAttention is still in progress.")
+        raise NotImplementedError("CP is only supported with SDPA.")

     if parallel_dims.tp_enabled:
         enable_float8_linear = "float8" in job_config.model.converters
@@ -85,13 +85,11 @@ def parallelize_deepseekv3(
                 "Currently, float8 tensorwise TP is not tested for deepseekv3"
             )

-        use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
         apply_non_moe_tp(
             model,
             world_mesh["tp"],
             loss_parallel=not job_config.parallelism.disable_loss_parallel,
             enable_float8_tensorwise_tp=False,
-            use_flex_attn=use_flex_attn,
         )
         maybe_enable_async_tp(job_config, world_mesh["tp"])

torchtitan/experiments/simple_fsdp/llama3/parallelize.py

Lines changed: 2 additions & 1 deletion
@@ -102,7 +102,8 @@ def parallelize_llama(
         maybe_enable_async_tp(job_config, tp_mesh)

     if job_config.activation_checkpoint.mode != "none":
-        use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
+        attn_type = getattr(model.model_args, "attn_type", "sdpa")
+        use_flex_attn = attn_type == "flex"
         model_compile_enabled = (
             job_config.compile.enable and "model" in job_config.compile.components
         )

torchtitan/experiments/vlm/infra/parallelize.py

Lines changed: 4 additions & 3 deletions
@@ -48,16 +48,17 @@ def parallelize_vlm(
         Sequence length {job_config.training.seq_len} must be divisible by the product of TP degree
         ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}).
     """
-    use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
-    if job_config.parallelism.context_parallel_degree > 1 and use_flex_attn:
-        raise NotImplementedError("CP support for FlexAttention is still in progress.")
+    attn_type = getattr(model.model_args, "attn_type", "sdpa")
+    if job_config.parallelism.context_parallel_degree > 1 and attn_type != "sdpa":
+        raise NotImplementedError("CP is only supported with SDPA.")

     if parallel_dims.tp_enabled:
         raise NotImplementedError("TP support for VLM training is still in progress.")

     model_compile_enabled = (
         job_config.compile.enable and "model" in job_config.compile.components
     )
+    use_flex_attn = attn_type == "flex"
     if job_config.activation_checkpoint.mode != "none":
         apply_ac(
             model,

torchtitan/experiments/vlm/model/args.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ class Siglip2ModelArgs:
     spatial_merge_size: int = 1

     layer_norm_eps: float = 1e-6
-    use_flex_attn: bool = True
+    attn_type: str = "flex"
     attn_mask_type: str = "causal"

torchtitan/models/attention.py

Lines changed: 4 additions & 4 deletions
@@ -60,17 +60,16 @@ def forward(
         xv: torch.Tensor,
         head_dim: torch.Tensor,
         attention_masks: VarlenMetadata,
-        is_causal: bool = True,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         cu_seq_q = attention_masks.cu_seq_q
         cu_seq_k = attention_masks.cu_seq_k
         max_q = attention_masks.max_q
         max_k = attention_masks.max_k

         n_local_heads = xq.shape[1]
-        xq_packed = xq.transpose(1, 2).contiguous().view(-1, n_local_heads, head_dim)
-        xk_packed = xk.transpose(1, 2).contiguous().view(-1, n_local_heads, head_dim)
-        xv_packed = xv.transpose(1, 2).contiguous().view(-1, n_local_heads, head_dim)
+        xq_packed = xq.transpose(1, 2).reshape(-1, n_local_heads, head_dim)
+        xk_packed = xk.transpose(1, 2).reshape(-1, n_local_heads, head_dim)
+        xv_packed = xv.transpose(1, 2).reshape(-1, n_local_heads, head_dim)

         return VarlenAttentionWrapper._compiled_varlen_attn(
             xq_packed,
@@ -325,6 +324,7 @@ def create_varlen_metadata_for_document(
     max_seqlen = 0
     if len(all_seq_lengths) > 0:
         all_seq_lengths = torch.cat(all_seq_lengths)
+        # device to host sync but only done once per model forward
         max_seqlen = all_seq_lengths.max().item()

     return VarlenMetadata(
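
The packing change above swaps .contiguous().view(...) for .reshape(...): both produce the same packed tensor, but reshape returns a view when the memory layout allows it and copies only when necessary, instead of always forcing a copy. A self-contained check with illustrative shapes (not the real model sizes):

import torch

bs, n_heads, seqlen, head_dim = 2, 4, 8, 16   # illustrative sizes only
xq = torch.randn(bs, n_heads, seqlen, head_dim)

packed_old = xq.transpose(1, 2).contiguous().view(-1, n_heads, head_dim)
packed_new = xq.transpose(1, 2).reshape(-1, n_heads, head_dim)

# Same values and the same (bs * seqlen, n_heads, head_dim) shape either way.
assert torch.equal(packed_old, packed_new)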

torchtitan/models/deepseek_v3/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -72,7 +72,7 @@
         qk_rope_head_dim=64,
         v_head_dim=128,
         mscale=0.70,
-        use_flex_attn=True,
+        attn_type="flex",
         attn_mask_type="block_causal",
     ),
     "16B": DeepSeekV3ModelArgs(
@@ -97,7 +97,7 @@
         qk_rope_head_dim=64,
         v_head_dim=128,
         mscale=0.70,
-        use_flex_attn=True,
+        attn_type="flex",
         attn_mask_type="block_causal",
     ),
     "236B": DeepSeekV3ModelArgs(
@@ -124,7 +124,7 @@
         qk_nope_head_dim=128,
         qk_rope_head_dim=64,
         v_head_dim=128,
-        use_flex_attn=True,
+        attn_type="flex",
         attn_mask_type="block_causal",
     ),
     "671B": DeepSeekV3ModelArgs(
@@ -151,7 +151,7 @@
         qk_nope_head_dim=128,
         qk_rope_head_dim=64,
         v_head_dim=128,
-        use_flex_attn=True,
+        attn_type="flex",
         attn_mask_type="block_causal",
     ),
 }
