
Commit ca0efc0

remove use_flex for all other models
1 parent 4d80f4e commit ca0efc0

24 files changed (+96, -99 lines)

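Every hunk below makes the same mechanical substitution: the boolean model-args flag use_flex_attn becomes a string-valued attn_type field ("sdpa" by default, "flex" for FlexAttention), and the keyword threaded through apply_ac and the parallelize helpers is renamed to match. As orientation, a minimal sketch of the before/after shape of that API (illustrative names, not taken verbatim from the repo):

from dataclasses import dataclass

@dataclass
class OldModelArgs:
    use_flex_attn: bool = False   # old API: opt into FlexAttention with a boolean

@dataclass
class NewModelArgs:
    attn_type: str = "sdpa"       # new API: "sdpa" (default) or "flex"

def wants_flex(args) -> bool:
    # Call sites now compare the string; a missing attribute falls back to the SDPA path.
    return getattr(args, "attn_type", "sdpa") == "flex"

assert not wants_flex(OldModelArgs())               # legacy args without attn_type -> SDPA
assert wants_flex(NewModelArgs(attn_type="flex"))   # new args select FlexAttention explicitly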

tests/unit_tests/test_activation_checkpoint.py

Lines changed: 11 additions & 11 deletions
@@ -84,7 +84,7 @@ def get_bw_flops(model_fn):
             model_selective_ac,
             ac_config_no_force,
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
             op_sac_save_list=_op_sac_save_list,
         )
         flops_selective_ac = get_bw_flops(model_selective_ac)
@@ -102,7 +102,7 @@ def get_bw_flops(model_fn):
             model_with_force_first,
             ac_config_with_force_first,
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
             op_sac_save_list=_op_sac_save_list,
         )
         flops_with_force_first = get_bw_flops(model_with_force_first)
@@ -119,7 +119,7 @@ def get_bw_flops(model_fn):
             model_with_force_last,
             ac_config_with_force_last,
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
             op_sac_save_list=_op_sac_save_list,
         )
         flops_with_force_last = get_bw_flops(model_with_force_last)
@@ -134,7 +134,7 @@ def get_bw_flops(model_fn):
             model_with_full_ac,
             ac_config_full_ac,
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
             op_sac_save_list=_op_sac_save_list,
         )
         flops_full_ac = get_bw_flops(model_with_full_ac)
@@ -177,7 +177,7 @@ def get_act_mem(model_fn):
             model_selective_ac,
             ac_config_no_force,
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
             op_sac_save_list=_op_sac_save_list,
         )
         mem_selective_ac = get_act_mem(model_selective_ac)
@@ -194,7 +194,7 @@ def get_act_mem(model_fn):
             model_with_force_first,
             ac_config_with_force_first,
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
             op_sac_save_list=_op_sac_save_list,
         )
         mem_with_force_first = get_act_mem(model_with_force_first)
@@ -210,7 +210,7 @@ def get_act_mem(model_fn):
             model_with_force_last,
             ac_config_with_force_last,
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
            op_sac_save_list=_op_sac_save_list,
         )
         mem_with_force_last = get_act_mem(model_with_force_last)
@@ -224,7 +224,7 @@ def get_act_mem(model_fn):
             model_with_full_ac,
             ac_config_full_ac,
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
             op_sac_save_list=_op_sac_save_list,
         )
         mem_full_ac = get_act_mem(model_with_full_ac)
@@ -251,7 +251,7 @@ def test_correctness(self):
                 per_op_sac_force_recompute_mm_shapes_by_fqns=[],
             ),
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
             op_sac_save_list=_op_sac_save_list,
         )
         model_force_first = ToyModule()
@@ -264,7 +264,7 @@ def test_correctness(self):
                 per_op_sac_force_recompute_mm_shapes_by_fqns=["moe.router.gate"],
            ),
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
             op_sac_save_list=_op_sac_save_list,
         )
 
@@ -278,7 +278,7 @@ def test_correctness(self):
                 per_op_sac_force_recompute_mm_shapes_by_fqns=["output"],
             ),
             model_compile_enabled=False,
-            use_flex_attn=False,
+            attn_type="sdpa",
             op_sac_save_list=_op_sac_save_list,
         )

torchtitan/experiments/forge/example_train.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def forward_backward_step(
         inputs = input_dict["input"]
         extra_kwargs = {}
 
-        if getattr(self.model_args, "use_flex_attn", False):
+        if getattr(self.model_args, "attn_type", "sdpa") == "flex":
             extra_kwargs["attention_masks"] = model_parts[0].get_attention_masks(
                 input_batch=inputs,
                 tokenizer=self.tokenizer,
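The hunk above gates mask construction on the new string value: attention masks are built only when the model reports attn_type == "flex", and the getattr default keeps un-migrated model args on the SDPA path. A self-contained sketch of that dispatch (build_masks and the SimpleNamespace args are stand-ins for the trainer's real model_args and get_attention_masks call):

from types import SimpleNamespace

def build_extra_kwargs(model_args, build_masks):
    # Mirrors the trainer logic above: masks are materialized only for the "flex" path.
    extra_kwargs = {}
    if getattr(model_args, "attn_type", "sdpa") == "flex":
        extra_kwargs["attention_masks"] = build_masks()
    return extra_kwargs

flex_args = SimpleNamespace(attn_type="flex")
sdpa_args = SimpleNamespace(attn_type="sdpa")
assert "attention_masks" in build_extra_kwargs(flex_args, lambda: object())
assert build_extra_kwargs(sdpa_args, lambda: object()) == {}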

torchtitan/experiments/gpt_oss/infra/parallelize.py

Lines changed: 3 additions & 3 deletions
@@ -62,8 +62,8 @@ def parallelize_gptoss(
         ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}).
         """
 
-    use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
-    if job_config.parallelism.context_parallel_degree > 1 and use_flex_attn:
+    attn_type = getattr(model.model_args, "attn_type", "sdpa")
+    if job_config.parallelism.context_parallel_degree > 1 and attn_type == "flex":
         raise NotImplementedError("CP support for FlexAttention is still in progress.")
 
     if parallel_dims.tp_enabled:
@@ -116,7 +116,7 @@ def parallelize_gptoss(
             model,
             job_config.activation_checkpoint,
             model_compile_enabled=model_compile_enabled,
-            use_flex_attn=use_flex_attn,
+            attn_type=attn_type,
             op_sac_save_list=_op_sac_save_list,
         )

torchtitan/experiments/gpt_oss/model/args.py

Lines changed: 2 additions & 2 deletions
@@ -39,7 +39,7 @@ class GptOssModelArgs(BaseModelArgs):
         n_kv_heads (int): Number of key-value heads.
         sliding_window_size (int): Size of the sliding attention window.
         attn_mask_type (str): Type of basic attention mask.
-        use_flex_attn (bool): Whether to use FlexAttention. Only supports True.
+        attn_type (str): Attention type. Only supports "flex".
         original_seq_len (int): Original sequence length.
         rope_theta (float): Base for rotary positional encoding.
         rope_factor (float): Scaling factor for extended sequence lengths.
@@ -64,7 +64,7 @@ class GptOssModelArgs(BaseModelArgs):
     n_kv_heads: int = 8
     sliding_window_size: int = 128
     attn_mask_type: str = "causal"
-    use_flex_attn: bool = True  # NOTE: gpt-oss only supports FlexAttention
+    attn_type: str = "flex"  # NOTE: gpt-oss only supports FlexAttention
     # yarn
     original_seq_len: int = 4096
     rope_theta: float = 150000.0
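Because gpt-oss supports only FlexAttention, the new field is effectively pinned to "flex". A hypothetical guard, not part of this commit, showing how that constraint could be enforced on the string field:

from dataclasses import dataclass

@dataclass
class GptOssArgsSketch:
    attn_type: str = "flex"

    def __post_init__(self):
        # Reject anything other than the only supported backend.
        if self.attn_type != "flex":
            raise ValueError("gpt-oss only supports FlexAttention (attn_type='flex')")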

torchtitan/experiments/simple_fsdp/deepseek_v3/parallelize.py

Lines changed: 3 additions & 3 deletions
@@ -67,7 +67,7 @@ def parallelize_deepseekv3(
 
     if (
         job_config.parallelism.context_parallel_degree > 1
-        and model.model_args.use_flex_attn
+        and model.model_args.attn_type == "flex"
     ):
         raise NotImplementedError("CP support for FlexAttention is still in progress.")
 
@@ -85,13 +85,13 @@ def parallelize_deepseekv3(
             "Currently, float8 tensorwise TP is not tested for deepseekv3"
         )
 
-    use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
+    attn_type = getattr(model.model_args, "attn_type", "sdpa")
     apply_non_moe_tp(
         model,
         world_mesh["tp"],
         loss_parallel=not job_config.parallelism.disable_loss_parallel,
         enable_float8_tensorwise_tp=False,
-        use_flex_attn=use_flex_attn,
+        attn_type=attn_type,
     )
     maybe_enable_async_tp(job_config, world_mesh["tp"])

torchtitan/experiments/simple_fsdp/llama3/parallelize.py

Lines changed: 2 additions & 2 deletions
@@ -102,15 +102,15 @@ def parallelize_llama(
         maybe_enable_async_tp(job_config, tp_mesh)
 
     if job_config.activation_checkpoint.mode != "none":
-        use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
+        attn_type = getattr(model.model_args, "attn_type", "sdpa")
         model_compile_enabled = (
             job_config.compile.enable and "model" in job_config.compile.components
         )
         apply_ac(
             model,
             job_config.activation_checkpoint,
             model_compile_enabled=model_compile_enabled,
-            use_flex_attn=use_flex_attn,
+            attn_type=attn_type,
             op_sac_save_list=_op_sac_save_list,
             base_folder=job_config.job.dump_folder,
         )

torchtitan/experiments/vlm/infra/parallelize.py

Lines changed: 3 additions & 3 deletions
@@ -48,8 +48,8 @@ def parallelize_vlm(
         Sequence length {job_config.training.seq_len} must be divisible by the product of TP degree
         ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}).
         """
-    use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
-    if job_config.parallelism.context_parallel_degree > 1 and use_flex_attn:
+    attn_type = getattr(model.model_args, "attn_type", "sdpa")
+    if job_config.parallelism.context_parallel_degree > 1 and attn_type == "flex":
         raise NotImplementedError("CP support for FlexAttention is still in progress.")
 
     if parallel_dims.tp_enabled:
@@ -63,7 +63,7 @@ def parallelize_vlm(
             model,
             job_config.activation_checkpoint,
             model_compile_enabled=model_compile_enabled,
-            use_flex_attn=use_flex_attn,
+            attn_type=attn_type,
             op_sac_save_list=_op_sac_save_list,
         )
         apply_ac(model.encoder, job_config.activation_checkpoint)

torchtitan/experiments/vlm/model/args.py

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ class Siglip2ModelArgs:
     spatial_merge_size: int = 1
 
     layer_norm_eps: float = 1e-6
-    use_flex_attn: bool = True
+    attn_type: str = "flex"
     attn_mask_type: str = "causal"
 

torchtitan/models/deepseek_v3/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -72,7 +72,7 @@
         qk_rope_head_dim=64,
         v_head_dim=128,
         mscale=0.70,
-        use_flex_attn=True,
+        attn_type="flex",
         attn_mask_type="block_causal",
     ),
     "16B": DeepSeekV3ModelArgs(
@@ -97,7 +97,7 @@
         qk_rope_head_dim=64,
         v_head_dim=128,
         mscale=0.70,
-        use_flex_attn=True,
+        attn_type="flex",
         attn_mask_type="block_causal",
     ),
     "236B": DeepSeekV3ModelArgs(
@@ -124,7 +124,7 @@
         qk_nope_head_dim=128,
         qk_rope_head_dim=64,
         v_head_dim=128,
-        use_flex_attn=True,
+        attn_type="flex",
         attn_mask_type="block_causal",
     ),
     "671B": DeepSeekV3ModelArgs(
@@ -151,7 +151,7 @@
         qk_nope_head_dim=128,
         qk_rope_head_dim=64,
         v_head_dim=128,
-        use_flex_attn=True,
+        attn_type="flex",
         attn_mask_type="block_causal",
     ),
 }

torchtitan/models/deepseek_v3/infra/parallelize.py

Lines changed: 6 additions & 6 deletions
@@ -61,8 +61,8 @@ def parallelize_deepseekv3(
         ({parallel_dims.tp}) and 2 * CP degree ({parallel_dims.cp}).
         """
 
-    use_flex_attn = getattr(model.model_args, "use_flex_attn", False)
-    if job_config.parallelism.context_parallel_degree > 1 and use_flex_attn:
+    attn_type = getattr(model.model_args, "attn_type", "sdpa")
+    if job_config.parallelism.context_parallel_degree > 1 and attn_type == "flex":
         raise NotImplementedError("CP support for FlexAttention is still in progress.")
 
     if parallel_dims.tp_enabled:
@@ -84,7 +84,7 @@ def parallelize_deepseekv3(
         world_mesh["tp"],
         loss_parallel=not job_config.parallelism.disable_loss_parallel,
         enable_float8_tensorwise_tp=False,
-        use_flex_attn=use_flex_attn,
+        attn_type=attn_type,
     )
     maybe_enable_async_tp(job_config, world_mesh["tp"])
 
@@ -112,7 +112,7 @@ def parallelize_deepseekv3(
             model,
             job_config.activation_checkpoint,
             model_compile_enabled=model_compile_enabled,
-            use_flex_attn=use_flex_attn,
+            attn_type=attn_type,
             op_sac_save_list=_op_sac_save_list,
             base_folder=job_config.job.dump_folder,
         )
@@ -181,7 +181,7 @@ def apply_non_moe_tp(
     tp_mesh: DeviceMesh,
     loss_parallel: bool,
     enable_float8_tensorwise_tp: bool,
-    use_flex_attn: bool,
+    attn_type: str,
 ):
     """Apply tensor parallelism."""
     # 1. Parallelize the embedding and shard its outputs (which are the first
@@ -211,7 +211,7 @@ def apply_non_moe_tp(
         PrepareModuleInput,
     )
 
-    if use_flex_attn:
+    if attn_type == "flex":
         attention_kernel_plan = prepare_module_input(
             input_layouts=(Shard(1), Shard(1), Shard(1)),
             desired_input_layouts=(Shard(1), Shard(1), Shard(1)),
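The last two hunks show the general plumbing: attn_type is read once from model.model_args and passed down as a keyword, and only the "flex" value switches on the FlexAttention-specific input plan. A hedged, self-contained sketch of that flow (the string return value is a placeholder for the real PrepareModuleInput plan):

from types import SimpleNamespace

def attention_input_plan(attn_type: str = "sdpa") -> str:
    # Placeholder for the plan selection done in apply_non_moe_tp.
    return "Shard(1) q/k/v inputs" if attn_type == "flex" else "default attention inputs"

model = SimpleNamespace(model_args=SimpleNamespace(attn_type="flex"))
attn_type = getattr(model.model_args, "attn_type", "sdpa")   # same read pattern as the diff
print(attention_input_plan(attn_type))                        # -> Shard(1) q/k/v inputs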
