config compile backend (#1768)

IvanKobzarev · web-flow · commit 177b050ca1c5 · 2025-10-02T11:01:38.000+02:00
Stacked PRs:
 * __->__#1768


--- --- ---

### config compile backend
diff --git a/torchtitan/components/loss.py b/torchtitan/components/loss.py
@@ -27,7 +27,7 @@ def build_cross_entropy_loss(job_config: JobConfig):
     loss_fn = cross_entropy_loss
     if job_config.compile.enable and "loss" in job_config.compile.components:
         logger.info("Compiling the loss function with torch.compile")
-        loss_fn = torch.compile(loss_fn)
+        loss_fn = torch.compile(loss_fn, backend=job_config.compile.backend)
     return loss_fn
 
 
diff --git a/torchtitan/config/job_config.py b/torchtitan/config/job_config.py
@@ -566,6 +566,7 @@ class Compile:
         default_factory=lambda: ["model", "loss"]
     )
     """Which components to compile"""
+    backend: str = "inductor"
 
 
 @dataclass
diff --git a/torchtitan/experiments/flux/loss.py b/torchtitan/experiments/flux/loss.py
@@ -23,5 +23,5 @@ def build_mse_loss(job_config: JobConfig):
     loss_fn = mse_loss
     if job_config.compile.enable and "loss" in job_config.compile.components:
         logger.info("Compiling the loss function with torch.compile")
-        loss_fn = torch.compile(loss_fn)
+        loss_fn = torch.compile(loss_fn, backend=job_config.compile.backend)
     return loss_fn
diff --git a/torchtitan/experiments/llama4/infra/parallelize.py b/torchtitan/experiments/llama4/infra/parallelize.py
@@ -19,6 +19,7 @@
     SequenceParallel,
 )
 from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
+from torchtitan.config.job_config import Compile as CompileConfig
 from torchtitan.distributed import NoParallel, ParallelDims
 from torchtitan.distributed.activation_checkpoint import apply_ac
 from torchtitan.distributed.expert_parallel import (
@@ -123,7 +124,7 @@ def parallelize_llama(
 
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP
     if model_compile_enabled:
-        apply_compile(model)
+        apply_compile(model, job_config.compile)
 
     dp_mesh: DeviceMesh | None = None
     if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled:
@@ -502,7 +503,7 @@ def apply_moe_ep_tp(
         )
 
 
-def apply_compile(model: nn.Module):
+def apply_compile(model: nn.Module, compile_config: CompileConfig):
     """
     Apply torch.compile to each TransformerBlock, which makes compilation efficient due to
     repeated structure. Alternatively one can compile the whole model (after applying DP).
@@ -515,7 +516,11 @@ def apply_compile(model: nn.Module):
         fullgraph = True
         if transformer_block.moe_enabled:
             fullgraph = False
-        transformer_block = torch.compile(transformer_block, fullgraph=fullgraph)
+        transformer_block = torch.compile(
+            transformer_block,
+            backend=compile_config.backend,
+            fullgraph=fullgraph,
+        )
         model.layers.register_module(layer_id, transformer_block)
 
     logger.info("Compiling each TransformerBlock with torch.compile")
diff --git a/torchtitan/experiments/qwen3/infra/parallelize.py b/torchtitan/experiments/qwen3/infra/parallelize.py
@@ -118,7 +118,7 @@ def parallelize_qwen3(
 
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP
     if model_compile_enabled:
-        apply_compile(model)
+        apply_compile(model, job_config.compile)
 
     if parallel_dims.fsdp_enabled:
         # apply FSDP or HSDP, potentially with Context Parallel
diff --git a/torchtitan/experiments/simple_fsdp/deepseek_v3/parallelize.py b/torchtitan/experiments/simple_fsdp/deepseek_v3/parallelize.py
@@ -153,6 +153,6 @@ def parallelize_deepseekv3(
     if job_config.compile.enable:
         torch._inductor.config.reorder_for_peak_memory = False
         torch._dynamo.config.capture_scalar_outputs = True
-        model = torch.compile(model, fullgraph=True)
+        model = torch.compile(model, backend=job_config.compile.backend, fullgraph=True)
 
     return model
diff --git a/torchtitan/experiments/simple_fsdp/llama3/parallelize.py b/torchtitan/experiments/simple_fsdp/llama3/parallelize.py
@@ -122,6 +122,6 @@ def parallelize_llama(
 
     if job_config.compile.enable and "model" in job_config.compile.components:
         torch._inductor.config.reorder_for_peak_memory = False
-        model = torch.compile(model, fullgraph=True)
+        model = torch.compile(model, backend=job_config.compile.backend, fullgraph=True)
 
     return model
diff --git a/torchtitan/experiments/vlm/infra/parallelize.py b/torchtitan/experiments/vlm/infra/parallelize.py
@@ -70,8 +70,8 @@ def parallelize_vlm(
 
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP
     if job_config.compile.enable:
-        apply_compile(model)
-        apply_compile(model.encoder)
+        apply_compile(model, job_config.compile)
+        apply_compile(model.encoder, job_config.compile)
 
     if parallel_dims.fsdp_enabled:
         # apply FSDP or HSDP, potentially with Context Parallel
diff --git a/torchtitan/models/deepseek_v3/infra/parallelize.py b/torchtitan/models/deepseek_v3/infra/parallelize.py
@@ -116,7 +116,7 @@ def parallelize_deepseekv3(
         )
 
     if model_compile_enabled:
-        apply_compile(model)
+        apply_compile(model, job_config.compile)
 
     dp_mesh: DeviceMesh | None = None
     if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled:
diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py
@@ -23,6 +23,7 @@
 )
 
 from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
+from torchtitan.config.job_config import Compile as CompileConfig
 from torchtitan.distributed import ParallelDims
 from torchtitan.distributed.activation_checkpoint import apply_ac
 from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp
@@ -105,7 +106,7 @@ def parallelize_llama(
 
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP
     if model_compile_enabled:
-        apply_compile(model)
+        apply_compile(model, job_config.compile)
 
     if parallel_dims.fsdp_enabled:
         # apply FSDP or HSDP, potentially with Context Parallel
@@ -234,13 +235,15 @@ def apply_tp(
     )
 
 
-def apply_compile(model: nn.Module):
+def apply_compile(model: nn.Module, compile_config: CompileConfig):
     """
     Apply torch.compile to each TransformerBlock, which makes compilation efficient due to
     repeated structure. Alternatively one can compile the whole model (after applying DP).
     """
     for layer_id, transformer_block in model.layers.named_children():
-        transformer_block = torch.compile(transformer_block, fullgraph=True)
+        transformer_block = torch.compile(
+            transformer_block, backend=compile_config.backend, fullgraph=True
+        )
         model.layers.register_module(layer_id, transformer_block)
 
     logger.info("Compiling each TransformerBlock with torch.compile")

Original file line number	Diff line number	Diff line change
`@@ -566,6 +566,7 @@ class Compile:`
`566`	`566`	`default_factory=lambda: ["model", "loss"]`
`567`	`567`	`)`
`568`	`568`	`"""Which components to compile"""`
	`569`	`+ backend: str = "inductor"`
`569`	`570`
`570`	`571`
`571`	`572`	`@dataclass`
Original file line number	Diff line number	Diff line change
`@@ -116,7 +116,7 @@ def parallelize_deepseekv3(`
`116`	`116`	`)`
`117`	`117`
`118`	`118`	`if model_compile_enabled:`
`119`		`- apply_compile(model)`
	`119`	`+ apply_compile(model, job_config.compile)`
`120`	`120`
`121`	`121`	`dp_mesh: DeviceMesh \| None = None`
`122`	`122`	`if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled:`