reuse pipeline from torchtitan

3outeille · 3outeille · commit 09f0c94790a5 · 2025-11-13T10:06:01.000Z
diff --git a/torchtitan/distributed/pipeline_parallel.py b/torchtitan/distributed/pipeline_parallel.py
@@ -228,6 +228,7 @@ def generate_llm_fqn_per_model_part(
     num_layers: int,
     input_weight: int = 1,
     output_weight: int = 1,
+    include_rotary_emb: bool = False,
 ) -> list[list[str]]:
     """
     Programmatically generates module names model part, focused on LLMs models.
@@ -237,6 +238,7 @@ def generate_llm_fqn_per_model_part(
         num_layers: Total number of transformer layers in the model
         input_weight: Weight for input modules (tok_embeddings) in layer calculation
         output_weight: Weight for output modules (norm + output) in layer calculation
+        include_rotary_emb: Whether to include rotary_emb in each model part
 
     Returns:
         List of lists containing module names for each model part
@@ -251,7 +253,10 @@ def generate_llm_fqn_per_model_part(
     if num_stages == 1:
         # Single stage gets everything
         layer_names = [f"layers.{i}" for i in range(num_layers)]
-        return [["tok_embeddings"] + layer_names + ["norm", "output"]]
+        result = [["tok_embeddings"] + layer_names + ["norm", "output"]]
+        if include_rotary_emb:
+            result[0].append("rotary_emb")
+        return result
 
     # Calculate effective layers including weights
     num_effective_layers = num_layers + input_weight + output_weight
@@ -329,6 +334,8 @@ def generate_llm_fqn_per_model_part(
                     stage_modules.append(f"layers.{current_layer}")
                     current_layer += 1
 
+        if include_rotary_emb:
+            stage_modules.append("rotary_emb")
         module_names_per_stage.append(stage_modules)
 
     return module_names_per_stage
@@ -340,6 +347,7 @@ def pipeline_module_split(
     pp_schedule: str,
     device: torch.device,
     module_names_per_stage: list[list[str]],
+    use_identity_for_missing_modules: bool = False,
 ) -> tuple[list[PipelineStage], list[nn.Module]]:
     """
     This API creates pipeline stages based on specified module names for each stage.
@@ -361,6 +369,8 @@ def pipeline_module_split(
                                - "layers.0", "layers.1" for specific transformer layers
                                - "norm" for the final normalization layer
                                - "output" for the output projection layer
+        use_identity_for_missing_modules: If True, replace missing modules with nn.Identity(),
+                                         otherwise replace with None
 
     Returns:
         Tuple of (stages, models) where stages are PipelineStage objects and models are the
@@ -417,8 +427,9 @@ def _build_stage_from_modules(
                         setattr(model, module_name, nn.ModuleList())
             # Handle simple module attributes (e.g., "linear", "norm")
             elif module_name not in modules_to_keep:
-                # Replace with None
-                setattr(model, module_name, None)
+                # Replace with Identity or None based on configuration
+                replacement = nn.Identity() if use_identity_for_missing_modules else None
+                setattr(model, module_name, replacement)
 
         stage = PipelineStage(
             model,
diff --git a/torchtitan/experiments/transformers_backend/infra/pipeline.py b/torchtitan/experiments/transformers_backend/infra/pipeline.py
@@ -3,280 +3,27 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
-import copy
 import math
 
 import torch
 import torch.nn as nn
-from torch.distributed.device_mesh import DeviceMesh
-from torch.distributed.pipelining import PipelineStage
 from torch.distributed.pipelining.schedules import (
     _PipelineSchedule,
     get_schedule_class,
     PipelineScheduleSingle,
-    ScheduleDualPipeV,
-    ScheduleZBVZeroBubble,
 )
 
 from torchtitan.components.loss import LossFunction
 from torchtitan.experiments.transformers_backend.job_config import JobConfig
 from torchtitan.distributed import ParallelDims
-from torchtitan.distributed.pipeline_parallel import build_pipeline_schedule
+from torchtitan.distributed.pipeline_parallel import (
+    build_pipeline_schedule,
+    generate_llm_fqn_per_model_part,
+    pipeline_module_split,
+)
 from torchtitan.protocols.train_spec import BaseModelArgs, ParallelizeFunction
 from torchtitan.tools.logging import logger
 
-# NOTE(3outeille): the only modifications comes from replacing None to nn.Identity and adding rotary_emb per model_part
-
-
-def generate_llm_fqn_per_model_part(
-    num_stages: int,
-    num_layers: int,
-    input_weight: int = 1,
-    output_weight: int = 1,
-) -> list[list[str]]:
-    """
-    Programmatically generates module names model part, focused on LLMs models.
-    Args:
-        num_stages: Number of pipeline stages
-        num_layers: Total number of transformer layers in the model
-        input_weight: Weight for input modules (embed_tokens) in layer calculation
-        output_weight: Weight for output modules (norm + output) in layer calculation
-    Returns:
-        List of lists containing module names for each model part
-    Example:
-        generate_llm_fqn_per_model_part(2, 3, input_weight=2, output_weight=2)
-        treats embeddings as 2 layers and norm+output as 2 layers for distribution
-    """
-    if num_stages < 1:
-        raise ValueError("Number of stages must be at least 1")
-
-    if num_stages == 1:
-        # Single stage gets everything
-        layer_names = [f"layers.{i}" for i in range(num_layers)]
-        return [["tok_embeddings"] + layer_names + ["norm", "output", "rotary_emb"]]
-
-    # Calculate effective layers including weights
-    num_effective_layers = num_layers + input_weight + output_weight
-
-    if num_stages > num_effective_layers:
-        raise ValueError(
-            f"Number of stages ({num_stages}) cannot be greater than effective layers ({num_effective_layers})"
-        )
-
-    # Calculate layers per stage (distribute evenly)
-    layers_per_stage = num_effective_layers // num_stages
-    extra_layers = num_effective_layers % num_stages
-
-    # Feasibility check: Ensure at least 1 layer in each PP stage
-    if layers_per_stage == 0:
-        raise ValueError(
-            f"Configuration would result in empty stages. "
-            f"With {num_stages} stages and {num_effective_layers} effective layers "
-            f"(num_layers={num_layers} + input_weight={input_weight} + output_weight={output_weight}), "
-            f"each stage would get {layers_per_stage} layers on average. "
-            f"Reduce num_stages or increase num_layers/weights."
-        )
-
-    # Balance check: Ensure weights don't exceed minimum layers per stage
-    if input_weight > layers_per_stage:
-        raise ValueError(
-            f"input_weight ({input_weight}) exceeds minimum layers per stage ({layers_per_stage})."
-        )
-    if output_weight > layers_per_stage:
-        raise ValueError(
-            f"output_weight ({output_weight}) exceeds minimum layers per stage ({layers_per_stage})."
-        )
-
-    module_names_per_stage = []
-    current_layer = 0
-
-    for stage_idx in range(num_stages):
-        stage_modules = []
-
-        # Calculate effective layers for this stage
-        effective_layers_for_stage = layers_per_stage
-        if stage_idx < extra_layers:
-            effective_layers_for_stage += 1
-
-        # First stage: handle input modules with weighting
-        if stage_idx == 0:
-            stage_modules.append("tok_embeddings")
-            # Account for input weight in layer distribution
-            remaining_layers_for_stage = effective_layers_for_stage - input_weight
-
-            # Add transformer layers
-            for _ in range(remaining_layers_for_stage):
-                if current_layer < num_layers:
-                    stage_modules.append(f"layers.{current_layer}")
-                    current_layer += 1
-
-        # Last stage: handle output modules with weighting
-        elif stage_idx == num_stages - 1:
-            # Account for output weight in layer distribution
-            remaining_layers_for_stage = effective_layers_for_stage - output_weight
-
-            # Add transformer layers
-            for _ in range(remaining_layers_for_stage):
-                if current_layer < num_layers:
-                    stage_modules.append(f"layers.{current_layer}")
-                    current_layer += 1
-
-            # Add output modules
-            stage_modules.extend(["norm", "output"])
-
-        # Middle stages: only transformer layers
-        else:
-            for _ in range(effective_layers_for_stage):
-                if current_layer < num_layers:
-                    stage_modules.append(f"layers.{current_layer}")
-                    current_layer += 1
-
-        stage_modules.append("rotary_emb")
-        module_names_per_stage.append(stage_modules)
-
-    return module_names_per_stage
-
-
-def pipeline_module_split(
-    whole_model: nn.Module,
-    pp_mesh: DeviceMesh,
-    pp_schedule: str,
-    device: torch.device,
-    module_names_per_stage: list[list[str]],
-) -> tuple[list[PipelineStage], list[nn.Module]]:
-    """
-    This API creates pipeline stages based on specified module names for each stage.
-
-    Some model restrictions include:
-    - forward() method should tolerate deleted layers
-    - weight initialization methods should tolerate deleted layers
-    - Does not support nested moduledict and modulelist structures
-
-    Args:
-        whole_model: The complete model to be split
-        pp_mesh: Pipeline parallel device mesh
-        pp_schedule: Name of pipeline parallelism schedule
-        device: Device
-        module_names_per_stage: List of lists, where each inner list contains the module names
-                               that should be included in that stage. Module names should be
-                               dot-separated paths. Examples:
-                               - "tok_embeddings" for token embeddings
-                               - "layers.0", "layers.1" for specific transformer layers
-                               - "norm" for the final normalization layer
-                               - "output" for the output projection layer
-
-    Returns:
-        Tuple of (stages, models) where stages are PipelineStage objects and models are the
-        corresponding model chunks
-
-    Example usage:
-        module_names_per_stage = [
-            ["tok_embeddings", "layers.0"],     # Stage 0: embeddings + first layer
-            ["layers.1", "layers.2"],           # Stage 1: middle layers
-            ["norm", "output"]                  # Stage 2: final norm + output
-        ]
-    """
-    pp_rank = pp_mesh.get_local_rank()
-    pp_degree = pp_mesh.size()
-
-    def _build_stage_from_modules(
-        stage_idx: int, module_names: list[str], num_stages: int
-    ) -> tuple[PipelineStage, nn.Module]:
-        model = copy.deepcopy(whole_model)
-
-        # Create a set of modules to keep for faster lookup
-        modules_to_keep = set(module_names)
-        for module_name, module_value in model.named_children():
-            # Handle layer-like structures (e.g., "layers.0", "layers.1")
-            if isinstance(module_value, (nn.ModuleDict, nn.ModuleList)):
-                layers_to_keep = {
-                    name.split(".", 1)[1]
-                    for name in modules_to_keep
-                    if name.startswith(f"{module_name}.")
-                }
-                if layers_to_keep:
-                    # Keep only specified layers
-                    if isinstance(module_value, nn.ModuleDict):
-                        for layer_name in list(module_value.keys()):
-                            if layer_name not in layers_to_keep:
-                                del module_value[layer_name]
-                    elif isinstance(module_value, nn.ModuleList):
-                        indices_to_keep = {
-                            int(idx) for idx in layers_to_keep if idx.isdigit()
-                        }
-                        new_layers = nn.ModuleList(
-                            [
-                                layer
-                                for i, layer in enumerate(module_value)
-                                if i in indices_to_keep
-                            ]
-                        )
-                        setattr(model, module_name, new_layers)
-                else:
-                    # No layers from this structure needed, set to empty structure
-                    if isinstance(module_value, nn.ModuleDict):
-                        setattr(model, module_name, nn.ModuleDict())
-                    elif isinstance(module_value, nn.ModuleList):
-                        setattr(model, module_name, nn.ModuleList())
-            # Handle simple module attributes (e.g., "linear", "norm")
-            elif module_name not in modules_to_keep:
-                # Replace with Identity
-                setattr(model, module_name, nn.Identity())
-
-        stage = PipelineStage(
-            model,
-            stage_idx,
-            num_stages,
-            device,
-            group=pp_mesh.get_group("pp"),
-        )
-        return stage, model
-
-    num_stages = len(module_names_per_stage)
-    stages = []
-    models = []
-
-    schedule_class = get_schedule_class(pp_schedule)
-    style = (
-        "v" if schedule_class in (ScheduleZBVZeroBubble, ScheduleDualPipeV) else "loop"
-    )
-
-    def _get_stage_indices() -> tuple[int]:
-        """
-        Compute the stage ids for the stages that will run on this pp rank
-        for either a looped or V style schedule
-        """
-        assert (
-            num_stages % pp_degree == 0
-        ), f"num_stages {num_stages} must be evenly divisible by pp_degree {pp_degree}"
-        stages_per_rank = num_stages // pp_degree
-        if style == "loop":
-            return tuple(pp_rank + s * pp_degree for s in range(stages_per_rank))
-        elif style == "v":
-            assert (
-                stages_per_rank == 2
-            ), f"v schedules assume 2 stages per rank, got {stages_per_rank}"
-            stage_v_pairs = list(
-                zip(range(pp_degree), range(num_stages - 1, pp_degree - 1, -1))
-            )
-            return stage_v_pairs[pp_rank]
-
-    for stage_idx in _get_stage_indices():
-        module_names = module_names_per_stage[stage_idx]
-        stage, model_chunk = _build_stage_from_modules(
-            stage_idx,
-            module_names,
-            num_stages,
-        )
-        logger.info(
-            f"PP rank {pp_rank} is building stage_idx {stage_idx} "
-            f"with modules {module_names}"
-        )
-        stages.append(stage)
-        models.append(model_chunk)
-
-    return stages, models
-
 
 def pipeline_hf_transformers(
     model: nn.Module,
@@ -355,7 +102,11 @@ def pipeline_hf_transformers(
     module_names_per_stage = job_config.parallelism.module_fqns_per_model_part
     if module_names_per_stage is None:
         module_names_per_stage = generate_llm_fqn_per_model_part(
-            num_virtual_stages, num_layers, input_weight, output_weight
+            num_virtual_stages,
+            num_layers,
+            input_weight,
+            output_weight,
+            include_rotary_emb=True,
         )
     for i, stage_ms in enumerate(module_names_per_stage):
         logger.debug(f"Stage {i}: {stage_ms}")
@@ -366,6 +117,7 @@ def pipeline_hf_transformers(
         job_config.parallelism.pipeline_parallel_schedule,
         device,
         module_names_per_stage,
+        use_identity_for_missing_modules=True,
     )
 
     # For PP with looped schedules, each item in model_parts is one stage-model-chunk.