Commit f5a3ad7

simplify embedding + first transformer block TP (#314)
As titled: we can directly specify the rowwise-parallel embedding's output layouts to be sharded on the sequence dim, so we no longer need the PrepareModuleInput for the first layer. Switching to output_layouts = Shard(1) also triggers a reduce_scatter instead of an allreduce for the embedding layer, which could give some small perf wins.
1 parent 3295448 commit f5a3ad7

1 file changed: +1 -5 lines


torchtitan/parallelisms/parallelize_llama.py

Lines changed: 1 addition & 5 deletions
@@ -160,18 +160,14 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
         {
             "tok_embeddings": RowwiseParallel(
                 input_layouts=Replicate(),
+                output_layouts=Shard(1),
             ),
             "output": col_parallel_strategy(
                 input_layouts=Shard(1),
                 output_layouts=(Shard(-1) if loss_parallel else Replicate()),
                 use_local_output=not loss_parallel,
             ),
             "norm": SequenceParallel(),
-            "layers.0": PrepareModuleInput(
-                input_layouts=(Replicate(), None),
-                desired_input_layouts=(Shard(1), None),
-                use_local_output=True,
-            ),
         },
     )
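For reference, below is a minimal, self-contained sketch of applying this kind of TP plan with PyTorch's parallelize_module API. It is an illustration under assumptions, not torchtitan's exact code: ToyModel, the 2-way mesh, nn.LayerNorm, and the use of ColwiseParallel in place of col_parallel_strategy are stand-ins chosen to keep the example short.

import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Replicate, Shard  # older PyTorch: torch.distributed._tensor
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    SequenceParallel,
    parallelize_module,
)

class ToyModel(nn.Module):
    # Toy stand-in mirroring the module names in torchtitan's Llama model.
    def __init__(self, vocab: int = 128, dim: int = 64):
        super().__init__()
        self.tok_embeddings = nn.Embedding(vocab, dim)
        self.norm = nn.LayerNorm(dim)  # torchtitan uses RMSNorm; LayerNorm keeps the sketch short
        self.output = nn.Linear(dim, vocab, bias=False)

    def forward(self, tokens):
        return self.output(self.norm(self.tok_embeddings(tokens)))

# Assumes launch via torchrun with 2 GPUs; init_device_mesh sets up the process group.
tp_mesh = init_device_mesh("cuda", (2,), mesh_dim_names=("tp",))

model = parallelize_module(
    ToyModel().cuda(),
    tp_mesh,
    {
        # The rowwise-parallel embedding now emits sequence-sharded (Shard(1)) output,
        # turning its collective into a reduce_scatter and removing the need for a
        # PrepareModuleInput on the first transformer block.
        "tok_embeddings": RowwiseParallel(
            input_layouts=Replicate(),
            output_layouts=Shard(1),
        ),
        "norm": SequenceParallel(),
        # Stand-in for col_parallel_strategy in the real plan (loss parallel omitted here).
        "output": ColwiseParallel(
            input_layouts=Shard(1),
            output_layouts=Replicate(),
        ),
    },
)

Because the embedding's output is declared Shard(1), the downstream SequenceParallel norm and the first transformer block receive sequence-sharded activations directly, which is what allows the PrepareModuleInput redistribution to be dropped.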
