Merge pull request #1 from pytorch-labs/fmassa/compute_model

fmassa · web-flow · commit 0e9b7164d728 · 2025-06-13T11:01:57.000+02:00
Add compute cost in optimization problem
diff --git a/autoparallel/compute_estimation.py b/autoparallel/compute_estimation.py
@@ -0,0 +1,88 @@
+import torch
+from torch.utils._pytree import tree_map_only
+from torch.utils.flop_counter import FlopCounterMode
+
+
+def _get_device_tflops(dtype):
+    """Look up the hard-coded peak TFLOP/s of an H100 for ``dtype``.
+
+    Asserts the CUDA device is an H100; unknown dtypes raise ``KeyError``.
+    """
+    # The function from PyTorch (torch._inductor.utils.get_device_tflops)
+    # gives wildly different TFlops compared to the specs, so hard-coded
+    # values pulled from xFormers are used for now:
+    # https://github.com/fairinternal/xformers/blob/main/xformers/profiler/device_limits.py
+    # TODO: fix PyTorch's implementation
+    device_name = torch.cuda.get_device_name(None)
+    assert "H100" in device_name, f"Only H100 supported from now, got {device_name}"
+
+    # NOTE: NVIDIA gives these numbers "with 2:4 sparsity"
+    # but we want the full GEMM numbers, hence the halving
+    with_sparsity = {
+        torch.float32: 989, torch.float16: 1979, torch.bfloat16: 1979, torch.int8: 3958,
+    }
+    peak_tflops = {torch.float64: 67}
+    peak_tflops.update((key, value // 2) for key, value in with_sparsity.items())
+    return peak_tflops[dtype]
+
+
+def _get_sharded_shape(spec):
+    """Return, as a list, the per-shard shape of the tensor described by ``spec``.
+
+    Each placement reporting ``is_shard()`` ceil-divides its tensor dim by the
+    size of the paired mesh dim; any other placement leaves the shape as is.
+    """
+    # TODO: take dtype into account as well
+    # tensor_dtype = spec.tensor_meta.dtype
+    # TODO: find a better heuristic other than running DTensor
+    local_shape = list(spec.tensor_meta.shape)
+    for num_shards, placement in zip(spec.mesh.shape, spec.placements):
+        if not placement.is_shard():
+            continue
+        dim = placement.dim
+        local_shape[dim] = (local_shape[dim] + num_shards - 1) // num_shards
+    return local_shape
+
+
+def estimate_strategy_runtime_cost(node, strategy):
+    """Estimate the compute time (in us) of ``node`` under ``strategy``.
+
+    The op is replayed on fake tensors of the sharded input shapes under
+    ``FlopCounterMode``; the FLOPs are divided by the hard-coded device peak
+    assuming 50% efficiency. Returns 0 for non-``OpOverload`` nodes, view
+    ops and ops without any fake-tensor positional argument.
+    """
+    if node.op != "call_function":
+        return 0
+    # only OpOverload targets are replayed below; view ops are costed at 0
+    if not isinstance(node.target, torch._ops.OpOverload) or node.target.is_view:
+        return 0
+    args = tree_map_only(torch.fx.Node, lambda x: x.meta["val"], node.args)
+    kwargs = tree_map_only(torch.fx.Node, lambda x: x.meta["val"], node.kwargs)
+    fake_cls = torch._subclasses.fake_tensor.FakeTensor
+    # default to None: a bare next() raises StopIteration without tensor inputs
+    fake_mode = next((a.fake_mode for a in args if isinstance(a, fake_cls)), None)
+    if fake_mode is None:
+        return 0
+    assert len(kwargs) == 0
+    args_shapes = tuple(_get_sharded_shape(spec) for spec in strategy.input_specs)
+
+    # the i-th positional tensor is rebuilt with the i-th sharded shape
+    counter = 0
+    args = list(args)
+    for i, arg in enumerate(args):
+        if isinstance(arg, torch.Tensor):
+            with fake_mode:
+                args[i] = torch.empty(args_shapes[counter], device=arg.device, dtype=arg.dtype)
+            counter += 1
+
+    # TODO: maybe cache the flop_counter to avoid recreating it all the time
+    with FlopCounterMode(display=False) as flop_counter:
+        node.target(*args, **kwargs)
+    flops = flop_counter.get_total_flops()
+    # TODO: fix this
+    dtype = strategy.input_specs[0].tensor_meta.dtype
+    # TODO: use PyTorch's version once it's giving correct results
+    gpu_flops = _get_device_tflops(dtype) * 10 ** 12
+    # suppose 50% efficiency for the operator
+    return (1 / 0.5) * flops / gpu_flops * 1e6  # us
diff --git a/autoparallel/optimize_sharding.py b/autoparallel/optimize_sharding.py
@@ -6,6 +6,7 @@
 from torch.distributed.tensor._dtensor_spec import DTensorSpec
 from torch.distributed.tensor.placement_types import Replicate, Shard
 from torch.utils._pytree import tree_flatten, tree_map_only
+from .compute_estimation import _get_sharded_shape, estimate_strategy_runtime_cost
 from .utils import get_placement_options
 
 
@@ -84,15 +85,16 @@ def build_ds(self):
                     "num_output_strat": len(s.strategies),
                 }
             for ss, ssi in enumerate(s.strategies):
+                compute_cost = estimate_strategy_runtime_cost(node, ssi)
                 for argi, xxi in enumerate(ssi.redistribute_cost):
-                    for ii, input_p in enumerate(xxi):
+                    for ii, comm_cost in enumerate(xxi):
                         va = pulp.LpVariable(
                             f"n={node},s={s_i},arg={argi},output_p={ss},input_p={ii}",
                             cat=pulp.LpBinary,
                         )
                         ds[(s_i, argi, ss, ii)] = {
                             "va": va,
-                            "cost": input_p,
+                            "cost": comm_cost + compute_cost,
                             "full_strat": ssi,
                             "out_strat": ssi.output_specs,
                             "inp_strat": ssi.input_specs[argi],
@@ -533,20 +535,8 @@ def add_parameter_memory_constraint(self, memory_factor_low, memory_factor_high)
             for ii in range(vv["num_output_strat"]):
                 data = self.ds[(s_i, 0, ii, 0)]
                 spec = data["inp_strat"]
-                mesh = spec.mesh
                 tensor_shape = spec.tensor_meta.shape
-                # TODO: take dtype into account as well
-                # tensor_dtype = spec.tensor_meta.dtype
-                placements = spec.placements
-                # TODO: find a better heuristic other than
-                # running DTensor
-                new_tensor_shape = list(tensor_shape)
-                for mesh_size, placement in zip(mesh.shape, placements):
-                    if placement.is_shard():
-                        dim = placement.dim
-                        new_tensor_shape[dim] = (
-                            new_tensor_shape[dim] + mesh_size - 1
-                        ) // mesh_size
+                new_tensor_shape = _get_sharded_shape(spec)
                 new_size = math.prod(new_tensor_shape)
                 old_size = math.prod(tensor_shape)
                 elms.append(data["va"] * new_size / old_size)