
Commit b899918

[WIP] Fix init_weights handling for param/buffer assignment
init_weights is a method a user module can supply to initialize all the parameters and buffers on the module.

Currently, we handle init_weights by reparametrization:
- we replace the original module's state_dict with a version containing parallel_module's states
- then we run init_weights, mutating these states

But if init_weights does something like `self.buf = _init_buf()` instead of `self.buf.copy_(_init_buf())`, we fail to capture this update.

This PR attempts to find these missing updates and copy them back to the parallel_module's states by:
1) assuming that if init_weights did an assignment, it would not create a DTensor, because init_weights and the orig module are supposed to be written in 'single gpu' style
2) finding any non-DTensors in the updated state_dict and converting them to new Replicate() DTensors, following the semantic that the newly assigned value represents the global value for the state
3) copy_-ing into the original state DTensor on the parallel_module, since this handles the conversion from Replicate() to Shard() if needed

TODO:
- verify this fixes the current init correctness problem with llama
- support params (currently only buffers are implemented)
- support nested names (a.b.c); currently only flat names work
- see if there is a better way to detect the assignment (cf. assumption 1 above)

Make hooked setter work for initializing params/buffers

ghstack-source-id: 1c0670b
Pull Request resolved: #66
1 parent 385d06e commit b899918
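For concreteness, here is a minimal standalone sketch of the failure mode described in the commit message; the module M, the buffer shape, and the swapped_in dict are hypothetical stand-ins, not code from this PR. An attribute assignment inside init_weights rebinds self.buf rather than writing into the swapped-in tensor, so the replacement state never sees the new values:

import torch
from torch import nn
from torch.nn.utils import stateless


class M(nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("buf", torch.zeros(4))

    def init_weights(self):
        # rebinds the attribute instead of mutating the current buffer in place
        self.buf = torch.ones(4)


m = M()
swapped_in = {"buf": torch.zeros(4)}  # stand-in for parallel_module's state
with stateless._reparametrize_module(m, swapped_in):
    m.init_weights()
print(swapped_in["buf"])  # still all zeros: the assignment bypassed the swapped-in tensor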

File tree: 3 files changed (+178, −13 lines)
- autoparallel/api.py
- autoparallel/init_weights.py
- tests/test_api.py


autoparallel/api.py

Lines changed: 20 additions & 13 deletions
@@ -6,6 +6,7 @@
 import copy
 import itertools
 from contextlib import ExitStack
+from types import MethodType
 from typing import Optional
 
 import torch
@@ -22,10 +23,10 @@
 from torch.distributed.tensor import DeviceMesh
 from torch.export._unlift import _assign_attr
 from torch.export.unflatten import _AttrKind
-from torch.nn.utils import stateless
 
 from .apply_sharding import apply_sharding_to_model
 from .cast_parametrization import apply_dtype_cast, canonicalize_mp, set_dtype_cast
+from .init_weights import hook_params_setters
 from .optimize_sharding import ShardingOptimizer
 from .utils import _get_device_from_mesh
 
@@ -175,6 +176,11 @@ def __init__(
         # in dtype casting and move_to_fake
         model = copy.deepcopy(model)
 
+        # keep a separate copy of the fake orig model to customize for supporting init_weights
+        self.init_weights_model = move_to_fake(
+            copy.deepcopy(model), self.fake_mode, device
+        )
+
         if self.mp_policy is not None:
             apply_dtype_cast(model, self.mp_policy)
 
@@ -431,6 +437,9 @@ def forward(self, *args):
 
         self.parallel_model = AutoParallelModule()
 
+        # We construct an unflattened structure on parallel_mod,
+        # e.g. _assign_attr(v, parallel_model, k="layers.0.weight") will literally
+        # create empty nn.Modules recursively and then stash 'v' so it shows up in the right spot
         for k, v in sharded_param_dict.items():
             _assign_attr(v, self.parallel_model, k, attr_kind=_AttrKind.PARAMETER)
 
@@ -439,20 +448,18 @@ def forward(self, *args):
 
         # Right now we require a convention that the user model provides an init_weights method,
         # although we could snoop for other methods too.
+        hook_params_setters(self.init_weights_model, self.parallel_model)
         if hasattr(self.model, "init_weights"):
 
-            def init_weights(*args, **kwargs):
-                with stateless._reparametrize_module(
-                    self.model, {**sharded_param_dict, **sharded_buffer_dict}
-                ):
-                    self.model.init_weights(*args, **kwargs)
+            def init_weights(_self, *args, **kwargs):
+                # this is now a deep-fake-copy of orig mod, so we don't have to use reparametrize
+                return self.init_weights_model.init_weights(*args, **kwargs)
 
-        else:
-            init_weights = None
-
-        # assign an init_weights method onto the output mod.
-        # all it does is sneakily run the original user mod's init_weights method,
-        # but with our new DTensor sharded params attached to the user module.
-        self.parallel_model.init_weights = init_weights
+            # assign an init_weights method onto the output mod.
+            # all it does is sneakily run the original user mod's init_weights method,
+            # but with our new DTensor sharded params attached to the user module.
+            self.parallel_model.init_weights = MethodType(
+                init_weights, self.parallel_model
+            )
 
         return self.parallel_model
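A quick aside on the MethodType binding above: binding the wrapper to parallel_model makes Python pass the parallel module as the first argument (the ignored _self), while the closure still calls init_weights on init_weights_model. A toy, self-contained illustration of the mechanism (names here are made up, not from this repo):

from types import MethodType


class Box:
    pass


def greet(self, name):
    return f"{type(self).__name__} says hi to {name}"


b = Box()
b.greet = MethodType(greet, b)  # instance-level bound method
print(b.greet("init_weights"))  # -> "Box says hi to init_weights"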

autoparallel/init_weights.py

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Any, Union
+
+import torch
+from torch._dynamo.utils import warn_once
+from torch.distributed.tensor import DTensor
+
+
+def _submod_setattr(model: torch.nn.Module, fqn: str, value: Any):
+    module_path, _, buffer_name = fqn.rpartition(".")
+    submod: torch.nn.Module = model.get_submodule(module_path)
+    setattr(submod, buffer_name, value)
+
+
+def _copy_set_value_to_dtensor(
+    fqn: str, parallel_value: DTensor, set_value: torch.Tensor
+):
+    # We expect the user wrote their module's init_weights in terms of a single-gpu model, so we do not expect
+    # set_value to be a DTensor already (since this would imply init_weights was written in a 'distributed' way),
+    # and we interpret it as a global tensor which we map to a Replicate() DTensor.
+    assert not isinstance(
+        set_value, DTensor
+    ), "Expected local/full tensor from setattr in init_weights, not DTensor."
+
+    # This creates a replicated DTensor
+    new_parallel_value = DTensor.from_local(
+        set_value, device_mesh=parallel_value.device_mesh
+    )
+    if parallel_value.placements != new_parallel_value.placements:
+        warn_once(
+            f"init_weights set a new value for {fqn}, "
+            f"but the existing value is already sharded ({parallel_value.placements=}), "
+            "and it is wasteful to materialize the new value as a global tensor. "
+            "Change init_weights to perform an inplace initialization instead if possible."
+        )
+    with torch.no_grad():
+        parallel_value.copy_(new_parallel_value)
+
+
+def _build_param_property(parallel_model: torch.nn.Module, fqn: str):
+    def getter(self) -> torch.nn.Parameter:
+        param = parallel_model.get_parameter(fqn)
+        return param
+
+    def setter(self, value: Union[torch.Tensor, torch.nn.Parameter]) -> None:
+        parallel_value = parallel_model.get_parameter(fqn)
+        assert isinstance(
+            parallel_value, DTensor
+        ), "Expected parallel_module params to be DTensors"
+        _copy_set_value_to_dtensor(fqn, parallel_value, value)
+
+    return property(getter, setter)
+
+
+def _build_buffer_property(parallel_model: torch.nn.Module, fqn: str):
+    def getter(self) -> torch.Tensor:
+        return parallel_model.get_buffer(fqn)
+
+    def setter(self, value: torch.Tensor) -> None:
+        parallel_value = parallel_model.get_buffer(fqn)
+        assert isinstance(
+            parallel_value, DTensor
+        ), "Expected parallel_module buffers to be DTensors"
+        _copy_set_value_to_dtensor(fqn, parallel_value, value)
+
+    return property(getter, setter)
+
+
+def hook_params_setters(
+    init_weights_model: torch.nn.Module, parallel_model: torch.nn.Module
+) -> None:
+    """
+    Replaces init_weights_model's parameters with hooked properties that let us
+    (a) return a new parameter (from our parallel_mod) instead of the one on the original model,
+        similar to using stateless.reparametrize
+    (b) detect if anyone tries to assign a new value to the parameter, e.g.
+        self.layer.weight = nn.Parameter(torch.randn(10, 10)),
+        which would not be properly captured if relying on parametrization alone
+
+    Assumes init_weights_model is a deepcopy of the user's original model, with all fake params. This way we can
+    modify the model to enable init_weights to work, without affecting the user's original model.
+
+    Adds one 'property' (i.e. getter+setter) obj for each parameter name at the right spot in
+    the module hierarchy. For self.layer.weight, this would install a 'weight' property on the self.layer
+    submodule.
+    """
+    for mod_name, mod in sorted(init_weights_model.named_modules()):
+        params_dict = dict(mod.named_parameters(recurse=False))
+        buffers_dict = dict(mod.named_buffers(recurse=False))
+
+        namespace = {}
+        for p_name in params_dict:
+            fqn = mod_name + "." + p_name
+            namespace[p_name] = _build_param_property(parallel_model, fqn)
+
+        for b_name in buffers_dict:
+            fqn = mod_name + "." + b_name
+            namespace[b_name] = _build_buffer_property(parallel_model, fqn)
+
+        cls = mod.__class__
+        # nn.Module.__setattr__ gets in the way (it registers Parameters/Tensors directly),
+        # so bypass it and let assignments hit the class-level properties above
+        namespace["__setattr__"] = object.__setattr__
+        mod.__class__ = type(f"HookedInit{cls.__name__}", (cls,), namespace)
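The core mechanism hook_params_setters relies on is swapping an instance's __class__ to a dynamically created subclass whose properties (data descriptors) intercept both reads and assignments, while object.__setattr__ bypasses nn.Module's own attribute bookkeeping so the property setter actually fires. Below is a self-contained sketch of just that trick; the captured dict is a toy stand-in for the copy into the parallel module's DTensor, not code from this PR:

import torch
from torch import nn

lin = nn.Linear(4, 4)
captured = {}


def _get_weight(self):
    # fall back to the real parameter until an assignment has been captured
    return captured.get("weight", self._parameters["weight"])


def _set_weight(self, value):
    captured["weight"] = value  # the PR copies into the parallel DTensor here instead


namespace = {
    "weight": property(_get_weight, _set_weight),
    "__setattr__": object.__setattr__,  # skip nn.Module.__setattr__'s Parameter handling
}
lin.__class__ = type(f"HookedInit{nn.Linear.__name__}", (nn.Linear,), namespace)

lin.weight = nn.Parameter(torch.ones(4, 4))  # routed through the property setter
assert torch.equal(captured["weight"], torch.ones(4, 4))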

tests/test_api.py

Lines changed: 52 additions & 0 deletions
@@ -6,6 +6,7 @@
 import pytest
 import torch
 from torch import nn
+from torch.distributed.tensor.placement_types import Shard
 from torch.testing._internal.distributed.fake_pg import FakeStore
 
 from autoparallel.api import AutoParallel
@@ -62,3 +63,54 @@ def input_fn():
         auto_p.model.get_parameter("linear.weight"), torch._subclasses.FakeTensor
     )
     assert isinstance(auto_p.model.get_buffer("buf"), torch._subclasses.FakeTensor)
+
+
+def test_init(device_mesh_1d):
+    dim = 128
+
+    class Model(nn.Module):
+        def __init__(self, dim):
+            super().__init__()
+            self.linear = nn.Linear(dim, dim)
+            self.register_buffer("buf", torch.empty(dim))
+
+        def forward(self, x):
+            return self.linear(x) + self.buf
+
+        def init_weights(self):
+            self.linear.weight = torch.nn.Parameter(torch.ones(dim, dim) * 9.0)
+            with torch.no_grad():
+                self.linear.bias.fill_(98.6)
+            self.buf = torch.arange(dim)
+
+    def input_fn():
+        b = 512
+        inputs = (torch.rand(b, dim, device="cuda"),)
+        return inputs
+
+    with torch.device("meta"):
+        model = Model(dim)
+    with AutoParallel(
+        model,
+        input_fn,
+        device_mesh_1d,
+    ) as autop:
+        x_sharding = (Shard(0),)
+        autop.add_input_constraints([x_sharding])
+        sharding_placement = autop.optimize_placement()
+
+        # AutoParallel produces a module with meta-DTensor parameters that need to be initialized
+        parallel_mod = autop.apply_placement(sharding_placement)
+        parallel_mod.to_empty(device="cuda")
+        parallel_mod.init_weights()
+        assert torch.equal(
+            parallel_mod.get_parameter("linear.weight").full_tensor(),
+            torch.full((dim, dim), 9.0, device="cuda"),
+        )
+        assert torch.equal(
+            parallel_mod.get_parameter("linear.bias").full_tensor(),
+            torch.full((dim,), 98.6, device="cuda"),
+        )
+        assert torch.equal(
+            parallel_mod.get_buffer("buf").full_tensor(), torch.arange(dim, device="cuda")
+        )
