[WIP] Fix init_weights handling for param/buffer assignment

wconstab · wconstab · commit abfd789046c9 · 2025-08-01T07:18:38.000-07:00
init_weights is a method a user module could supply, which initializes all the parameters and buffers on the module. Currently, we handle init_weights by reparametrization: - we replace the original module's state_dict with a version containing parallel_module's states - then we run init_weights, mutating these states. But if init_weights does something like `self.buf = _init_buf()` instead of doing something like `self.buf.copy_(_init_buf())`, we fail to capture this update. This PR attempts to find these missing updates and then copy them back to the parallel_module's states by: 1) assuming that if init_weights did an assignment, it would not create a DTensor, because init_weights and orig module are supposed to be written in 'single gpu' style. 2) finding any non-DTensors in the updated state_dict and converting them to new Replicate() DTensors, following the semantics that the new assigned value should represent the global value for the state. 3) copy_ into the original state DTensor on the parallel_module, since this handles the case of converting Replicate() to Shard() if needed. TODO: - verify this fixes the current init correctness problem with llama - support params (currently only implemented buffers) - support nested names (a.b.c), currently only flat names work - see if there is a better way to detect the assignment (e.g. #1 above) Make hooked setter work for initializing params/buffers ghstack-source-id: 1c0670b Pull Request resolved: #66
diff --git a/autoparallel/api.py b/autoparallel/api.py
@@ -6,6 +6,7 @@
 import copy
 import itertools
 from contextlib import ExitStack
+from types import MethodType
 from typing import Optional
 
 import torch
@@ -22,10 +23,10 @@
 from torch.distributed.tensor import DeviceMesh
 from torch.export._unlift import _assign_attr
 from torch.export.unflatten import _AttrKind
-from torch.nn.utils import stateless
 
 from .apply_sharding import apply_sharding_to_model
 from .cast_parametrization import apply_dtype_cast, canonicalize_mp, set_dtype_cast
+from .init_weights import hook_params_setters
 from .optimize_sharding import ShardingOptimizer
 from .utils import _get_device_from_mesh
 
@@ -175,6 +176,11 @@ def __init__(
         # in dtype casting and move_to_fake
         model = copy.deepcopy(model)
 
+        # keep a separate copy of the fake orig model to customize for supporting init_weights
+        self.init_weights_model = move_to_fake(
+            copy.deepcopy(model), self.fake_mode, device
+        )
+
         if self.mp_policy is not None:
             apply_dtype_cast(model, self.mp_policy)
 
@@ -431,6 +437,10 @@ def forward(self, *args):
 
         self.parallel_model = AutoParallelModule()
 
+        # We construct an unflattened structure on parallel_model,
+        # e.g. _assign_attr(v, parallel_model, k="layers.0.weight") will literally
+        # create empty nn.Modules recursively and then stash 'v' so it shows up in the right spot.
+        # Open question: what happened to the original 'flattened' parameters on parallel_model -- did we delete those?
         for k, v in sharded_param_dict.items():
             _assign_attr(v, self.parallel_model, k, attr_kind=_AttrKind.PARAMETER)
 
@@ -439,20 +449,18 @@ def forward(self, *args):
 
         # Right now we require a convention that the user model provides an init_weights method,
         # although we could snoop for other methods too.
+        hook_params_setters(self.init_weights_model, self.parallel_model)
         if hasattr(self.model, "init_weights"):
 
-            def init_weights(*args, **kwargs):
-                with stateless._reparametrize_module(
-                    self.model, {**sharded_param_dict, **sharded_buffer_dict}
-                ):
-                    self.model.init_weights(*args, **kwargs)
+            def init_weights(_self, *args, **kwargs):
+                # this is now a deep-fake-copy of orig mod, so we don't have to use reparametrize
+                return self.init_weights_model.init_weights(*args, **kwargs)
 
-        else:
-            init_weights = None
-
-        # assign an init_weights method onto the output mod.
-        # all it does is sneakily run the original user mod's init_weights method,
-        # but with our new DTensor sharded params attached to the user module.
-        self.parallel_model.init_weights = init_weights
+            # assign an init_weights method onto the output mod.
+            # all it does is sneakily run the original user mod's init_weights method,
+            # but with our new DTensor sharded params attached to the user module.
+            self.parallel_model.init_weights = MethodType(
+                init_weights, self.parallel_model
+            )
 
         return self.parallel_model
diff --git a/autoparallel/init_weights.py b/autoparallel/init_weights.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from torch.distributed.tensor import DTensor
+
+
+def _submod_setattr(model, fqn, value):
+    module_path, _, buffer_name = fqn.rpartition(".")
+    submod: torch.nn.Module = model.get_submodule(module_path)
+    setattr(submod, buffer_name, value)
+
+
+def _build_param_property(parallel_model, fqn):
+    def getter(self, _fqn=fqn):
+        param = parallel_model.get_parameter(_fqn)
+        return param
+
+    def setter(self, value):
+        orig_value = parallel_model.get_parameter(fqn)
+        new_value = DTensor.from_local(value, device_mesh=orig_value.device_mesh)
+        if isinstance(orig_value, torch.nn.Parameter):
+            new_value = torch.nn.Parameter(new_value)
+        _submod_setattr(parallel_model, fqn, new_value)
+
+    return property(getter, setter)
+
+
+def _build_buffer_property(parallel_model, fqn):
+    def getter(self):
+        return parallel_model.get_buffer(fqn)
+
+    def setter(self, value):
+        orig_value = parallel_model.get_buffer(fqn)
+        new_value = DTensor.from_local(value, device_mesh=orig_value.device_mesh)
+        _submod_setattr(parallel_model, fqn, new_value)
+
+    return property(getter, setter)
+
+
+def hook_params_setters(model, parallel_model):
+    """
+    Replaces model's parameters with hooked properties that let us
+     (a) return a new parameter (from our parallel_mod) instead of the one on the original model,
+         similar to using stateless.reparametrize
+     (b) also, detect if anyone tries to assign a new value to the parameter, e.g.
+         self.layer.weight = nn.Parameter(torch.randn(10, 10))
+         would not be properly captured if relying on parametrization alone
+
+    Adds one 'property' (e.g. getter+setter) obj for each parameter name at the right spot in
+    the module hierarchy.  For self.layer.weight, this would install a 'weight' property on the self.layer
+    submodule.
+    """
+    for mod_name, mod in sorted(model.named_modules()):
+        params_dict = dict(mod.named_parameters(recurse=False))
+        buffers_dict = dict(mod.named_buffers(recurse=False))
+
+        namespace = {}
+        for p_name in params_dict:
+            fqn = mod_name + "." + p_name
+            namespace[p_name] = _build_param_property(parallel_model, fqn)
+
+        for b_name in buffers_dict:
+            fqn = mod_name + "." + b_name
+            namespace[b_name] = _build_buffer_property(parallel_model, fqn)
+
+        cls = mod.__class__
+        # nn.Module.__setattr__ gets in the way
+        namespace["__setattr__"] = object.__setattr__
+        mod.__class__ = type(f"HookedInit{cls.__name__}", (cls,), namespace)
+
+    return model
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -6,6 +6,7 @@
 import pytest
 import torch
 from torch import nn
+from torch.distributed.tensor.placement_types import Shard
 from torch.testing._internal.distributed.fake_pg import FakeStore
 
 from autoparallel.api import AutoParallel
@@ -62,3 +63,54 @@ def input_fn():
         auto_p.model.get_parameter("linear.weight"), torch._subclasses.FakeTensor
     )
     assert isinstance(auto_p.model.get_buffer("buf"), torch._subclasses.FakeTensor)
+
+
+def test_init(device_mesh_1d):
+    dim = 128
+
+    class Model(nn.Module):
+        def __init__(self, dim):
+            super().__init__()
+            self.linear = nn.Linear(dim, dim)
+            self.register_buffer("buf", torch.empty(dim))
+
+        def forward(self, x):
+            return self.linear(x) + self.buf
+
+        def init_weights(self):
+            self.linear.weight = torch.nn.Parameter(torch.ones(dim, dim) * 9.0)
+            with torch.no_grad():
+                self.linear.bias.fill_(98.6)
+            self.buf = torch.arange(dim)
+
+    def input_fn():
+        b = 512
+        inputs = (torch.rand(b, dim, device="cuda"),)
+        return inputs
+
+    with torch.device("meta"):
+        model = Model(dim)
+    with AutoParallel(
+        model,
+        input_fn,
+        device_mesh_1d,
+    ) as autop:
+        x_sharding = (Shard(0),)
+        autop.add_input_constraints([x_sharding])
+        sharding_placement = autop.optimize_placement()
+
+        # AutoParallel produces a module with meta-DTensor parameters that need to be initialized
+        parallel_mod = autop.apply_placement(sharding_placement)
+    parallel_mod.to_empty(device="cuda")
+    parallel_mod.init_weights()
+    assert torch.equal(
+        parallel_mod.get_parameter("linear.weight").full_tensor(),
+        torch.full((dim, dim), 9.0, device="cuda"),
+    )
+    assert torch.equal(
+        parallel_mod.get_parameter("linear.bias").full_tensor(),
+        torch.full((dim,), 98.6, device="cuda"),
+    )
+    assert torch.equal(
+        parallel_mod.get_buffer("buf").full_tensor(), torch.arange(dim, device="cuda")
+    )