52 | 52 | _APPLY_VIEW_MM_VIEW_PATTERN = False |
53 | 53 |
54 | 54 |
| 55 | +# Assign attribute 'from_obj' to the qualified name 'target' on 'to_module'.
| 56 | +# This installs empty Modules where none exist yet along the subpaths of 'target'.
| 57 | +def assign_attr(
| 58 | +    from_module: torch.nn.Module,
| 59 | +    from_obj: Union[torch.Tensor, torch.ScriptObject, torch.nn.Module],
| 60 | +    to_module: torch.nn.Module,
| 61 | +    target: str,
| 62 | +    attr_kind: _AttrKind,
| 63 | +    persistent: bool = True,
| 64 | +):
| 65 | +    # _assign_attr creates every missing intermediate field as a plain nn.Module;
| 66 | +    # this patch adds support for nn.ModuleDict intermediates (used by torchtitan)
| 67 | +    *prefix, field = target.split(".")
| 68 | +    module_map = {to_module: from_module}
| 69 | +    for item in prefix:
| 70 | +        submod_map: dict[torch.nn.Module, torch.nn.Module] = {}
| 71 | +        for t_module, f_module in module_map.items():
| 72 | +            if not hasattr(t_module, item):
| 73 | +                from_item = getattr(f_module, item, None)
| 74 | +                if isinstance(from_item, torch.nn.ModuleDict):
| 75 | +                    setattr(t_module, item, torch.nn.ModuleDict())
| 76 | +                elif isinstance(from_item, torch.nn.Module):
| 77 | +                    setattr(t_module, item, torch.nn.Module())
| 78 | +                else:
| 79 | +                    raise RuntimeError(
| 80 | +                        f"Unsupported type {type(from_item)} for item {item}"
| 81 | +                    )
| 82 | +            from_children = f_module._modules.items()
| 83 | +            to_children = t_module._modules.items()
| 84 | +            # >= may seem odd, but it's because to_module is being mutated
| 85 | +            assert len(from_children) >= len(to_children)
| 86 | +            new_submods = {}
| 87 | +            for (f_attr_name, f_call), (t_attr_name, t_call) in zip(
| 88 | +                from_children, to_children
| 89 | +            ):
| 90 | +                assert f_attr_name == t_attr_name
| 91 | +                new_submods[t_call] = f_call
| 92 | +
| 93 | +            submod_map.update(new_submods)
| 94 | +        module_map = submod_map
| 95 | +
| 96 | +    _assign_attr(from_obj, to_module, target, attr_kind, persistent)
| 97 | +
| 98 | +
55 | 99 | def _get_decomp_table(): |
56 | 100 |     decomp_table = copy.copy(select_decomp_table())
57 | 101 |     # TODO: removing those as they cause missing DTensor propagation rules
@@ -550,10 +594,14 @@ def _register_params_and_init_weights( |
550 | 594 |         # e.g. _assign_attr(v, parallel_model, k="layers.0.weight") will literally
551 | 595 |         # create empty nn.Modules recursively and then stash 'v' so it shows up in the right spot
552 | 596 |         for k, v in sharded_param_dict.items():
553 | | -            _assign_attr(v, self.parallel_model, k, attr_kind=_AttrKind.PARAMETER)
| 597 | +            assign_attr(
| 598 | +                self.model, v, self.parallel_model, k, attr_kind=_AttrKind.PARAMETER
| 599 | +            )
554 | 600 |
555 | 601 |         for k, v in sharded_buffer_dict.items():
556 | | -            _assign_attr(v, self.parallel_model, k, attr_kind=_AttrKind.BUFFER)
| 602 | +            assign_attr(
| 603 | +                self.model, v, self.parallel_model, k, attr_kind=_AttrKind.BUFFER
| 604 | +            )
557 | 605 |
558 | 606 |         # Right now we require a convention that the user model provides an init_weights method,
559 | 607 |         # although we could snoop for other methods too.
@@ -644,10 +692,10 @@ def _register_params_and_buffers(self, sharded_param_dict, sharded_buffer_dict): |
644 | 692 |         # e.g. _assign_attr(v, parallel_model, k="layers.0.weight") will literally
645 | 693 |         # create empty nn.Modules recursively and then stash 'v' so it shows up in the right spot
646 | 694 |         for k, v in sharded_param_dict.items():
647 | | -            _assign_attr(v, self, k, attr_kind=_AttrKind.PARAMETER)
| 695 | +            assign_attr(self.model, v, self, k, attr_kind=_AttrKind.PARAMETER)
648 | 696 |
649 | 697 |         for k, v in sharded_buffer_dict.items():
650 | | -            _assign_attr(v, self, k, attr_kind=_AttrKind.BUFFER)
| 698 | +            assign_attr(self.model, v, self, k, attr_kind=_AttrKind.BUFFER)
651 | 699 |
652 | 700 |     def forward(self, *args):
653 | 701 |         raise NotImplementedError("This is a placeholder for the pipeline model")
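As a usage illustration (not part of the diff; the helper `_mirror_containers`, the `Tiny` model, and the tensor shapes are hypothetical), here is a minimal sketch of the container-mirroring idea behind `assign_attr`: while walking the dotted prefix of `target`, any intermediate module missing on the destination is created with the same container type as in the source model, so an `nn.ModuleDict` (as used by torchtitan) stays an `nn.ModuleDict` instead of degrading to a plain `nn.Module`.

```python
# Minimal sketch of the container-mirroring idea; names are hypothetical and
# this is not the PR's implementation.
import torch.nn as nn


def _mirror_containers(src: nn.Module, dst: nn.Module, target: str) -> None:
    # Walk the dotted prefix of `target`, creating any missing intermediate
    # attribute on `dst` with the same container type (nn.ModuleDict vs. plain
    # nn.Module) as the corresponding attribute on `src`.
    *prefix, _leaf = target.split(".")
    for name in prefix:
        src = getattr(src, name)
        if not hasattr(dst, name):
            child = nn.ModuleDict() if isinstance(src, nn.ModuleDict) else nn.Module()
            setattr(dst, name, child)
        dst = getattr(dst, name)


class Tiny(nn.Module):  # hypothetical source model that uses nn.ModuleDict
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleDict({"0": nn.Linear(4, 4)})


source, shell = Tiny(), nn.Module()  # shell plays the role of parallel_model
_mirror_containers(source, shell, "layers.0.weight")
assert isinstance(shell.layers, nn.ModuleDict)  # container type is preserved
# The leaf attribute itself ("weight") would then be attached by _assign_attr.
```

The PR's `assign_attr` additionally keeps a map from each destination submodule to its source-side counterpart so deeper levels can be matched up, and then delegates the final leaf assignment to `_assign_attr`.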