Commit b1c4909

Enable Zero2: No reshard after forward (#238)

* Enable no reshard after forward (Zero2)
* Linting and logging fixes

1 parent 3088776 commit b1c4909

3 files changed, +103 -5 lines changed

autoparallel/_passes/split_fsdp_collectives.py

Lines changed: 7 additions & 4 deletions
@@ -17,6 +17,7 @@
 from torch._inductor.fx_passes.bucketing import (
     is_all_gather_into_tensor,
     is_reduce_scatter_tensor,
+    is_wait_tensor,
 )


@@ -77,6 +78,7 @@ def split_fsdp_prefetch(
             prefetch_g_outs_map.append(param_g_in)
         else:
             w_n = next(iter(last_ag.users))
+            assert is_wait_tensor(w_n)
             prefetch_g_outs_map.append(w_n)

     prefetch_g_outs = prefetch_g_outs_map
@@ -126,7 +128,7 @@ def split_fsdp_reduce_scatters_epilogue(
     grad_outs_map = []
     for grad_out in grad_outs:
         n = grad_out
-        last_rs = None
+        earliest_rs = None
         while n is not None:
             if len(n.all_input_nodes) != 1:
                 break
@@ -135,12 +137,13 @@ def split_fsdp_reduce_scatters_epilogue(
                 break
             prev_n = n
             n = n_in
+            # Maybe we also need to track all_reduce?
             if is_reduce_scatter_tensor(prev_n):
                 # In AP for mesh dim > 1
                 # The reduction of gradients happen in multiple steps
-                last_rs = n
-        if last_rs is not None:
-            grad_outs_map.append(last_rs)
+                earliest_rs = n
+        if earliest_rs is not None:
+            grad_outs_map.append(earliest_rs)
         else:
             grad_outs_map.append(grad_out)

autoparallel/activation_checkpointing.py

Lines changed: 90 additions & 0 deletions
@@ -66,6 +66,94 @@ def is_wait_tensor_from_fsdp(node: torch.fx.Node) -> bool:
 # mypy: ignore-errors


+def force_save_fsdp_all_gather(graph: torch.fx.Graph) -> None:
+    """
+    Force save all_gather nodes from simple fsdp in the graph.
+    This pass should be added in torch._inductor.config.joint_custom_post_pass
+    """
+    nodes_to_save = []
+    primal_origins = []
+    primals = graph.find_nodes(op="placeholder")
+    # 1. Find last all_gather from each placeholder
+    for primal in primals:
+        node = primal
+        last_ag_chain_node = None
+        while True:
+            if len(node.users) != 1:
+                break
+            user = next(iter(node.users))
+            if len(user.all_input_nodes) > 1:
+                break
+            node = user
+            if is_all_gather_into_tensor(node):
+                last_ag_chain_node = node
+        if last_ag_chain_node is not None:
+            # 2. Find last wait_tensor from last all_gather
+            last_ag_wait_node = next(iter(last_ag_chain_node.users))
+            assert is_wait_tensor(last_ag_wait_node)
+            assert is_wait_tensor_from_fsdp(last_ag_wait_node)
+            # 3. Continue the linear chain from the last wait_tensor
+            w = last_ag_wait_node
+            while True:
+                if len(w.users) != 1:
+                    # Capture this pattern:
+                    # %wait_tensor_5 : [num_users=1] = call_function[target=torch.ops._c10d_functional.wait_tensor.default](args = (%all_gather_into_tensor_5,), kwargs = {})  # noqa: E501
+                    # %split : [num_users=4] = call_function[target=torch.ops.aten.split.Tensor](args = (%wait_tensor_5, 576), kwargs = {})
+                    # %getitem_2 : [num_users=1] = call_function[target=operator.getitem](args = (%split, 0), kwargs = {})
+                    # %getitem_3 : [num_users=1] = call_function[target=operator.getitem](args = (%split, 1), kwargs = {})
+                    # %getitem_4 : [num_users=1] = call_function[target=operator.getitem](args = (%split, 2), kwargs = {})
+                    # %getitem_5 : [num_users=1] = call_function[target=operator.getitem](args = (%split, 3), kwargs = {})
+                    # %cat_1 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%getitem_2, %getitem_3, %getitem_4, %getitem_5], 1), kwargs = {})  # noqa: E501
+                    if (
+                        w.op == "call_function"
+                        and w.target == torch.ops.aten.split.Tensor
+                    ):
+                        if all(
+                            split_user.op == "call_function"
+                            and split_user.target == operator.getitem
+                            and len(split_user.users) == 1
+                            for split_user in w.users
+                        ):
+                            getitem_users = list(
+                                next(iter(getitem_node.users))
+                                for getitem_node in w.users
+                            )
+                            potential_cat_op = getitem_users[0]
+                            if all(
+                                potential_cat_op == getitem_user
+                                for getitem_user in getitem_users
+                            ) and (
+                                potential_cat_op.op == "call_function"
+                                and potential_cat_op.target
+                                == torch.ops.aten.cat.default
+                            ):
+                                w = potential_cat_op
+                                continue
+                    break
+                user = next(iter(w.users))
+                if len(user.all_input_nodes) > 1:
+                    break
+                w = user
+            # 4. Stores the last node in this chain as `last_wait_chain_user`
+            last_wait_chain_user = w
+            # 5. Check if the last node in this chain is used in backward
+            is_used_in_backward = False
+            for downstream_user in last_wait_chain_user.users:
+                if _has_tag_is_backward(downstream_user):
+                    is_used_in_backward = True
+                    break
+            if is_used_in_backward:
+                # 6. If the last node in this chain is used in backward, only then we save the wait_tensor
+                nodes_to_save.append(last_ag_wait_node)
+                primal_origins.append(primal)
+    logger.info("force_save_fsdp_all_gather, primal_origins: %s", primal_origins)
+    logger.info("force_save_fsdp_all_gather, nodes_to_save: %s", nodes_to_save)
+
+    for node in nodes_to_save:
+        node.meta["recompute"] = CheckpointPolicy.MUST_SAVE
+        node.meta["ac_graph_id"] = AP_AC_GRAPH_ID
+
+
 def force_recompute_fsdp_all_gather(graph: torch.fx.Graph) -> None:
     """
     Force recompute all_gather nodes from simple fsdp in the graph.
@@ -354,6 +442,8 @@ def ac_joint_pass(
 ):
     if reshard_after_forward:
         force_recompute_fsdp_all_gather(graph)
+    else:
+        force_save_fsdp_all_gather(graph)
     mark_nodes_as_must_save_to_stage_recomputation(
         graph, stage_size_in_GiB=ac_stage_size_in_GiB
     )
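
For context, a minimal registration sketch based on the docstring of the new force_save_fsdp_all_gather pass, which says it should be added in torch._inductor.config.joint_custom_post_pass. The import path simply mirrors the file location above; that the hook is invoked with the joint forward+backward torch.fx.Graph is an assumption consistent with the pass signature, not something this commit shows.

import torch._inductor.config as inductor_config

# Assumed import path, mirroring autoparallel/activation_checkpointing.py.
from autoparallel.activation_checkpointing import force_save_fsdp_all_gather

# Run the force-save pass on the joint graph; the all_gather wait_tensor nodes
# it tags with CheckpointPolicy.MUST_SAVE keep their gathered output alive for
# backward instead of being re-gathered, i.e. the no-reshard-after-forward
# (ZeRO-2 style) behavior this commit enables.
inductor_config.joint_custom_post_pass = force_save_fsdp_all_gather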

examples/example_pp_graph_passes.py

Lines changed: 6 additions & 1 deletion
@@ -39,7 +39,12 @@ def _get_pp_module_and_graphs(
 ) -> tuple[torch.nn.Module, GraphCallables, GraphMeta]:

     with AutoParallelPP(
-        model, tracing_input_fn, mesh, dynamic=True, reshard_after_forward=False
+        model,
+        tracing_input_fn,
+        mesh,
+        dynamic=True,
+        compile=False,
+        reshard_after_forward=False,
     ) as autop:
         autop.add_parameter_memory_constraint(low=None, high=None)
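
As a usage note, a hedged sketch of how the reshard_after_forward flag in this example selects between the two activation-checkpointing passes wired up in ac_joint_pass above. Only AutoParallelPP and its keyword arguments appear in this commit; the wrapper function and the import path are illustrative.

from autoparallel.api import AutoParallelPP  # import path assumed, not shown in this diff


def open_autop(model, tracing_input_fn, mesh, zero2: bool = True):
    # reshard_after_forward=False keeps the all-gathered parameters for the
    # backward pass (ZeRO-2 style, as enabled by this commit); True reshards
    # after forward and re-gathers them in backward (ZeRO-3 style).
    return AutoParallelPP(
        model,
        tracing_input_fn,
        mesh,
        dynamic=True,
        compile=False,
        reshard_after_forward=not zero2,
    )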
