Skip to content

Commit bfdc974

Browse files
authored
[compiler toolkit] Port manual bucketing from SimpleFSDP experiment (#2056)
This PR integrates the manual bucketing pass (transformer block bucketing), added in the SimpleFSDP experiment (#1881), into the compiler toolkit. Now the compiler toolkit can also run the manual bucketing pass by specifying the config ``` NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.llama3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4 --job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config --compile.passes transformer_block_bucketing ``` Also updated the README and integration tests to include the newly ported pass.
1 parent 3819737 commit bfdc974

File tree

7 files changed

+106
-13
lines changed

7 files changed

+106
-13
lines changed

torchtitan/experiments/compiler_toolkit/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./r
3434
NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.llama3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4 --job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config --compile.passes autobucketing_reordering
3535
```
3636

37+
**SimpleFSDP + TP + transformer-block-bucketing**
38+
```shell
39+
NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.llama3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4 --job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config --compile.passes transformer_block_bucketing
40+
```
41+
3742
**SimpleFSDP + TP + FlexAttention**
3843
```shell
3944
NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.llama3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4 --model.flavor=debugmodel_flex_attn
@@ -44,3 +49,9 @@ NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./r
4449
```shell
4550
NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.llama3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4 --job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config --compile.passes autobucketing_reordering,regional_inductor
4651
```
52+
53+
**SimpleFSDP + TP + FlexAttention + transformer-block-bucketing + regional-inductor**
54+
55+
```shell
56+
NGPU=8 CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name compiler_toolkit.llama3 --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4 --job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config --compile.passes transformer_block_bucketing,regional_inductor
57+
```

torchtitan/experiments/compiler_toolkit/deepseek_v3/parallelize.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def parallelize_deepseekv3(
8080
joint_custom_passes = get_joint_custom_passes_from_config(parallel_dims, job_config)
8181

8282
# Get compiler passes from config
83-
compiler_passes = get_compiler_passes_from_config(job_config)
83+
compiler_passes = get_compiler_passes_from_config(model, job_config)
8484

8585
# Create compilers with specified passes (defaults to no passes)
8686
fw_compiler, bw_compiler = make_compiler_with_passes(

torchtitan/experiments/compiler_toolkit/graph_utils.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def joint_graph_builder(
112112
tracing_context,
113113
) = export_joint(model, model_args, model_kwargs, dump_folder=dump_folder)
114114

115-
# Optional validation
115+
# run custom passes on joint-graph before partitioner
116116
if joint_custom_passes is not None:
117117
for joint_custom_pass in joint_custom_passes:
118118
joint_with_descriptors.graph_module = joint_custom_pass(
@@ -240,7 +240,12 @@ def compiler(
240240
_dump_gm(dump_folder, gm, f"{name}_before_compiler")
241241

242242
for pass_fn in passes:
243-
logger.info(f"Applying pass: {pass_fn.__name__}")
243+
pass_name = (
244+
pass_fn.func.__name__
245+
if isinstance(pass_fn, functools.partial)
246+
else pass_fn.__name__
247+
)
248+
logger.info(f"Applying pass: {pass_name}")
244249
gm = pass_fn(gm, example_inputs)
245250

246251
logger.debug(f"{name} after compiler:")
@@ -277,7 +282,7 @@ def bw_compiler(gm: torch.fx.GraphModule, example_inputs) -> None:
277282
return fw_compiler, bw_compiler
278283

279284

280-
def get_compiler_passes_from_config(job_config: JobConfig):
285+
def get_compiler_passes_from_config(model: torch.nn.Module, job_config: JobConfig):
281286
"""
282287
Extract and validate compiler passes from job config.
283288
@@ -288,8 +293,18 @@ def get_compiler_passes_from_config(job_config: JobConfig):
288293
List of compiler pass functions
289294
"""
290295
from torchtitan.experiments.compiler_toolkit.passes import AVAILABLE_COMPILER_PASSES
296+
from torchtitan.experiments.simple_fsdp.llama3.parallelize import (
297+
get_transformer_block_buckets,
298+
)
291299

292300
pass_names = getattr(job_config.compile, "passes", [])
301+
if (
302+
"autobucketing_reordering" in pass_names
303+
and "transformer_block_bucketing" in pass_names
304+
):
305+
raise ValueError(
306+
"Cannot apply autobucketing_reordering and transformer_block_bucketing at the same time!"
307+
)
293308
compiler_passes = []
294309

295310
for pass_name in pass_names:
@@ -298,7 +313,15 @@ def get_compiler_passes_from_config(job_config: JobConfig):
298313
f"Unknown compiler pass: {pass_name}. "
299314
f"Available compiler passes: {list(AVAILABLE_COMPILER_PASSES.keys())}"
300315
)
301-
compiler_passes.append(AVAILABLE_COMPILER_PASSES[pass_name])
316+
if pass_name == "transformer_block_bucketing":
317+
compiler_passes.append(
318+
functools.partial(
319+
AVAILABLE_COMPILER_PASSES[pass_name],
320+
fsdp_manual_buckets=get_transformer_block_buckets(model),
321+
)
322+
)
323+
else:
324+
compiler_passes.append(AVAILABLE_COMPILER_PASSES[pass_name])
302325

303326
if pass_names:
304327
logger.info(f"Using compiler passes from config: {pass_names}")

torchtitan/experiments/compiler_toolkit/llama3/parallelize.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def parallelize_llama(
6767
joint_custom_passes = get_joint_custom_passes_from_config(parallel_dims, job_config)
6868

6969
# Get compiler passes from config
70-
compiler_passes = get_compiler_passes_from_config(job_config)
70+
compiler_passes = get_compiler_passes_from_config(model, job_config)
7171

7272
# Create compilers with specified passes (defaults to no passes)
7373
fw_compiler, bw_compiler = make_compiler_with_passes(

torchtitan/experiments/compiler_toolkit/passes.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
"""
1313

1414
import torch
15+
from torch._inductor.fx_passes.overlap_manual_scheduling import manual_overlap_bucketing
1516
from torch._inductor.fx_passes.overlap_scheduling import schedule_overlap_bucketing
1617
from torch.fx.passes.regional_inductor import regional_inductor
1718
from torchtitan.experiments.simple_fsdp.reshard_after_forward import (
@@ -26,13 +27,26 @@ def autobucketing_reordering_pass(
2627
Apply autobucketing and reordering optimization.
2728
2829
This pass applies schedule_overlap_bucketing with collective_bucketing enabled
29-
to optimize communication patterns in distributed training.
30+
to optimize comm/compute overlap patterns in the graph.
3031
"""
3132
schedule_overlap_bucketing(gm, collective_bucketing=True)
3233
gm.recompile()
3334
return gm
3435

3536

37+
def transformer_block_bucketing_reordering_pass(
38+
gm: torch.fx.GraphModule, example_inputs, fsdp_manual_buckets
39+
) -> torch.fx.GraphModule:
40+
"""
41+
Apply aten-level manual bucketing and reordering optimization.
42+
"""
43+
manual_overlap_bucketing(
44+
gm, module_bucket_plans=fsdp_manual_buckets, insert_overlap_deps=False
45+
)
46+
gm.recompile()
47+
return gm
48+
49+
3650
def regional_inductor_pass(
3751
gm: torch.fx.GraphModule, example_inputs
3852
) -> torch.fx.GraphModule:
@@ -72,5 +86,6 @@ def fsdp_reshard_after_fwd_pass(
7286
# Registry mapping pass names to pass functions
7387
AVAILABLE_COMPILER_PASSES = {
7488
"autobucketing_reordering": autobucketing_reordering_pass,
89+
"transformer_block_bucketing": transformer_block_bucketing_reordering_pass,
7590
"regional_inductor": regional_inductor_pass,
7691
}

torchtitan/experiments/compiler_toolkit/tests/integration_tests.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ def build_compiler_toolkit_test_list() -> list[OverrideDefinitions]:
2424
"--model.name compiler_toolkit.llama3",
2525
"--parallelism.data_parallel_shard_degree 2",
2626
"--parallelism.tensor_parallel_degree 2",
27-
"--activation_checkpoint.mode none",
2827
],
2928
],
3029
"llama3 FSDP+TP",
@@ -37,7 +36,6 @@ def build_compiler_toolkit_test_list() -> list[OverrideDefinitions]:
3736
"--model.name compiler_toolkit.llama3",
3837
"--parallelism.data_parallel_shard_degree 2",
3938
"--parallelism.tensor_parallel_degree 2",
40-
"--activation_checkpoint.mode none",
4139
"--job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config",
4240
"--compile.passes autobucketing_reordering",
4341
],
@@ -46,14 +44,27 @@ def build_compiler_toolkit_test_list() -> list[OverrideDefinitions]:
4644
"llama3_fsdp_tp_autobucketing",
4745
ngpu=4,
4846
),
47+
OverrideDefinitions(
48+
[
49+
[
50+
"--model.name compiler_toolkit.llama3",
51+
"--parallelism.data_parallel_shard_degree 2",
52+
"--parallelism.tensor_parallel_degree 2",
53+
"--job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config",
54+
"--compile.passes transformer_block_bucketing",
55+
],
56+
],
57+
"llama3 FSDP+TP manualbucketing",
58+
"llama3_fsdp_tp_manualbucketing",
59+
ngpu=4,
60+
),
4961
OverrideDefinitions(
5062
[
5163
[
5264
"--model.name compiler_toolkit.llama3",
5365
"--parallelism.data_parallel_shard_degree 2",
5466
"--parallelism.tensor_parallel_degree 2",
5567
"--model.flavor debugmodel_flex_attn",
56-
"--activation_checkpoint.mode none",
5768
],
5869
],
5970
"llama3 FSDP+TP+FlexAttn",
@@ -67,7 +78,6 @@ def build_compiler_toolkit_test_list() -> list[OverrideDefinitions]:
6778
"--parallelism.data_parallel_shard_degree 2",
6879
"--parallelism.tensor_parallel_degree 2",
6980
"--model.flavor debugmodel_flex_attn",
70-
"--activation_checkpoint.mode none",
7181
"--job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config",
7282
"--compile.passes autobucketing_reordering,regional_inductor",
7383
],
@@ -76,6 +86,21 @@ def build_compiler_toolkit_test_list() -> list[OverrideDefinitions]:
7686
"llama3_fsdp_tp_flexattn_autobucketing_regional_inductor",
7787
ngpu=4,
7888
),
89+
OverrideDefinitions(
90+
[
91+
[
92+
"--model.name compiler_toolkit.llama3",
93+
"--parallelism.data_parallel_shard_degree 2",
94+
"--parallelism.tensor_parallel_degree 2",
95+
"--model.flavor debugmodel_flex_attn",
96+
"--job.custom_config_module=torchtitan.experiments.compiler_toolkit.job_config",
97+
"--compile.passes transformer_block_bucketing,regional_inductor",
98+
],
99+
],
100+
"llama3 FSDP+TP+FlexAttn manualbucketing regional_inductor",
101+
"llama3_fsdp_tp_flexattn_manualbucketing_regional_inductor",
102+
ngpu=4,
103+
),
79104
# deepseek_v3 tests
80105
OverrideDefinitions(
81106
[

torchtitan/experiments/compiler_toolkit/tests/test_numerics.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,30 @@ def test_llama3_fsdp_tp_autobucketing(self):
4242
ac_mode="selective",
4343
steps=10,
4444
seed=42,
45-
eager_tb_folder="tb/test_llama3_fsdp_tp_eager",
46-
compiled_tb_folder="tb/test_llama3_fsdp_tp_compiled",
45+
eager_tb_folder="tb/test_llama3_fsdp_tp_autobucketing_eager",
46+
compiled_tb_folder="tb/test_llama3_fsdp_tp_autobucketing_compiled",
4747
metrics=["loss_metrics/global_avg_loss", "grad_norm"],
4848
passes="autobucketing_reordering",
4949
)
50+
self.assertTrue(result, "Llama3 FSDP+TP+autobucketing numerics test failed")
51+
52+
def test_llama3_fsdp_tp_manualbucketing(self):
53+
result = run_numerics_test(
54+
ngpu=4,
55+
config_file="./torchtitan/models/llama3/train_configs/debug_model.toml",
56+
dp_shard_degree=2,
57+
tp_degree=2,
58+
cp_degree=1,
59+
ep_degree=1,
60+
ac_mode="selective",
61+
steps=10,
62+
seed=42,
63+
eager_tb_folder="tb/test_llama3_fsdp_tp_manualbucketing_eager",
64+
compiled_tb_folder="tb/test_llama3_fsdp_tp_manualbucketing_compiled",
65+
metrics=["loss_metrics/global_avg_loss", "grad_norm"],
66+
passes="transformer_block_bucketing",
67+
)
68+
self.assertTrue(result, "Llama3 FSDP+TP+manualbucketing numerics test failed")
5069

5170
def test_deepseek_v3_fsdp_tp_ep(self):
5271
"""Test DeepSeek V3 with FSDP + TP + EP configuration."""

0 commit comments

Comments (0)