# Autoparallel as an experiment in main #2054
Changes from 46 commits
Registering the experiment in the experiments overview table:

```diff
@@ -32,3 +32,4 @@ We provide this `experiments/` folder to host experiments that add significant v
 | [gpt_oss](./gpt_oss/) | TBA | [@jianiw](https://github.com/jianiw) |
 | [compiler_toolkit](./compiler_toolkit/) | [CI](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_compiler_toolkit.yaml?query=branch%3Amain) | [@SherlockNoMad](https://github.com/SherlockNoMad) [@yiming0416](https://github.com/yiming0416) |
 | [transformers_backend](./transformers_backend/) | [CI](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu_transformers_backend.yaml?query=branch%3Amain) | [@3outeille](https://github.com/3outeille) |
+| [auto_parallel](./auto_parallel/) | TBA | [@wconstab](https://github.com/wconstab) [@xmfan](https://github.com/xmfan) |
```
Extending the list of recognized experiment modules:

```diff
@@ -13,5 +13,7 @@
         "compiler_toolkit.deepseek_v3",
         "compiler_toolkit.llama3",
         "transformers_backend",
+        "auto_parallel.llama3",
+        "auto_parallel.deepseek_v3",
     ]
 )
```
New README for the experiment:

```diff
@@ -0,0 +1,19 @@
+## Auto Parallel
+
+### Overview
+
+The Auto Parallel experiment integrates PyTorch's AutoParallel framework with TorchTitan to automatically optimize distributed training parallelism strategies given a device mesh. Instead of manually configuring parallelism layouts, AutoParallel uses cost-based analysis to determine optimal sharding placements for model parameters, activations, and gradients.
+
+### Requirements
+
+Requires installing [AutoParallel](https://github.com/meta-pytorch/autoparallel) (`git@github.com:meta-pytorch/autoparallel.git`).
+
+### Single Node
+
+**Llama3**
+
+`CONFIG_FILE=./torchtitan/models/llama3/train_configs/debug_model.toml ./run_train.sh --model.name auto_parallel.llama3 --parallelism.tensor_parallel_degree 4 --job.custom_config_module=torchtitan.experiments.auto_parallel.job_config`
+
+**DeepSeekV3**
+
+`CONFIG_FILE=./torchtitan/models/deepseek_v3/train_configs/debug_model.toml ./run_train.sh --model.name auto_parallel.deepseek_v3 --job.custom_config_module=torchtitan.experiments.auto_parallel.job_config`
```
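For context on what the overview means by "sharding placements": below is a minimal, standalone DTensor sketch (not part of this PR; the script name is illustrative) of the manual placement decisions that AutoParallel's cost-based search is meant to make automatically.

```python
# Standalone illustration (not from this PR): hand-picking DTensor
# placements on a device mesh, which is the choice AutoParallel automates.
# Run with: torchrun --nproc_per_node 4 sharding_sketch.py
import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Replicate, Shard

mesh = init_device_mesh("cuda", (4,))  # 1-D mesh over 4 GPUs

weight = torch.randn(1024, 1024)
bias = torch.zeros(1024)

# Hand-picked placements: shard the weight rows across the mesh,
# replicate the small bias on every rank.
w_dt = distribute_tensor(weight, mesh, [Shard(0)])
b_dt = distribute_tensor(bias, mesh, [Replicate()])

# Each rank holds a 256 x 1024 shard of the weight.
print(w_dt.placements, w_dt.to_local().shape)
```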
New train spec module for the DeepSeek-V3 variant of the experiment:

```diff
@@ -0,0 +1,50 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Copyright (c) Meta Platforms, Inc. All Rights Reserved.
+
+import copy
+
+from torchtitan.components.loss import build_cross_entropy_loss
+from torchtitan.components.lr_scheduler import build_lr_schedulers
+from torchtitan.components.optimizer import build_optimizers_with_moe_load_balancing
+from torchtitan.components.tokenizer import build_hf_tokenizer
+from torchtitan.distributed.pipeline_parallel import pipeline_llm
+from torchtitan.hf_datasets.text_datasets import build_text_dataloader
+
+from torchtitan.models.deepseek_v3 import deepseekv3_args, DeepSeekV3Model
+from torchtitan.models.deepseek_v3.model.args import DeepSeekV3ModelArgs
+from torchtitan.models.deepseek_v3.model.state_dict_adapter import (
+    DeepSeekV3StateDictAdapter,
+)
+from torchtitan.protocols.train_spec import TrainSpec
+
+from .parallelize_deepseekv3 import parallelize_deepseekv3
+
+
+def get_train_spec() -> TrainSpec:
+    model_args = copy.deepcopy(deepseekv3_args)
+
+    # Configs not named as flex-attn variants fall back to the defaults.
+    default_args = DeepSeekV3ModelArgs()
+    for config, args in model_args.items():
+        if "flex_attn" in config:
+            continue
+        args.use_flex_attn = default_args.use_flex_attn
+        args.attn_mask_type = default_args.attn_mask_type
+
+    return TrainSpec(
+        model_cls=DeepSeekV3Model,
+        model_args=model_args,
+        parallelize_fn=parallelize_deepseekv3,
+        pipelining_fn=pipeline_llm,
+        build_optimizers_fn=build_optimizers_with_moe_load_balancing,
+        build_lr_schedulers_fn=build_lr_schedulers,
+        build_dataloader_fn=build_text_dataloader,
+        build_tokenizer_fn=build_hf_tokenizer,
+        build_loss_fn=build_cross_entropy_loss,
+        state_dict_adapter=DeepSeekV3StateDictAdapter,
+    )
```
**Review comments:**
> what is this trying to do? also do you really need a function for it?
> `tokens_per_expert_by_layer` is usually a plain tensor, so you need to call `dist.all_reduce` on it for stats; but if it's a DTensor, we just need to redistribute.
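A minimal sketch of the reviewer's suggestion (the helper name is an assumption, not code from this PR):

```python
import torch
import torch.distributed as dist
from torch.distributed.tensor import DTensor, Replicate

def globalize_expert_counts(t: torch.Tensor) -> torch.Tensor:
    """Return globally summed tokens-per-expert counts.

    Hypothetical helper covering both cases from the review comment.
    """
    if isinstance(t, DTensor):
        # A DTensor carries its device mesh, so redistributing to a
        # replicated placement performs the communication for us.
        return t.redistribute(placements=[Replicate()]).to_local()
    # Plain tensor: explicit all-reduce over the default process group.
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    return t
```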