
Commit 88669aa

[Local Tensor] Replace dry_run.py with local tensor mode implementation
Replaces `dry_run.py` implementation with local tensor mode for DRY_RUN configuration validation. Local tensor mode provides deeper validation coverage, including `ParallelDims` creation, which the previous implementation could not verify.

**Note:** Currently returns early before `init_weights()` due to a known limitation in local tensor mode. This still validates more of the pipeline than the previous approach.

ghstack-source-id: 27b8bad
Pull-Request: #2057

1 parent: 22e959a

5 files changed: +74 -164 lines

run_train.sh

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}
 if [ "$DRY_RUN" = "1" ]; then
     # Dry run mode: validate configuration without GPU/distributed setup
     echo "Running in DRY RUN mode - configuration validation only"
-    python scripts/dry_run.py --job.config_file ${CONFIG_FILE} "$@"
+    NGPU="${NGPU}" LOCAL_RANK=0 python3 -m "${TRAIN_FILE}" --job.config_file "${CONFIG_FILE}" "$@" --comm.fake_backend --training.steps=1
 else
     # Normal training with torchrun
     PYTORCH_ALLOC_CONF="expandable_segments:True" \

scripts/dry_run.py

Lines changed: 0 additions & 159 deletions
This file was deleted.

torchtitan/config/job_config.py

Lines changed: 18 additions & 0 deletions

@@ -791,6 +791,24 @@ class Comm:
     save_traces_file_prefix: str = "rank_"
     """Flight recorder trace files prefix"""

+    fake_backend: bool = False
+    """Fake comm backend for dry run mode only"""
+
+    local_tensor_mode: bool = False
+    """
+    Local tensor mode for debugging purposes. There will be only one process
+    regardless of the number of GPUs. LocalTensor will simulate the
+    computation by running one rank after another. While the performance will
+    be slow, the numerics should be the same. This enables us to verify
+    numerics with fewer GPUs. For example, we can directly run 5D
+    parallelisms within a single node to reduce the combinations we need to
+    use in integration tests.
+
+    NOTE: This is an experimental feature.
+
+    NOTE: fake_backend should be set to True when local_tensor_mode is True.
+    """
+

 @dataclass
 class MemoryEstimation:
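
The two flags are meant to be used together: `local_tensor_mode` piggybacks on the fake backend, and `train.py` (below) rejects `local_tensor_mode=True` without `fake_backend=True`. A minimal sketch of that coupling, assuming the `Comm` fields added above and the `torchtitan.config.job_config` import path:

# Sketch only: mirrors the constraint that train.py enforces for the new flags.
from torchtitan.config.job_config import Comm

comm = Comm(fake_backend=True, local_tensor_mode=True)  # the dry-run combination

# Same check Trainer.init_distributed() performs (see torchtitan/train.py below).
if comm.local_tensor_mode and not comm.fake_backend:
    raise ValueError("LocalTensor can only be used with fake backend.")

On the command line both fields surface through torchtitan's `--<section>.<field>` overrides; this commit's run_train.sh passes `--comm.fake_backend`, and `--comm.local_tensor_mode` would presumably be passed the same way when local tensor debugging is wanted.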

torchtitan/distributed/utils.py

Lines changed: 34 additions & 1 deletion

@@ -258,12 +258,43 @@ def maybe_enable_amp(
     )


+def init_fake_mode(world_size: int) -> int:
+    """Initialize fake backend
+
+    Args:
+        world_size: The number of GPUs to simulate
+
+    Returns:
+        The world size
+    """
+    torch.distributed.init_process_group(
+        "fake",
+        rank=0,
+        world_size=world_size,
+    )
+    return world_size
+
+
 def init_distributed(
     comm_config: CommConfig,
     enable_cpu_backend: bool = False,
     base_folder: str = "",
     ranks: list[int] | None = None,
-):
+) -> int:
+    if comm_config.fake_backend:
+        ngpu_str = os.environ.get("NGPU")
+        if ngpu_str is None:
+            raise ValueError(
+                "NGPU environment variable must be set when using local_tensor_mode"
+            )
+        try:
+            world_size = int(ngpu_str)
+        except ValueError as e:
+            raise ValueError(
+                f"NGPU environment variable must be a valid integer, got: {ngpu_str}"
+            ) from e
+        return init_fake_mode(world_size)
+
     def _warn_overwrite_env(env, val):
         if env in os.environ:
             logger.warning(

@@ -309,6 +340,8 @@ def _get_distributed_backend(enable_cpu_backend):
         _ranks=ranks if ranks is not None else [],
     )

+    return torch.distributed.get_world_size()
+

 def set_pg_timeouts(timeout, world_mesh):
     """

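For orientation, the fake-backend path above reduces to a single-process `init_process_group("fake", ...)` call. A standalone sketch, with the init call taken from the diff (the world size of 4 is arbitrary; in the dry run it comes from the NGPU variable that run_train.sh forwards):

import torch.distributed as dist

world_size = 4
dist.init_process_group("fake", rank=0, world_size=world_size)

# This is the value the reworked init_distributed() now returns to the trainer.
assert dist.get_world_size() == world_size
assert dist.get_rank() == 0

dist.destroy_process_group()

No real NCCL/Gloo communicators are created, which is what lets the dry run "validate configuration without GPU/distributed setup", per the run_train.sh comment.
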
torchtitan/train.py

Lines changed: 21 additions & 3 deletions

@@ -11,6 +11,7 @@
 from typing import Any, Generator, Iterable

 import torch
+from torch.distributed import _local_tensor

 from torch.distributed.elastic.multiprocessing.errors import record

@@ -208,6 +209,12 @@ def __init__(self, job_config: JobConfig):
             self.loss_fn, self.gradient_accumulation_steps
         )

+        # TODO(local_tensor): Remove this early return once LocalTensor supports
+        # init_weights(). Currently skipping parallelism setup and model
+        # initialization in local tensor mode.
+        if job_config.comm.local_tensor_mode:
+            return
+
         # apply parallelisms and initialization
         if parallel_dims.pp_enabled:
             if not self.train_spec.pipelining_fn:

@@ -360,15 +367,19 @@ def __init__(self, job_config: JobConfig):

     def init_distributed(self) -> ParallelDims:
         job_config = self.job_config
-        dist_utils.init_distributed(
+        world_size = dist_utils.init_distributed(
             job_config.comm,
             enable_cpu_backend=job_config.training.enable_cpu_offload,
             base_folder=job_config.job.dump_folder,
         )

-        world_size = int(os.environ["WORLD_SIZE"])
-        parallelism_config = job_config.parallelism
+        if job_config.comm.local_tensor_mode:
+            if not job_config.comm.fake_backend:
+                raise ValueError("LocalTensor can only be used with fake backend.")
+            lm = _local_tensor.LocalTensorMode(world_size)
+            lm.__enter__()

+        parallelism_config = job_config.parallelism
         return ParallelDims(
             dp_shard=parallelism_config.data_parallel_shard_degree,
             dp_replicate=parallelism_config.data_parallel_replicate_degree,

@@ -718,6 +729,13 @@ def main(trainer_class: type[Trainer]) -> None:
     try:
         trainer = trainer_class(config)

+        # TODO(local_tensor): Remove this special case once LocalTensor supports
+        # init_weights(). In local tensor mode, skip training/checkpointing as
+        # the model is not fully initialized.
+        if config.comm.local_tensor_mode:
+            logger.info("Local tensor mode enabled - skipping training execution")
+            return
+
         if config.checkpoint.create_seed_checkpoint:
             assert (
                 int(os.environ["WORLD_SIZE"]) == 1
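
Taken together, the trainer's local tensor path amounts to a fake process group plus an open `LocalTensorMode`. A standalone sketch under the same assumptions; `_local_tensor` is a private, experimental torch.distributed module, only the constructor call is taken from the diff, and the with-block usage is an assumption based on the `__enter__()` call above:

import torch.distributed as dist
from torch.distributed import _local_tensor

world_size = 4  # in torchtitan this is NGPU, via the fake backend
dist.init_process_group("fake", rank=0, world_size=world_size)

# train.py keeps the mode open for the rest of __init__ via lm.__enter__();
# a with-block is the scoped equivalent for a standalone experiment. Work done
# inside it (e.g. ParallelDims creation in the dry run) is simulated rank by
# rank in this single process, per the Comm.local_tensor_mode docstring.
with _local_tensor.LocalTensorMode(world_size):
    pass  # construct ParallelDims / meshes here, as Trainer.init_distributed() does

dist.destroy_process_group()

The with-block is only for the standalone sketch; train.py enters the mode once and relies on the early returns above to end the dry run.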
