Commit 4bd18f5

[Local Tensor] Replace dry_run.py with local tensor mode implementation
Replaces the `dry_run.py` implementation with local tensor mode for DRY_RUN configuration validation. Local tensor mode provides deeper validation coverage, including `ParallelDims` creation, which the previous implementation could not verify.

**Note:** The trainer currently returns early, before `init_weights()`, due to a known limitation in local tensor mode. This still validates more of the pipeline than the previous approach.

ghstack-source-id: c37e849
Pull-Request: #2057
1 parent 22e959a commit 4bd18f5
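
At its core, the new dry-run path initializes a fake process group and enters the experimental LocalTensorMode context, simulating NGPU ranks inside a single process; that is enough to exercise `ParallelDims` creation without GPUs or torchrun. A minimal sketch mirroring the helper added in torchtitan/distributed/utils.py below (it assumes a PyTorch build that ships the experimental torch.distributed._local_tensor module and the "fake" backend; the world size of 8 is only illustrative):

import torch
from torch.distributed import _local_tensor

world_size = 8  # stands in for the NGPU value exported by run_train.sh
torch.distributed.init_process_group("fake", rank=0, world_size=world_size)
_local_tensor.LocalTensorMode(world_size).__enter__()

print(torch.distributed.get_world_size())  # 8, with no GPUs or torchrun involved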

5 files changed: +55 -163 lines

run_train.sh

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ TORCHFT_LIGHTHOUSE=${TORCHFT_LIGHTHOUSE:-"http://localhost:29510"}
 if [ "$DRY_RUN" = "1" ]; then
     # Dry run mode: validate configuration without GPU/distributed setup
     echo "Running in DRY RUN mode - configuration validation only"
-    python scripts/dry_run.py --job.config_file ${CONFIG_FILE} "$@"
+    NGPU="${NGPU}" LOCAL_RANK=0 python3 -m "${TRAIN_FILE}" --job.config_file "${CONFIG_FILE}" "$@" --comm.local_tensor_mode
 else
     # Normal training with torchrun
     PYTORCH_ALLOC_CONF="expandable_segments:True" \

scripts/dry_run.py

Lines changed: 0 additions & 159 deletions
This file was deleted.

torchtitan/config/job_config.py

Lines changed: 3 additions & 0 deletions
@@ -791,6 +791,9 @@ class Comm:
     save_traces_file_prefix: str = "rank_"
     """Flight recorder trace files prefix"""

+    local_tensor_mode: bool = False
+    """Local tensor mode, for debugging purposes. This is an experimental feature."""
+

 @dataclass
 class MemoryEstimation:
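
Because torchtitan's config system maps dataclass fields to CLI flags, this new field is what run_train.sh toggles via --comm.local_tensor_mode. A small illustration in Python (assuming the remaining Comm fields keep their defaults):

from torchtitan.config.job_config import Comm

comm = Comm(local_tensor_mode=True)  # defaults to False; experimental, debugging only
# init_distributed() in torchtitan/distributed/utils.py checks this flag and,
# when set, takes the fake-backend + LocalTensorMode path instead of creating
# a real process group.
print(comm.local_tensor_mode)  # True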

torchtitan/distributed/utils.py

Lines changed: 37 additions & 1 deletion
@@ -14,6 +14,7 @@
 import torch.distributed._functional_collectives as funcol
 import torch.distributed.distributed_c10d as c10d
 from torch import distributed as dist
+from torch.distributed import _local_tensor
 from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.tensor import DTensor

@@ -258,12 +259,45 @@ def maybe_enable_amp(
     )


+def init_local_tensor_mode(world_size: int) -> int:
+    """Initialize local tensor mode for debugging purposes.
+
+    Args:
+        world_size: The number of GPUs to simulate
+
+    Returns:
+        The world size
+    """
+    torch.distributed.init_process_group(
+        "fake",
+        rank=0,
+        world_size=world_size,
+    )
+    lm = _local_tensor.LocalTensorMode(world_size)
+    lm.__enter__()
+    return world_size
+
+
 def init_distributed(
     comm_config: CommConfig,
     enable_cpu_backend: bool = False,
     base_folder: str = "",
     ranks: list[int] | None = None,
-):
+) -> int:
+    if comm_config.local_tensor_mode:
+        ngpu_str = os.environ.get("NGPU")
+        if ngpu_str is None:
+            raise ValueError(
+                "NGPU environment variable must be set when using local_tensor_mode"
+            )
+        try:
+            world_size = int(ngpu_str)
+        except ValueError as e:
+            raise ValueError(
+                f"NGPU environment variable must be a valid integer, got: {ngpu_str}"
+            ) from e
+        return init_local_tensor_mode(world_size)
+
     def _warn_overwrite_env(env, val):
         if env in os.environ:
             logger.warning(

@@ -309,6 +343,8 @@ def _get_distributed_backend(enable_cpu_backend):
         _ranks=ranks if ranks is not None else [],
     )

+    return torch.distributed.get_world_size()
+

 def set_pg_timeouts(timeout, world_mesh):
     """

torchtitan/train.py

Lines changed: 14 additions & 2 deletions
@@ -208,6 +208,12 @@ def __init__(self, job_config: JobConfig):
             self.loss_fn, self.gradient_accumulation_steps
         )

+        # TODO(local_tensor): Remove this early return once LocalTensor supports
+        # init_weights(). Currently skipping parallelism setup and model initialization
+        # in local tensor mode.
+        if job_config.comm.local_tensor_mode:
+            return
+
         # apply parallelisms and initialization
         if parallel_dims.pp_enabled:
             if not self.train_spec.pipelining_fn:

@@ -360,13 +366,12 @@ def __init__(self, job_config: JobConfig):

     def init_distributed(self) -> ParallelDims:
         job_config = self.job_config
-        dist_utils.init_distributed(
+        world_size = dist_utils.init_distributed(
             job_config.comm,
             enable_cpu_backend=job_config.training.enable_cpu_offload,
             base_folder=job_config.job.dump_folder,
         )

-        world_size = int(os.environ["WORLD_SIZE"])
         parallelism_config = job_config.parallelism

         return ParallelDims(

@@ -718,6 +723,13 @@ def main(trainer_class: type[Trainer]) -> None:
     try:
         trainer = trainer_class(config)

+        # TODO(local_tensor): Remove this special case once LocalTensor supports
+        # init_weights(). In local tensor mode, skip training/checkpointing as the
+        # model is not fully initialized
+        if config.comm.local_tensor_mode:
+            logger.info("Local tensor mode enabled - skipping training execution")
+            return
+
         if config.checkpoint.create_seed_checkpoint:
             assert (
                 int(os.environ["WORLD_SIZE"]) == 1
