@@ -1,3 +1,4 @@
+import glob
 import os
 import unittest
 
@@ -11,6 +12,7 @@
 )
from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES
 
 
 def is_distributed_nccl_available():
@@ -31,6 +33,27 @@ def is_distributed_nccl_available():
     return False
 
 
+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL shared memory segments from /dev/shm.
+
+    In CI environments, previous test runs may leave behind NCCL shared memory
+    segments that fill up /dev/shm. This function removes them to prevent
+    "No space left on device" errors.
+    """
+    try:
+        nccl_files = glob.glob("/dev/shm/nccl-*")
+        for f in nccl_files:
+            try:
+                os.remove(f)
+            except (OSError, PermissionError):
+                # Ignore errors if file is in use or we lack permissions
+                pass
+    except Exception:
+        # If cleanup fails, continue anyway - test may still work
+        pass
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
@@ -75,13 +98,20 @@ def forward(self, x):
 
 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory from previous runs
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +122,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()
 
+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]