
Commit 2ea29e4

addressing L2 CI errors
1 parent 6d20d2b commit 2ea29e4


1 file changed: +44 −1

tests/py/dynamo/distributed/test_nccl_ops.py

Lines changed: 44 additions & 1 deletion
@@ -1,3 +1,4 @@
+import glob
 import os
 import unittest
 
@@ -11,6 +12,7 @@
 )
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES
 
 
 def is_distributed_nccl_available():
@@ -31,11 +33,41 @@ def is_distributed_nccl_available():
     return False
 
 
+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL shared memory segments from /dev/shm.
+
+    In CI environments, previous test runs may leave behind NCCL shared memory
+    segments that fill up /dev/shm. This function removes them to prevent
+    "No space left on device" errors.
+    """
+    try:
+        nccl_files = glob.glob("/dev/shm/nccl-*")
+        for f in nccl_files:
+            try:
+                os.remove(f)
+            except (OSError, PermissionError):
+                # Ignore errors if the file is in use or we lack permissions
+                pass
+    except Exception:
+        # If cleanup fails, continue anyway - the tests may still work
+        pass
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
     set_environment_variables_pytest_single_process()
 
+# Clean up stale NCCL shared memory BEFORE initializing the process group
+cleanup_nccl_shared_memory()
+
+# Configure NCCL to use less shared memory in constrained CI environments.
+# NCCL_SHM_DISABLE=1 disables the shared-memory transport and uses sockets
+# instead; this is slower but avoids /dev/shm space issues.
+if not os.environ.get("NCCL_SHM_DISABLE"):
+    os.environ["NCCL_SHM_DISABLE"] = "1"
+
 if not dist.is_initialized():
     dist.init_process_group(
         backend="nccl",
@@ -75,13 +107,21 @@ def forward(self, x):
 
 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory from previous runs
+        # TODO: check whether this is still needed
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +132,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()
 
+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
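A quick way to sanity-check the cleanup path locally is sketched below. This is illustrative only and not part of the commit: it assumes the script runs from tests/py/dynamo/distributed/ so that the helper is importable as test_nccl_ops, and that /dev/shm is the tmpfs NCCL uses on the machine.

# Illustrative sketch, not part of the commit. Assumes it is run from
# tests/py/dynamo/distributed/ so that test_nccl_ops is importable.
import os
import shutil

from test_nccl_ops import cleanup_nccl_shared_memory


def shm_free_mib() -> float:
    # Free space in /dev/shm, in MiB
    return shutil.disk_usage("/dev/shm").free / (1024 * 1024)


before = shm_free_mib()
cleanup_nccl_shared_memory()  # removes stale /dev/shm/nccl-* segments
print(f"/dev/shm free: {before:.1f} MiB -> {shm_free_mib():.1f} MiB")

# Mirror the module-level fallback: force the socket transport when shared
# memory is constrained (slower, but avoids "No space left on device")
os.environ.setdefault("NCCL_SHM_DISABLE", "1")

Because the module only sets NCCL_SHM_DISABLE when it is not already present, exporting the variable in the CI job definition would override this default.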

0 commit comments
