
Commit aa4183e

addressing L2 CI errors
1 parent 6d20d2b commit aa4183e

File tree: 1 file changed

tests/py/dynamo/distributed/test_nccl_ops.py (34 additions, 1 deletion)
@@ -1,3 +1,4 @@
+import glob
 import os
 import unittest
 
@@ -11,6 +12,7 @@
 )
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES
 
 
 def is_distributed_nccl_available():
@@ -31,6 +33,27 @@ def is_distributed_nccl_available():
     return False
 
 
+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL shared memory segments from /dev/shm.
+
+    In CI environments, previous test runs may leave behind NCCL shared memory
+    segments that fill up /dev/shm. This function removes them to prevent
+    "No space left on device" errors.
+    """
+    try:
+        nccl_files = glob.glob("/dev/shm/nccl-*")
+        for f in nccl_files:
+            try:
+                os.remove(f)
+            except (OSError, PermissionError):
+                # Ignore errors if file is in use or we lack permissions
+                pass
+    except Exception:
+        # If cleanup fails, continue anyway - test may still work
+        pass
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
@@ -75,13 +98,20 @@ def forward(self, x):
 
 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory from previous runs
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +122,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()
 
+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
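For a quick local sanity check of the two guards introduced in this commit, the short Python snippet below (illustrative only, not part of the commit) lists the stale NCCL shared-memory segments that cleanup_nccl_shared_memory() would remove, without deleting anything, and prints the feature gate consulted by the new skipIf decorator. It assumes a local torch_tensorrt install that exposes ENABLED_FEATURES.

import glob

from torch_tensorrt._features import ENABLED_FEATURES

# Dry run: list the /dev/shm segments the cleanup helper would remove (no deletion).
print(glob.glob("/dev/shm/nccl-*"))

# Check the TensorRT-LLM NCCL plugin feature gate used by the new skip condition.
print(ENABLED_FEATURES.trtllm_for_nccl)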
