Skip to content

Commit 6833fec

Browse files
committed
addressing L2 CI errors
1 parent 6d20d2b commit 6833fec

File tree

1 file changed

+81
-1
lines changed

1 file changed

+81
-1
lines changed

tests/py/dynamo/distributed/test_nccl_ops.py

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
import glob
12
import os
23
import unittest
34

5+
os.environ.setdefault("NCCL_SHM_DISABLE", "1")
6+
47
import torch
58
import torch.distributed as dist
69
import torch.nn as nn
@@ -11,6 +14,7 @@
1114
)
1215
from parameterized import parameterized
1316
from torch.testing._internal.common_utils import run_tests
17+
from torch_tensorrt._features import ENABLED_FEATURES
1418

1519

1620
def is_distributed_nccl_available():
@@ -31,11 +35,76 @@ def is_distributed_nccl_available():
3135
return False
3236

3337

38+
def get_shm_usage():
39+
"""Get /dev/shm usage statistics."""
40+
try:
41+
import shutil
42+
43+
total, used, free = shutil.disk_usage("/dev/shm")
44+
return {
45+
"total_mb": total / (1024 * 1024),
46+
"used_mb": used / (1024 * 1024),
47+
"free_mb": free / (1024 * 1024),
48+
}
49+
except Exception as e:
50+
return {"error": str(e)}
51+
52+
53+
def cleanup_nccl_shared_memory():
54+
"""
55+
Clean up stale NCCL and torch shared memory segments from /dev/shm.
56+
57+
Previous CI test runs may leave behind SHM files that cause
58+
"No space left on device" errors during NCCL initialization.
59+
"""
60+
print("\n" + "=" * 60)
61+
print("NCCL Shared Memory Cleanup")
62+
print("=" * 60)
63+
64+
# Show /dev/shm usage before cleanup
65+
usage_before = get_shm_usage()
66+
print(f"Before cleanup - /dev/shm usage: {usage_before}")
67+
68+
patterns = [
69+
"/dev/shm/nccl-*",
70+
"/dev/shm/torch_*",
71+
"/dev/shm/py_shared_memory_*",
72+
"/dev/shm/*multiprocessing*",
73+
]
74+
75+
total_files = 0
76+
total_bytes_freed = 0
77+
78+
for pattern in patterns:
79+
files = glob.glob(pattern)
80+
if files:
81+
print(f"\nPattern: {pattern}")
82+
for path in files:
83+
try:
84+
file_size = os.path.getsize(path)
85+
os.remove(path)
86+
total_files += 1
87+
total_bytes_freed += file_size
88+
print(f" Removed: {path} ({file_size / (1024 * 1024):.2f} MB)")
89+
except OSError as e:
90+
print(f" Failed to remove {path}: {e}")
91+
92+
# Show /dev/shm usage after cleanup
93+
usage_after = get_shm_usage()
94+
print(f"\nAfter cleanup - /dev/shm usage: {usage_after}")
95+
print(f"Total files removed: {total_files}")
96+
print(f"Total space freed: {total_bytes_freed / (1024 * 1024):.2f} MB")
97+
print("=" * 60 + "\n")
98+
99+
34100
if "OMPI_COMM_WORLD_SIZE" in os.environ:
35101
set_environment_variables_pytest_multi_process()
36102
else:
37103
set_environment_variables_pytest_single_process()
38104

105+
# Clean up stale NCCL shared memory BEFORE initializing process group
106+
cleanup_nccl_shared_memory()
107+
39108
if not dist.is_initialized():
40109
dist.init_process_group(
41110
backend="nccl",
@@ -75,13 +144,21 @@ def forward(self, x):
75144

76145
class TestNcclOpsConverter(DispatchTestCase):
77146
# 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
78-
# 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
147+
# 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
79148
@unittest.skipIf(
80149
not is_distributed_nccl_available(),
81150
"Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
82151
)
152+
@unittest.skipIf(
153+
not ENABLED_FEATURES.trtllm_for_nccl,
154+
"Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
155+
)
83156
@classmethod
84157
def setUpClass(cls):
158+
# Clean up stale NCCL shared memory from previous runs
159+
# to see if this is needed
160+
cleanup_nccl_shared_memory()
161+
85162
cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
86163
cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
87164
cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +169,9 @@ def tearDownClass(cls):
92169
if dist.is_initialized():
93170
dist.destroy_process_group()
94171

172+
# Clean up NCCL shared memory after tests complete
173+
cleanup_nccl_shared_memory()
174+
95175
@parameterized.expand([8])
96176
def test_nccl_ops_gather(self, linear_layer_dim):
97177
inputs = [torch.randn(1, linear_layer_dim).to("cuda")]

0 commit comments

Comments
 (0)