
Commit f8befae

addressing L2 CI errors

committed
1 parent 6d20d2b

File tree

1 file changed: +106 -1 lines changed


tests/py/dynamo/distributed/test_nccl_ops.py

Lines changed: 106 additions & 1 deletion
@@ -1,6 +1,9 @@
+import glob
 import os
 import unittest
 
+os.environ.setdefault("NCCL_SHM_DISABLE", "1")
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
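
Note (editor's sketch, not part of this diff): os.environ.setdefault only applies the fallback "1" when NCCL_SHM_DISABLE is not already defined, so a value exported explicitly by the CI job still wins, and because the call runs before torch is imported the setting is in place by the time NCCL initializes. A minimal illustration of the setdefault behavior:

import os

os.environ["NCCL_SHM_DISABLE"] = "0"            # value exported by the CI environment
os.environ.setdefault("NCCL_SHM_DISABLE", "1")  # no effect: the variable is already set
print(os.environ["NCCL_SHM_DISABLE"])           # prints 0

del os.environ["NCCL_SHM_DISABLE"]
os.environ.setdefault("NCCL_SHM_DISABLE", "1")  # applies the fallback when unset
print(os.environ["NCCL_SHM_DISABLE"])           # prints 1
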
@@ -11,6 +14,7 @@
 )
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES
 
 
 def is_distributed_nccl_available():
@@ -31,11 +35,101 @@ def is_distributed_nccl_available():
     return False
 
 
+def get_shm_usage():
+    """Get /dev/shm usage statistics."""
+    try:
+        import shutil
+
+        total, used, free = shutil.disk_usage("/dev/shm")
+        return {
+            "total_mb": total / (1024 * 1024),
+            "used_mb": used / (1024 * 1024),
+            "free_mb": free / (1024 * 1024),
+        }
+    except Exception as e:
+        return {"error": str(e)}
+
+
+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL and torch shared memory segments from /dev/shm.
+
+    Previous CI test runs may leave behind SHM files that cause
+    "No space left on device" errors during NCCL initialization.
+    """
+    print("\n" + "=" * 60)
+    print("NCCL Shared Memory Cleanup")
+    print("=" * 60)
+
+    # Show /dev/shm usage before cleanup
+    usage_before = get_shm_usage()
+    print(f"Before cleanup - /dev/shm usage: {usage_before}")
+
+    # List ALL files in /dev/shm to see what's consuming space
+    print("\nAll files in /dev/shm:")
+    try:
+        shm_files = []
+        for f in os.listdir("/dev/shm"):
+            path = os.path.join("/dev/shm", f)
+            try:
+                size = os.path.getsize(path)
+                shm_files.append((path, size))
+            except OSError:
+                shm_files.append((path, -1))
+
+        # Sort by size descending
+        shm_files.sort(key=lambda x: x[1], reverse=True)
+        for path, size in shm_files:
+            if size >= 0:
+                print(f" {path}: {size / (1024 * 1024):.2f} MB")
+            else:
+                print(f" {path}: <unable to get size>")
+
+        if not shm_files:
+            print(" (no files found)")
+    except Exception as e:
+        print(f" Error listing /dev/shm: {e}")
+
+    patterns = [
+        "/dev/shm/nccl-*",
+        "/dev/shm/torch_*",
+        "/dev/shm/py_shared_memory_*",
+        "/dev/shm/*multiprocessing*",
+    ]
+
+    total_files = 0
+    total_bytes_freed = 0
+
+    for pattern in patterns:
+        files = glob.glob(pattern)
+        if files:
+            print(f"\nPattern: {pattern}")
+            for path in files:
+                try:
+                    file_size = os.path.getsize(path)
+                    os.remove(path)
+                    total_files += 1
+                    total_bytes_freed += file_size
+                    print(f" Removed: {path} ({file_size / (1024 * 1024):.2f} MB)")
+                except OSError as e:
+                    print(f" Failed to remove {path}: {e}")
+
+    # Show /dev/shm usage after cleanup
+    usage_after = get_shm_usage()
+    print(f"\nAfter cleanup - /dev/shm usage: {usage_after}")
+    print(f"Total files removed: {total_files}")
+    print(f"Total space freed: {total_bytes_freed / (1024 * 1024):.2f} MB")
+    print("=" * 60 + "\n")
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
     set_environment_variables_pytest_single_process()
 
+# Clean up stale NCCL shared memory BEFORE initializing process group
+cleanup_nccl_shared_memory()
+
 if not dist.is_initialized():
     dist.init_process_group(
         backend="nccl",
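
As a usage sketch (not taken from this diff), the same /dev/shm pressure that get_shm_usage() reports can be checked standalone before rerunning a CI job, since shutil.disk_usage works on any mounted path:

import shutil

# Mirrors get_shm_usage() above: /dev/shm filling up is what leads to the
# "No space left on device" failures during NCCL initialization.
total, used, free = shutil.disk_usage("/dev/shm")
print(f"/dev/shm: {used / 2**20:.1f} MB used, {free / 2**20:.1f} MB free of {total / 2**20:.1f} MB total")
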
@@ -75,13 +169,21 @@ def forward(self, x):
 
 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory from previous runs
+        # (kept to see if this is still needed)
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +194,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()
 
+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
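
One possible variation on the setUpClass/tearDownClass wiring, offered only as a hedged sketch and not something this commit does: unittest's addClassCleanup registers a callable that runs after tearDownClass, and also when setUpClass fails partway, so the post-test cleanup would still run if process-group setup raises. A self-contained illustration with a stand-in for the helper:

import unittest


def cleanup_nccl_shared_memory():
    # Stand-in for the helper added in this diff.
    print("cleanup ran")


class ExampleCleanupWiring(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cleanup_nccl_shared_memory()
        # Class-level cleanups run after tearDownClass, and also if setUpClass raises.
        cls.addClassCleanup(cleanup_nccl_shared_memory)

    def test_noop(self):
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()
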
