1+ import glob
12import os
23import unittest
34
5+ os .environ .setdefault ("NCCL_SHM_DISABLE" , "1" )
6+
47import torch
58import torch .distributed as dist
69import torch .nn as nn
1114)
1215from parameterized import parameterized
1316from torch .testing ._internal .common_utils import run_tests
17+ from torch_tensorrt ._features import ENABLED_FEATURES
1418
1519
1620def is_distributed_nccl_available ():
@@ -31,11 +35,76 @@ def is_distributed_nccl_available():
3135 return False
3236
3337
def get_shm_usage():
    """Return /dev/shm usage statistics in megabytes.

    Returns:
        dict: ``{"total_mb", "used_mb", "free_mb"}`` on success, or
        ``{"error": <message>}`` when /dev/shm cannot be inspected
        (e.g. on platforms without a shared-memory tmpfs).
    """
    # shutil is stdlib; only disk_usage() itself can fail, so keep the
    # import outside the try and catch only OSError (missing /dev/shm,
    # permission problems) instead of a blanket Exception.
    import shutil

    try:
        total, used, free = shutil.disk_usage("/dev/shm")
    except OSError as e:
        return {"error": str(e)}

    mb = 1024 * 1024
    return {
        "total_mb": total / mb,
        "used_mb": used / mb,
        "free_mb": free / mb,
    }
51+
52+
def cleanup_nccl_shared_memory():
    """
    Clean up stale NCCL and torch shared memory segments from /dev/shm.

    Previous CI test runs may leave behind SHM files that cause
    "No space left on device" errors during NCCL initialization.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("NCCL Shared Memory Cleanup")
    print(banner)

    # Report /dev/shm usage before anything is removed.
    usage_before = get_shm_usage()
    print(f"Before cleanup - /dev/shm usage: {usage_before}")

    # File-name patterns left behind by NCCL, torch, and multiprocessing.
    patterns = (
        "/dev/shm/nccl-*",
        "/dev/shm/torch_*",
        "/dev/shm/py_shared_memory_*",
        "/dev/shm/*multiprocessing*",
    )

    total_files = 0
    total_bytes_freed = 0

    for pattern in patterns:
        matches = glob.glob(pattern)
        if not matches:
            continue
        print(f"\nPattern: {pattern}")
        for path in matches:
            try:
                file_size = os.path.getsize(path)
                os.remove(path)
            except OSError as e:
                # File vanished in the meantime, is a directory, or is
                # owned by another user — log and keep going.
                print(f"  Failed to remove {path}: {e}")
            else:
                total_files += 1
                total_bytes_freed += file_size
                print(f"  Removed: {path} ({file_size / (1024 * 1024):.2f} MB)")

    # Report /dev/shm usage after cleanup plus a summary.
    usage_after = get_shm_usage()
    print(f"\nAfter cleanup - /dev/shm usage: {usage_after}")
    print(f"Total files removed: {total_files}")
    print(f"Total space freed: {total_bytes_freed / (1024 * 1024):.2f} MB")
    print(banner + "\n")
98+
99+
# Pick the rendezvous configuration based on how the tests were launched:
# OMPI_COMM_WORLD_SIZE is only present when running under mpirun (OpenMPI),
# i.e. a true multi-process run; otherwise fall back to single-process env.
if "OMPI_COMM_WORLD_SIZE" in os.environ:
    set_environment_variables_pytest_multi_process()
else:
    set_environment_variables_pytest_single_process()

# Clean up stale NCCL shared memory BEFORE initializing process group —
# leftover /dev/shm segments from earlier runs can make NCCL init fail
# with "No space left on device".
cleanup_nccl_shared_memory()
107+
39108if not dist .is_initialized ():
40109 dist .init_process_group (
41110 backend = "nccl" ,
@@ -75,13 +144,21 @@ def forward(self, x):
75144
76145class TestNcclOpsConverter (DispatchTestCase ):
77146 # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
78- # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
147+ # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
79148 @unittest .skipIf (
80149 not is_distributed_nccl_available (),
81150 "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported)." ,
82151 )
152+ @unittest .skipIf (
153+ not ENABLED_FEATURES .trtllm_for_nccl ,
154+ "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13)." ,
155+ )
83156 @classmethod
84157 def setUpClass (cls ):
158+ # Clean up stale NCCL shared memory from previous runs
159+ # to see if this is needed
160+ cleanup_nccl_shared_memory ()
161+
85162 cls .world_size = int (os .environ .get ("OMPI_COMM_WORLD_SIZE" , 1 ))
86163 cls .rank = int (os .environ .get ("OMPI_COMM_WORLD_RANK" , 0 ))
87164 cls .group = dist .new_group (ranks = list (range (cls .world_size )))
@@ -92,6 +169,9 @@ def tearDownClass(cls):
92169 if dist .is_initialized ():
93170 dist .destroy_process_group ()
94171
172+ # Clean up NCCL shared memory after tests complete
173+ cleanup_nccl_shared_memory ()
174+
95175 @parameterized .expand ([8 ])
96176 def test_nccl_ops_gather (self , linear_layer_dim ):
97177 inputs = [torch .randn (1 , linear_layer_dim ).to ("cuda" )]
0 commit comments