1+ import glob
12import os
23import unittest
34
5+ os .environ .setdefault ("NCCL_SHM_DISABLE" , "1" )
6+
47import torch
58import torch .distributed as dist
69import torch .nn as nn
1114)
1215from parameterized import parameterized
1316from torch .testing ._internal .common_utils import run_tests
17+ from torch_tensorrt ._features import ENABLED_FEATURES
1418
1519
1620def is_distributed_nccl_available ():
@@ -31,11 +35,101 @@ def is_distributed_nccl_available():
3135 return False
3236
3337
def get_shm_usage(shm_dir="/dev/shm"):
    """Return disk-usage statistics for a shared-memory directory.

    Args:
        shm_dir: Directory to inspect. Defaults to ``/dev/shm``.

    Returns:
        A dict with ``total_mb``, ``used_mb`` and ``free_mb`` (floats, in MiB),
        or ``{"error": <message>}`` when the usage could not be read (for
        example because the directory does not exist).
    """
    # Best-effort diagnostic: the broad catch (which also covers the import)
    # is deliberate so that a failure here never aborts the caller.
    try:
        import shutil

        total, used, free = shutil.disk_usage(shm_dir)
    except Exception as e:
        return {"error": str(e)}

    return {
        "total_mb": total / (1024 * 1024),
        "used_mb": used / (1024 * 1024),
        "free_mb": free / (1024 * 1024),
    }


def _print_shm_listing(shm_dir):
    """Print every entry of *shm_dir* with its size, largest first."""
    print(f"\n All files in {shm_dir}:")
    try:
        entries = []
        for name in os.listdir(shm_dir):
            path = os.path.join(shm_dir, name)
            try:
                size = os.path.getsize(path)
            except OSError:
                # -1 marks "size unknown"; such entries sort last.
                size = -1
            entries.append((path, size))

        entries.sort(key=lambda entry: entry[1], reverse=True)
        for path, size in entries:
            if size >= 0:
                print(f" {path} : {size / (1024 * 1024):.2f} MB")
            else:
                print(f" {path} : <unable to get size>")

        if not entries:
            print(" (no files found)")
    except Exception as e:
        # Listing is purely informational; report the problem and carry on.
        print(f" Error listing {shm_dir}: {e} ")


def _remove_stale_segments(shm_dir):
    """Delete files in *shm_dir* matching the known stale-segment patterns.

    Returns:
        A ``(files_removed, bytes_freed)`` tuple.
    """
    name_patterns = (
        "nccl-*",
        "torch_*",
        "py_shared_memory_*",
        "*multiprocessing*",
    )
    # Escape the directory so only the file-name part acts as a wildcard.
    base = glob.escape(shm_dir)

    # The patterns overlap (e.g. "torch_multiprocessing_x" matches two of
    # them). Remember what was already handled so a file is never visited
    # twice and then misreported as a removal failure.
    handled = set()
    files_removed = 0
    bytes_freed = 0

    for name_pattern in name_patterns:
        pattern = os.path.join(base, name_pattern)
        matches = [path for path in glob.glob(pattern) if path not in handled]
        if not matches:
            continue
        handled.update(matches)

        print(f"\n Pattern: {pattern} ")
        for path in matches:
            try:
                file_size = os.path.getsize(path)
                os.remove(path)
            except OSError as e:
                # Covers files that vanished, directories and permission errors.
                print(f" Failed to remove {path} : {e} ")
            else:
                files_removed += 1
                bytes_freed += file_size
                print(f" Removed: {path} ({file_size / (1024 * 1024):.2f} MB)")

    return files_removed, bytes_freed


def cleanup_nccl_shared_memory(shm_dir="/dev/shm"):
    """
    Clean up stale NCCL and torch shared memory segments from a directory.

    Previous CI test runs may leave behind SHM files that cause
    "No space left on device" errors during NCCL initialization.

    The function prints the usage before the cleanup, a listing of every
    entry, each removal (or failure), and the usage afterwards. Failures are
    reported on stdout and never raised.

    NOTE(review): files are removed without checking whether another live
    process still uses them; when several ranks run this concurrently one
    rank might delete a segment another rank just created -- confirm this is
    acceptable for the launcher in use.

    Args:
        shm_dir: Directory to clean. Defaults to ``/dev/shm``.
    """
    shm_dir = os.fspath(shm_dir)

    print("\n " + "=" * 60)
    print("NCCL Shared Memory Cleanup")
    print("=" * 60)

    # Show usage before cleanup
    usage_before = get_shm_usage(shm_dir)
    print(f"Before cleanup - {shm_dir} usage: {usage_before} ")

    # List ALL files to see what is consuming space
    _print_shm_listing(shm_dir)

    total_files, total_bytes_freed = _remove_stale_segments(shm_dir)

    # Show usage after cleanup
    usage_after = get_shm_usage(shm_dir)
    print(f"\n After cleanup - {shm_dir} usage: {usage_after} ")
    print(f"Total files removed: {total_files} ")
    print(f"Total space freed: {total_bytes_freed / (1024 * 1024):.2f} MB")
    print("=" * 60 + "\n ")
123+
124+
34125if "OMPI_COMM_WORLD_SIZE" in os .environ :
35126 set_environment_variables_pytest_multi_process ()
36127else :
37128 set_environment_variables_pytest_single_process ()
38129
130+ # Clean up stale NCCL shared memory BEFORE initializing process group
131+ cleanup_nccl_shared_memory ()
132+
39133if not dist .is_initialized ():
40134 dist .init_process_group (
41135 backend = "nccl" ,
@@ -75,13 +169,21 @@ def forward(self, x):
75169
76170class TestNcclOpsConverter (DispatchTestCase ):
77171 # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
78- # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
172+ # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
79173 @unittest .skipIf (
80174 not is_distributed_nccl_available (),
81175 "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported)." ,
82176 )
177+ @unittest .skipIf (
178+ not ENABLED_FEATURES .trtllm_for_nccl ,
179+ "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13)." ,
180+ )
83181 @classmethod
84182 def setUpClass (cls ):
183+ # Clean up stale NCCL shared memory left behind by previous runs.
184+ # NOTE: experimental -- kept to check whether this per-class cleanup is actually needed.
185+ cleanup_nccl_shared_memory ()
186+
85187 cls .world_size = int (os .environ .get ("OMPI_COMM_WORLD_SIZE" , 1 ))
86188 cls .rank = int (os .environ .get ("OMPI_COMM_WORLD_RANK" , 0 ))
87189 cls .group = dist .new_group (ranks = list (range (cls .world_size )))
@@ -92,6 +194,9 @@ def tearDownClass(cls):
92194 if dist .is_initialized ():
93195 dist .destroy_process_group ()
94196
197+ # Clean up NCCL shared memory after tests complete
198+ cleanup_nccl_shared_memory ()
199+
95200 @parameterized .expand ([8 ])
96201 def test_nccl_ops_gather (self , linear_layer_dim ):
97202 inputs = [torch .randn (1 , linear_layer_dim ).to ("cuda" )]
0 commit comments