@@ -1,3 +1,4 @@
+import glob
 import os
 import unittest
 
@@ -11,6 +12,7 @@
 )
from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES
 
 
 def is_distributed_nccl_available():
@@ -31,6 +33,27 @@ def is_distributed_nccl_available():
     return False
 
 
+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL shared memory segments from /dev/shm.
+
+    In CI environments, previous test runs may leave behind NCCL shared memory
+    segments that fill up /dev/shm. This function removes them to prevent
+    "No space left on device" errors.
+    """
+    try:
+        nccl_files = glob.glob("/dev/shm/nccl-*")
+        for f in nccl_files:
+            try:
+                os.remove(f)
+            except (OSError, PermissionError):
+                # Ignore errors if file is in use or we lack permissions
+                pass
+    except Exception:
+        # If cleanup fails, continue anyway - test may still work
+        pass
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
@@ -75,13 +98,20 @@ def forward(self, x):
 
 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory from previous runs
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +122,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()
 
+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]