2 files changed (+7 −2 lines): a GitHub Actions workflow and a test under tests/py/dynamo/distributed.
GitHub Actions workflow (job container options):

     runs-on: ${{ matrix.validation_runner }}
     container:
       image: ${{ matrix.container_image }}
-      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
+      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all --shm-size=1g ' || ' ' }}
     # If a build is taking longer than 120 minutes on these runners we need
     # to have a conversation
     timeout-minutes: 120
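The added `--shm-size=1g` raises the container's `/dev/shm` above Docker's 64 MB default, which is presumably needed because NCCL's shared-memory transport and multi-process test workers allocate there. Below is a minimal, illustrative Python preflight check in the same spirit; the helper name `shm_is_sufficient` and the 1 GiB threshold are assumptions for illustration, not part of this change.

# Illustrative helper (not part of this PR): verify the container's /dev/shm
# is large enough before launching NCCL-based distributed tests.
import shutil

MIN_SHM_BYTES = 1 * 1024**3  # mirrors the --shm-size=1g container option


def shm_is_sufficient(path: str = "/dev/shm", required: int = MIN_SHM_BYTES) -> bool:
    """Return True if the shared-memory mount at `path` is at least `required` bytes."""
    return shutil.disk_usage(path).total >= required


if __name__ == "__main__":
    print("shm ok:", shm_is_sufficient())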
Test file under tests/py/dynamo/distributed (NCCL ops converter test):

 )
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES


 def is_distributed_nccl_available():
@@ -75,11 +76,15 @@ def forward(self, x):

 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
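For reference, a self-contained sketch of the feature-gated skip pattern added above. The test class, method, and the stand-in for `torch_tensorrt._features.ENABLED_FEATURES` are illustrative; the sketch only assumes `trtllm_for_nccl` is a boolean attribute, as the diff suggests.

# Minimal sketch of the skipIf pattern used in the diff above.
# SimpleNamespace stands in for torch_tensorrt._features.ENABLED_FEATURES.
import unittest
from types import SimpleNamespace

ENABLED_FEATURES = SimpleNamespace(trtllm_for_nccl=False)  # assumed boolean flag


class ExampleNcclTest(unittest.TestCase):
    @unittest.skipIf(
        not ENABLED_FEATURES.trtllm_for_nccl,
        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
    )
    def test_requires_trtllm(self):
        # Converter-dependent assertions would go here in the real test.
        self.assertTrue(True)


if __name__ == "__main__":
    unittest.main()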