Set VLLM_DISABLE_SHARED_EXPERTS_STREAM=1 by default for TPU inference (#1021)

xingliu14 · web-flow · commit b156c314ac33 · 2025-11-07T16:25:09.000-08:00
Signed-off-by: Xing Liu <xingliu14@gmail.com>
diff --git a/tpu_inference/__init__.py b/tpu_inference/__init__.py
@@ -1,5 +1,9 @@
 import os
 
+# Import the environment variable overrides before the other tpu_inference
+# modules so that the variables are already set by the time those modules
+# (and anything they import) are loaded.
+import tpu_inference.env_override  # noqa: F401
 from tpu_inference import tpu_info as ti
 from tpu_inference.logger import init_logger
 
diff --git a/tpu_inference/env_override.py b/tpu_inference/env_override.py
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the tpu-inference project
+
+import os
+
+# Disable the CUDA-specific shared experts stream for TPU. This prevents
+# errors when trying to create CUDA streams on TPU hardware; the issue was
+# introduced by vllm-project/vllm#26440. NOTE: overwrites any existing value.
+os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"