Add env to enable/disable aiter triton gemm

sarckk · sarckk · commit 20ac3b094897 · 2025-11-07T11:27:43.000-08:00
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -112,6 +112,7 @@
     VLLM_ROCM_USE_AITER_FP8BMM: bool = True
     VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: bool = False
     VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS: bool = True
+    VLLM_ROCM_USE_AITER_TRITON_GEMM: bool = True
     VLLM_ROCM_USE_SKINNY_GEMM: bool = True
     VLLM_ROCM_FP8_PADDING: bool = True
     VLLM_ROCM_MOE_PADDING: bool = True
@@ -938,6 +939,12 @@ def get_vllm_port() -> int | None:
         os.getenv("VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS", "True").lower()
         in ("true", "1")
     ),
+    # Whether to use aiter triton kernels for gemm ops.
+    # By default is enabled.
+    "VLLM_ROCM_USE_AITER_TRITON_GEMM": lambda: (
+        os.getenv("VLLM_ROCM_USE_AITER_TRITON_GEMM", "True").lower()
+        in ("true", "1")
+    ),
     # use rocm skinny gemms
     "VLLM_ROCM_USE_SKINNY_GEMM": lambda: (
         os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in ("true", "1")
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
@@ -106,6 +106,7 @@ def default_unquantized_gemm(
 def use_aiter_triton_gemm(n, m, k, dtype):
     if (
         envs.VLLM_ROCM_USE_AITER == 0
+        or envs.VLLM_ROCM_USE_AITER_TRITON_GEMM == 0
         # MI300's - fp8nuz=True
         or current_platform.is_fp8_fnuz()
         or dtype not in [torch.float16, torch.bfloat16]