From de86610be030d7e9dcd5aae04d0dc7196c761203 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Fri, 7 Nov 2025 11:26:13 -0800 Subject: [PATCH 1/3] Add env to enable/disable aiter triton gemm Signed-off-by: Yong Hoon Shin --- vllm/envs.py | 7 +++++++ vllm/model_executor/layers/utils.py | 1 + 2 files changed, 8 insertions(+) diff --git a/vllm/envs.py b/vllm/envs.py index 9cdb7ea974b8..05d88559b0d4 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -113,6 +113,7 @@ VLLM_ROCM_USE_AITER_FP8BMM: bool = True VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: bool = False VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS: bool = True + VLLM_ROCM_USE_AITER_TRITON_GEMM: bool = True VLLM_ROCM_USE_SKINNY_GEMM: bool = True VLLM_ROCM_FP8_PADDING: bool = True VLLM_ROCM_MOE_PADDING: bool = True @@ -944,6 +945,12 @@ def get_vllm_port() -> int | None: os.getenv("VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS", "True").lower() in ("true", "1") ), + # Whether to use aiter triton kernels for gemm ops. + # By default is enabled. + "VLLM_ROCM_USE_AITER_TRITON_GEMM": lambda: ( + os.getenv("VLLM_ROCM_USE_AITER_TRITON_GEMM", "True").lower() + in ("true", "1") + ), # use rocm skinny gemms "VLLM_ROCM_USE_SKINNY_GEMM": lambda: ( os.getenv("VLLM_ROCM_USE_SKINNY_GEMM", "True").lower() in ("true", "1") diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 3d90c9513683..b17bdd0b7207 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -106,6 +106,7 @@ def default_unquantized_gemm( def use_aiter_triton_gemm(n, m, k, dtype): if ( envs.VLLM_ROCM_USE_AITER == 0 + or envs.VLLM_ROCM_USE_AITER_TRITON_GEMM == 0 # MI300's - fp8nuz=True or current_platform.is_fp8_fnuz() or dtype not in [torch.float16, torch.bfloat16] From 094f979b8b9002b60787eb73e7d2d35b42629fab Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Fri, 7 Nov 2025 11:57:20 -0800 Subject: [PATCH 2/3] Add to environment_variables_to_hash Signed-off-by: Yong Hoon Shin --- vllm/envs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/envs.py b/vllm/envs.py index 05d88559b0d4..a82cf7c6307a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1593,6 +1593,7 @@ def compute_hash() -> str: "VLLM_ROCM_USE_TRITON_ROPE", "VLLM_ROCM_USE_AITER_FP8BMM", "VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION", + "VLLM_ROCM_USE_AITER_TRITON_GEMM", "VLLM_ROCM_USE_SKINNY_GEMM", "VLLM_ROCM_FP8_PADDING", "VLLM_ROCM_MOE_PADDING", From b5419b246c0a1cbdcfccbe0ebc73d29bc20678a6 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin Date: Fri, 7 Nov 2025 12:24:50 -0800 Subject: [PATCH 3/3] Fix formatting Signed-off-by: Yong Hoon Shin --- vllm/envs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index a82cf7c6307a..078e5c38f0f4 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -948,8 +948,7 @@ def get_vllm_port() -> int | None: # Whether to use aiter triton kernels for gemm ops. # By default is enabled. "VLLM_ROCM_USE_AITER_TRITON_GEMM": lambda: ( - os.getenv("VLLM_ROCM_USE_AITER_TRITON_GEMM", "True").lower() - in ("true", "1") + os.getenv("VLLM_ROCM_USE_AITER_TRITON_GEMM", "True").lower() in ("true", "1") ), # use rocm skinny gemms "VLLM_ROCM_USE_SKINNY_GEMM": lambda: (