From 26f1b1edc96a89ba3f09eb0ac1b93d0f0c4d314b Mon Sep 17 00:00:00 2001 From: Mohammad Miadh Angkad Date: Sat, 8 Nov 2025 17:59:51 +0800 Subject: [PATCH 1/2] Revert gpt-oss max cudagraph size to 1024 Signed-off-by: Mohammad Miadh Angkad --- vllm/model_executor/models/config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 33fa06fe0e9b..569c896af4e5 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -258,9 +258,9 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: if structured_outputs_config.reasoning_parser == "": structured_outputs_config.reasoning_parser = "openai_gptoss" - # Increase the max capture size from 512 to 992 for performance. + # Increase the max capture size from 512 to 1024 for performance. # NOTE(woosuk): This will increase the number of CUDA graphs - # from 67 to 81. + # from 67 to 83. compilation_config = vllm_config.compilation_config # Only override when the user has not set either of # cudagraph_capture_sizes or max_cudagraph_capture_size. @@ -269,10 +269,10 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: and compilation_config.max_cudagraph_capture_size is None ): # FIXME(woosuk): When using full cuda graph with FA3, the max - # supported size is 992. - compilation_config.max_cudagraph_capture_size = 992 + # supported size is 1024.
+ compilation_config.max_cudagraph_capture_size = 1024 logger.info( - "Overriding max cuda graph capture size to %d for performance.", 992 + "Overriding max cuda graph capture size to %d for performance.", 1024 ) From 4489bae1ac3c780b92727c14297b4e91c79265cb Mon Sep 17 00:00:00 2001 From: Mohammad Miadh Angkad Date: Sat, 8 Nov 2025 18:24:51 +0800 Subject: [PATCH 2/2] Remove comment about max supported size Signed-off-by: Mohammad Miadh Angkad --- vllm/model_executor/models/config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 569c896af4e5..66b246878b0a 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -268,8 +268,6 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: compilation_config.cudagraph_capture_sizes is None and compilation_config.max_cudagraph_capture_size is None ): - # FIXME(woosuk): When using full cuda graph with FA3, the max - # supported size is 1024. compilation_config.max_cudagraph_capture_size = 1024 logger.info( "Overriding max cuda graph capture size to %d for performance.", 1024