wip

LucasWilkinson · LucasWilkinson · commit 2d2551bece97 · 2025-11-11T20:39:02.000-08:00
Signed-off-by: Lucas Wilkinson &lt;lwilkins@redhat.com&gt;
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
@@ -877,13 +877,17 @@ def custom_op_log_check(self):
                 )
 
     def adjust_cudagraph_sizes_to_be_multipe_of(self, multiple_of: int):
-        if not self.cudagraph_capture_sizes:
+        if not self.cudagraph_capture_sizes or multiple_of <= 1:
             return
 
+        assert self.max_cudagraph_capture_size is not None
+
         rounded_sizes = sorted(
-            round_up(size, multiple_of)
-            for size in self.cudagraph_capture_sizes
-            if round_up(size, multiple_of) <= self.max_cudagraph_capture_size
+            set(
+                round_up(size, multiple_of)
+                for size in self.cudagraph_capture_sizes
+                if round_up(size, multiple_of) <= self.max_cudagraph_capture_size
+            )
         )
 
         if len(rounded_sizes) == 0:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -4240,6 +4240,8 @@ def _check_and_update_cudagraph_mode(
         # we need to adjust the cudagraph sizes to be a multiple of the uniform
         # decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
         # temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
+        # Will be removed in the near future when we have separate cudagraph capture
+        # sizes for decode and mixed prefill-decode.
         if (
             cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
             and cudagraph_mode.separate_routine()
@@ -4248,6 +4250,8 @@ def _check_and_update_cudagraph_mode(
             self.compilation_config.adjust_cudagraph_sizes_to_be_multipe_of(
                 self.uniform_decode_query_len
             )
+            self.cudagraph_batch_sizes = self.compilation_config.cudagraph_capture_sizes
+
         self.compilation_config.compute_bs_to_padded_graph_size()
 
         # Trigger cudagraph dispatching keys initialization after