forked from vllm-project/vllm
Imarkov/conditional compilation ranges #127
Closed
ProExpertProg wants to merge 304 commits into imarkov/fused_allreduce_torch_native from imarkov/conditional_compilation_ranges
Changes from 20 commits

Commits (304)
d381eb9
Multi turn benchmark progress bar for synthetic conversation generati…
segevido 2e78150
[CI] Add mergify rules for `nvidia` label (#28417)
mgoin b30dfa0
[Attention] Refactor CUDA attention backend selection logic (#24794)
MatthewBonanni 7dbe6d8
Fix Fused MoE LoRA Triton kernel bug (#28450)
chaojun-zhang afffd3c
[Model] Pass `mm_features` directly into `get_mrope_input_positions` …
DarkLight1337 3380543
Add request timeout override for multi-turn benchmarks (#28386)
segevido fa19702
[Docs] Fix grammar in CPU installation guide (#28461)
maryamtahhan a1448b4
[Kernels] Split up fused_moe/layer.py, isolate more modular kernel co…
bnellnm 533b018
[BugFix] Fix Failing Ruff Check (#28469)
jvlunteren a90ad7d
Add @markmc to CODEOWNERS for Observability (#28457)
markmc b886068
[BugFix] Fix RuntimeError in PixtralHFAttention on CPU/XPU (#28444)
faaany 3143eb2
[BugFix] Add test_outputs.py to CI pipeline (#28466)
usberkeley 287bbbe
[Doc] Fix typo in serving docs (#28474)
the-codeboy f9a4087
Remove weight_scale.T special case for SM90 Block FP8 CUTLASS kernel …
mgoin a7ef3eb
[NIXL] Generalize block-first backend layouts (FlashInfer-like) (#28282)
NickLucche 68c09ef
[Kernel][Perf] fuse QK Norm and RoPE into one cuda kernel for Qwen Mo…
izhuhaoran 05576df
[ROCm][Quantization] extend AMD Quark to support mixed-precision quan…
xuebwang-amd 5a1271d
[Quantization] fix attention quantization of gpt_oss model (#27334)
xuebwang-amd e553424
[CI/Build] Refactor Attention backend for test_prefix_prefill from xf…
zhewenl 684f254
Prefer FlashAttention MLA as default over FlashMLA (#27363)
MatthewBonanni 6c3c0f8
[Kernel] Optimize rms_norm kernel (#27931)
xyang16 d5edcb8
[BugFix] Fix Siglip2Attention on XPU (#28448)
faaany 76e4dcf
[Misc] Remove unused attention prefix prefill ops functions (#26971)
lgeiger 4228be7
[Perf] Use np.ndarray instead of list[list[int]] to reduce GC overhea…
Jialin de120bc
[V0 deprecation] Clean up num_prefill_tokens logic for V0 (#28203)
gcanlin 8c32c6e
[Misc] fix typo in DCP comment (#28389)
Livinfly 9d1c474
[LoRA][1/N]Remove LoRA extra vocab (#28382)
jeejeelee df4d3a4
[TPU] Rename path to tpu platform (#28452)
kyuyeunk d4902ba
[Misc] Cleanup Executor interface (#28441)
wangxiyuan 28534b9
Add Zurich vLLM Meetup (#28488)
mgoin e5f599d
[Bugfix] Disable shared expert overlap if Marlin MoE is used (#28410)
mgoin 412e153
[Feature] Allow configuring FlashInfer workspace size (#28269)
maxyanghu d235395
Use FLASHINFER MLA backend when testing fp8_kv_scale_compile (#28491)
adabeyta 1788aa1
[BugFix] Graceful handling of torch symm mem errors. (#27671)
ilmarkov 48c8793
[Frontend] Change CompilationMode to a proper Enum (#28165)
gmagogsfm 3f770f4
[Performance] Cache loaded custom logitsprocs to avoid overheads (#28…
Isotr0py e171039
[[V0 deprecation]]Remove VLLM_USE_V1 env (#28204)
wangxiyuan 7f829be
[CPU] Refactor CPU attention backend (#27954)
bigPYJ1151 9f0247c
`VLLM_USE_TRITON_FLASH_ATTN` V0 variable deprecation (#27611)
AndreasKaratzas cbb799e
[Model][Qwen3VL] Simplify `get_mrope_input_positions` using numpy (#2…
lgeiger 4ccffe5
[Core] Encoder separation for Encode-Prefill-Decode Disaggregation (#…
fake0fan b9ce9a3
[BugFix] Add fallback path in `apply_rotary_pos_emb_flashattn` for no…
faaany f31419e
[Benchmark] Add retry support to fix workload bias in multi-turn benc…
ai-jz ac0bb2c
[Core] Cache `vllm_is_batch_invariant` (#28304)
lgeiger 91864b7
[CI/Build] Fix crash due to removed VLLM_USE_V1 attribute in EPD (#28…
fake0fan c748355
[CI] Introduce autorun_on_main feature (#27836)
hl475 1761dea
[BugFix]: --enable-lora with model granite-4.0-micro crash (#27733)
yyzxw d3ade61
[Model] fix glm4_moe_mtp load weights with GLM-4.6 checkpoint. (#27597)
wuyaoxuehun a4730c1
[XPU]Fix crash due to removed VLLM_USE_V1 attribute (#28520)
chaojun-zhang d143152
[KVConnector] Enable get_block_ids_with_load_errors() in LMCache conn…
ziruiliu c5f10cc
add cpu option for p/d in nixl_connector (#28356)
ZhengHongming888 edb59a9
[ROCm] [Bugfix] Fix `fused_qknorm_rope_kernel` rocm compatibility (#2…
tjtanaa a9d18b5
[Bugfix] Fix gpt_oss packed_modules_mapping (#28536)
jeejeelee 10138c9
[V0 deprecation] Deprecate use_v1 parameter (#28112)
wangxiyuan 54aecd9
Fix pre-commit (and XPU) on `main` (#28556)
hmellor f76e85c
[Performance][Hopper] Avoid M dim padding to 4x for most cases (due t…
alexm-redhat bc5bd45
[Refactor] Remove redundant TP gather/split in split_qkv in QwenVL (#…
gcanlin 728a9eb
[Misc] Refactor Attention kv transfer methods into decorator (#27816)
NickLucche a742134
Remove deprecated fields from `CompilationConfig` (#27593)
hmellor 3044195
[Perf] Refactor cudagraph_support to enable full CUDA graphs for spec…
benchislett bac9045
Implement ARC KV cache eviction policy for CPU offloader (#27039)
albertoperdomo2 a1e7fa3
[EPLB][ROCm]: support EPBL for ROCm backend (#27731)
PerryZhang01 64d57c3
[Model] [Config] Correctly identify granite-4.0-micro as non-hybrid m…
tdoublep 319abd5
Remove dynamic shape
ilmarkov a39dd7b
[CI] Skip "Multi-Modal Models Test (Extended) 3" test that's broken i…
hmellor 94a9ebc
[KV connector][WIP] KV cache proxy based on LMCache multi-process mod…
ApostaC 58ce8d1
[BugFix] Priority scheduling and spec tokens preemption (#28558)
andylolu2 478ee51
[Misc]Fix typo in llm_engine.py (#28584)
frank-wei 74a9a9f
[Performance][B200] Fix deepgemm prologue (#27897)
varun-sundar-rabindranath d8140b9
[ROCM] Fix ROCm warnings, environment flag access, and GEMM kernel na…
vllmellm 3eb0c26
[TPU] Support GCS path in VLLM_TORCH_PROFILER_DIR (#28487)
QiliangCui 10f01d5
[Bugfix] Adjust Marlin CUDA arch selection to 8.0+PTX;9.0+PTX (#28294)
mgoin 4ca5cd5
[Core][AMD] Migrate fully transparent sleep mode to ROCm platform (#1…
HollowMan6 69d0e90
[MoE][Kernel][Perf] Improve Shared Expert Stream Overlap (#28406)
alexm-redhat 51c599f
Skip models that cannot currently init on Transformers v5 (#28471)
hmellor 52eadce
[Docs] Update meetups.md description (#28583)
mgoin d75ad04
[ROCm][Bugfix] Revert removing setuptools version restriction (#28592)
gshtras 2dacd57
[platform] Move get_cu_count to utils (#27005)
wangxiyuan a543e67
[Bugfix] Fix SM100 gpt-oss regression due to faulty attn sink support…
mgoin 8832fff
[BugFix] Fix `mm_encoder_attn_backend` arg type checking (#28599)
njhill 3226283
[Docs] Add some details about what the MoE block needs for the Transf…
hmellor 97d1c99
Rename clashing method names for vLLM model protocol (#27583)
hmellor a1d3866
[n-gen] DO NOT repeatedly return finished child requests (#28591)
Jialin 7c38ed0
[Frontend] split append tool output (#28333)
qandrew 1a0b157
[Frontend][responsesAPI][1/n] convert responses API tool input to cha…
qandrew 7dca0c9
[BugFix][ROCm] Fix `get_cu_count` missing variable error (#28608)
ganyi1996ppo dbbe0c7
[XPU] Support Triton path for LoRA operations on XPU (#28511)
faaany 7e082bc
Support DeepEP for Kimi-k2-thinking through enabling gemm selection f…
luccafong d44fbba
[build][cmake]: Bundle static ACL and torch libgomp for CPU extension…
Radu2k ca00b1b
[ROCm][BugFix] Remove the usage of `device_info` from aiter (#28383)
ganyi1996ppo 4504e80
[Bugfix] Prevent crash on empty grammar string (#28210)
tjandy98 c33b87e
Use official xformers-0.0.33 built for PT 2.9 (#28600)
huydhn 4ab34f6
Add NUMA node validation for CPU thread binding (#28555)
usberkeley fa183e9
[Bugfix] fix kimi-linear crash (#28445)
ZJY0516 5c9ad13
[Frontend] supports interleaved thinking (#28531)
chaunceyjiang 11ac9dd
Support all interleaved layer types (#28485)
sarckk d168de0
Make ranges inclusive-inclusive
ilmarkov e63fd44
Fix: Correctly filter special tokens in benchmark_prefix_caching (#28…
dw2761 5e97320
[BugFix] Fix type error when assign a trition kernel tensor to a torc…
liuzijing2014 c428e8d
Fix io processor pooling #28273 (#28484)
baonudesifeizhai c47b6c8
[XPU] add sym params to IPEXConfig (#28611)
zufangzhu c9fe6ab
[Bugfix] Fix FPS value type for Qwen2.5-Omni video processing (#28630)
faaany 86d15bf
[Hardware][PowerPC] Fix fp16 compilation error for Power in cpu atten…
Akashcodes732 8da2f28
[ROCm][BugFix]Fix `get_cu_count` in rocm_aiter_fa.py (#28618)
ganyi1996ppo a7791ea
[CI/Build] Install uv for AMD MI300: Language Models Tests (Hybrid) %…
amdfaa 07a606a
[CI Failure] Fix backend selection for encoder-only models (#28534)
hl475 3035d1a
[BugFix] DeepSeek-OCR: apply NoRepeatNGramLogitsProcessor to greedy p…
YuanpingSong b230286
Fix `get_num_experts` when config sets it explicitly to `None` (#28652)
hmellor d338775
[Misc] Turn off encoder torch compile by default (#28634)
ywang96 06c4873
Rewrite C++ meta funcs to Python (#28595)
janeyx99 327c0a9
[BugFix] Ensure `EngineArgs.create_engine_config` is idempotent (#28515)
njhill fdfd507
[TPU] patch TPU wheel build script to resolve metadata issue (#27279)
jcyang43 fe1cd77
[Performance][B200] silu_mul_quant: pack scales in int32 (#28358)
varun-sundar-rabindranath 119c492
[Bugfix] Fix validate model input for decoder models (#27099)
yannicks1 f9f3b59
[Attention][Bugfix] Fix FA sink support (#28660)
MatthewBonanni 5d6ce2b
[Perf] Support stream interval for reducing host overhead (#27869)
elvischenv 968060c
[bugfix] correct local_chunk_len for DCP in reorg_kvcache with long c…
pisceskkk 262d263
[Bugfix] Eliminate tuple inputs to submodules in graph partitioning (…
gmagogsfm faed7bf
[Bugfix] [CPU] bump torch to 2.9.0 for Darwin to fix segmentation fau…
kebe7jun 1b622de
[Misc] Update CODEOWNERS for simon-mo and comaniac (#28675)
simon-mo e64011f
[CI] Bug: Fix ci entrypoint pooling (#28684)
yewentao256 6e25b1c
[KV Connector] Test async mode in scheduler tests (#28550)
markmc f2b8e1c
Mirrored test group definitions for AMD (2025-11-11) (#28573)
Alexei-V-Ivanov-AMD 4d5943b
[quantization][config] enable override existing quant_config (#28510)
ILikeIneine 2aa75c7
[ROCm] Bump up the version of amd-smi to 6.4.3 (#28680)
SageMoore 622e610
[CPU][Bugfix] Fix Apple Silicon M1 compilation failure (#28681)
mgoin b39a502
[ci][amd] fix basic models extra init test (#28676)
bradleyhd 01bea11
[Misc] Remove `warn_for_unimplemented_methods` (#28613)
DarkLight1337 da14ae0
[XPU][CI]disable lm cache uts (#28696)
jikunshang 0aecd91
[Misc] Update xformers to 0.33.0.post1 (#28678)
ywang96 0b25498
[Misc] add ignore mapper for quark quantization (#28275)
haoyangli-amd 15ae8e0
[Bugfix][CI/Test][Spec Decode] Fix illegal memory access in offline_i…
rasmith 9310357
[BugFix][CI/Build][ROCM] Fix import error and apply assert in appropr…
rasmith 529cea3
use default CCL_ZE_IPC_EXCHANGE (#28700)
yma11 b65e752
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov c36bcfe
[Bugfix] fix dots.ocr pp support (#28705)
ZJY0516 bc3e430
[BugFix] Fix multi-modal async scheduling race condition (#28706)
njhill c9a3a02
Add output token counting to gsm8k eval (#28594)
mgoin fd75d3e
[Minor] avoid register new custom and just import silly_attn (#28578)
BoyuanFeng 8cfbe89
[Misc] fix comment in test_envs (#28529)
xingliu14 ecf8230
[Metrics] Log number of preempted requests (#28522)
610lyn 360bd87
[Frontend] Added chat-style multimodal support to /classify. (#27516)
WorldExplored 41b92f7
[Model][MM] Extract conv layer as CustomOp (#28455)
shen-shanshan 4516d44
[DCP] Support Decode Context Parallel (DCP) for GQA with Flashinfer (…
gjc0824 9324e10
Fix KV sharing fast prefill with cudagraph enabled (#28537)
sarckk db56a59
[BugFix] Fix FA3 IMA with FULL_AND_PIECEWISE and cascade attention (d…
LucasWilkinson 8d3748d
[Doc] Fix macOS installation dependency resolution issue (#26721)
shahfasal 433c0f8
[Model] Fix bailing_moe accuracy problem (#28277)
zhaozx-cn 96b23b8
[Bugfix][Nixl] Fix kernel physical<>logical block_size issue (#28677)
NickLucche 511a6b6
[Config] Clean up SchedulerConfig initialization (#28665)
DarkLight1337 3f8a874
[Kernels] Enable FlashInfer FP8 Blockscale on SM90 (for TEP DSR1) (#2…
djmmoss c934cae
[Fix] improve aspect ratio in dummy image generation and add common …
dongbo910220 5f3cd7f
[Docs] Update the name of `Transformers backend` -> `Transformers mod…
hmellor d54a18a
[CI][CPU] Smoke test for Apple Silicon using GHA MacOS runner (#28688)
mgoin 6f1e7f7
[DisaggEverything] Tokens in<>out `/generate` endpoint (#24261)
NickLucche 8cc40f8
[Attention] Bump FA for removed method (#28429)
MatthewBonanni a17e36f
Fix typo in comment: existance -> existence (#28737)
OthmanMohammad 0854248
Remove audio optional dependency for mistral-common (#28722)
juliendenize cdd7025
[kernel] Improve FP8 PTPC on Hopper for larger shapes (#28692)
czhu-cohere 9261eb3
docs(lora_resolvers): clarify multi-resolver order and storage path r…
wangchen615 964d65d
LLaMA4 LoRA Adapter Enablement (#28602)
kfhfar a425dc2
[Bugfix] [ROCm] [AITER]: Fix aiter block quant not compatible with to…
tjtanaa 6718755
[Docs] Enable some more markdown lint rules for the docs (#28731)
hmellor e2741f6
[Chore] Rename `SchedulerConfig.chunked_prefill_enabled` (#28735)
DarkLight1337 cec275e
[Bugfix] resolve Qwen3-VL GPTQModel quantized model loading failure (…
GuanH fd45550
[BugFix] Fix misprint introduced by modular_kernel refactoring. (#28728)
halyavin 8977ffb
[ROCm][Bugfix] Fix compilation errors with fused_qknorm_rope_kernel.c…
SageMoore f08eab2
[CI] Fix macos smoke test uv cache issue (#28736)
mgoin 0de4f21
[Bugfix] TypeError: 'NoneType' object is not callable (#27410)
mostrowskix 5a84b76
[ROCm][CI/Build] Change install location of uv (#28741)
gshtras 2e0ad62
Avoid bytecode hook and simplify TorchCompileWrapperWithCustomDipatch…
laithsakka e5c7895
[Bugfix] Fix incorrect use of hidden_states for shared_experts due to…
alexm-redhat bf3ffb6
[Bugfix] Fix ChunkedLocalAttention CUDA Graph setting (#28739)
benchislett e0c910b
[Hybrid] [Kernel] Fix chunk scan kernel when BLOCK_SIZE_DSTATE > 128 …
tdoublep ba041d9
[Log] Save profiler results to file instead of stdout (#28144)
rasmith 75f01b9
[ROCm][CI/Build] Upgrade to ROCm 7.1 and AITER main (#28753)
gshtras 58e61e5
[Test] Rework e2e async scheduling tests (#28744)
njhill 186352b
[Core] Performance: Use list[np.ndarray] instead of list[list[int]] f…
Jialin 9fc81ec
[TPU] Fix import error in tpu launch (#28758)
QiliangCui f05d474
[Model][Qwen3VL] Use `mm_position` to compute mrope positions (#28730)
lgeiger edfe498
[Bugfix] Build hadacore kernels on >SM90 (#28748)
mgoin ac86bff
Revert "[Core] Performance: Use list[np.ndarray] instead of list[list…
njhill 363aaee
Fix IntermediateTensors initialization and add type hints (#28743)
OthmanMohammad c9e6658
[NIXL] heterogeneous block_size support (#26759)
xuechendi 6965ef4
[Performance][DeepGEMM] Estimate expected_m (#28694)
varun-sundar-rabindranath 98b4d38
[Redo] #26368 (#28771)
DarkLight1337 dd6ac1c
[RL] [V1] Remove unused device argument from reset_kv_cache (#28766)
zhuohan123 74b5267
Use narrow over indexing in `hadacore_transform` to prep for ABI stab…
janeyx99 1ec978c
[Kernel][Moe Configs] llama4 maverick fp8 moe config tp8 on mi325 (#2…
zhewenl 638e419
[Misc] Make `SchedulerConfig.max_model_len` init-only (#28733)
DarkLight1337 173b356
[PERF] Remove TRTLLM Gen attn kernel limitation `max_seq_len <=131072…
vadiklyutiy f36292d
[compile] Enable sequence parallelism matching w/o custom ops enabled…
angelayi cb15ee2
Allow Gemma3 to take image embeddings (#28483)
tingtingtangmeta 89d3679
[Doc] Fix failing doc build (#28772)
DarkLight1337 085a525
[Model] Fix lmhead init bug of bailing_moe (#28777)
hwhaokun e439c78
Add support for Eagle with separate lm-head and embed_tokens layers (…
eldarkurtic 637f292
[CI] Fix broken pipeline (#28781)
njhill 07cadab
[Model][Qwen3VL] Cache positional embedding indices (#28475)
lgeiger 2bb4435
[Doc]: fix typos in various files (#28567)
didier-durand be263f7
[BugFix] Fix `AssertionError: DCP not support reorder_batch_threshold…
LucasWilkinson f849ee7
Adding a benchmark for batch invariance (#28161)
bwasti d231876
[Benchmark] Fix client seed synchronization in multi-turn benchmark (…
ai-jz a55b646
[Model] Allow users to control skip reading cache per request. (#28194)
noooop b316ac6
[V1] Support MP Executor for multi node distributed inference (#23691)
luccafong af02c40
Fixed gpt-oss _load_weights_other() parameter position bug (#28715)
River12 3bc1175
[Bugfix] Fix host and port join for ipv6 in bench serve (#28679)
scottzh8 8d259fa
Fix gpt oss weight loading with EP + bf16 (#28765)
ashors1 63fed55
[Doc]: fix typos in various files (#28811)
didier-durand ac1daf3
fix comment typo (#28802)
andyxning 5a87076
[Model][QwenVL] Optimize `Qwen2_5_VisionAttention` q,k preparation (#…
lgeiger 03ee481
Feature: Support Relu2 in FusedMoE fp8 cutlass path (#27261)
amirkl94 80b6080
[BugFix] Fix async scheduling + chunked prefill + preemption (#28787)
njhill 561253b
[Performance][Fix] update nvfp4 code to support renorm routing (#28569)
jiahanc d64429b
[NIXL][XPU] update install script of NIXL (#28778)
zhenwei-intel 60e089f
[ROCm][Qwen3-32B] Fix AITER MHA accuracy issue cause by #25763 (#28670)
sammysun0711 6f37419
[Bugfix][Model] Prevent special token leakage in KimiK2ToolParser str…
jscaldwell55 3380ed5
[Doc] Add llama4 LoRA tag (#28825)
jeejeelee 577bb34
[CPU][Bugfix] Fix _to_list in CPU model runner (#28824)
bigPYJ1151 ab01cd1
[BugFix] Fix glm4_moe_mtp load weights bug (#28805)
wuyaoxuehun d4acf51
[Metrics] Fix KV cache usage percent metric multiproc (#28792)
jaywonchung 1b82fb0
[XPU] work around for sp, avoid custom op import error (#28822)
jikunshang 64e39d6
[BugFix] Temporary fix for IMA with MTP = 2 and full-cg (#28315)
LucasWilkinson 7f06449
[Bugfix][Perf] Revert applying HF processor on text-only inputs for m…
ywang96 e42bd8c
Cast return value to int64_t for cache size (#28814)
tiehexue f8b19c0
[Bugfix] Fix GPT-OSS on AMD after #28603 (#28816)
zhewenl d8874c6
[Core] Async Scheduling X Spec Decoding Compatibility (#24799)
Ronald1995 7765e5b
[BugFix] Fix PP performance and PP kv connector output regression (#…
njhill 95ae50b
[Quantization] [Eagle] Add complete quantization support to the draft…
shreyas269 a289cc1
[Test] Batch Invariant: Rename and organize tests (#27421)
yewentao256 f77bce0
[Model] Add Afmoe architecture implementation (#28332)
pranav4501 6148584
[BugFix] Corner case that could cause out-of-sync with external launc…
bangshengtang 552cac9
[Misc] Fix wrong comment in scheduler (#28880)
zhuohan123 b6e0439
[Bugfix] Fix Kimi-K2 tool parser concatenated tool calls parsing (#28…
bbartels 88ab591
Run macos smoke test workflow on main commit (#28752)
mgoin d0a7362
[ROCm][Quantization] add apply_vllm_mapper in quark config for models…
xuebwang-amd 3ddcf46
[Refactor] Remove Unused Func in Batch Invariant (#28881)
yewentao256 bf9e1e8
[Bugfix] Fix wrong CLI defaults for dynamic `SchedulerConfig` fields …
DarkLight1337 083cf32
[Doc]: fix typos in various files (#28863)
didier-durand 0168f69
[Misc] Remove unnecessary parentheses from log statements (#28897)
andyxning 5bdd155
[CI] Fix async scheduling + spec decoding test flake (#28902)
njhill 5bb1da5
[MISC] Remove format.sh (#28906)
KuntaiDu 896e41a
[CI/Build] Replace wikipedia url with local server ones (#28908)
Isotr0py 4393684
[BugFix] Fix PP/async scheduling with pooling models (#28899)
njhill 285eaa4
[Bugfix] Safeguard against missing backend in AttentionBackendEnum (#…
jesse996 b9489f5
[Model][Perf] Use cos and sin cache in QwenVL (#28798)
gcanlin 184b12f
[Bugfix][NIXL] Fix `block_size_ratio` when logical !=physical blocks …
NickLucche f6aa122
[CI Sprint] Quantization CI Cleanup (#24130)
killershrimp 49a986e
[Benchmark] multi_turn: Report warmup-inclusive runtime (#28937)
segevido c261237
[Model] Add Gemma3 GGUF multimodal support (#27772)
lucianommartins af10400
Merge branch 'main' into imarkov/conditional_compilation_ranges
ilmarkov
New test file added by this PR (95 added lines, per the diff header):

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from torch import fx as fx
from torch import nn
from torch.library import Library

from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.compilation.inductor_pass import (
    CustomGraphPass,
    InductorPass,
    get_pass_context,
)
from vllm.config import (
    VllmConfig,
    set_current_vllm_config,
)
from vllm.config.compilation import CompilationConfig, CompilationMode
from vllm.config.scheduler import SchedulerConfig
from vllm.forward_context import set_forward_context

# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT")  # noqa

BATCH_SIZE = 64
MLP_SIZE = 128


@support_torch_compile
class TestModel(nn.Module):
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None:
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + x
        attn_output = torch.empty_like(x)
        torch.ops.silly.attention(x, x, x, attn_output)
        x = attn_output
        x = x * 3
        return x


@torch.inference_mode
def run_model(vllm_config: VllmConfig, model: nn.Module, batch_sizes: list[int]):
    with set_forward_context({}, vllm_config=vllm_config):
        model(torch.randn(BATCH_SIZE, MLP_SIZE).cuda())
        for batch_size in batch_sizes:
            model(torch.randn(batch_size, MLP_SIZE).cuda())


class PostGradPassManagerCheckRanges(CustomGraphPass):
    def __init__(self, ranges: list[tuple[int, int]]):
        self.ranges = ranges

    def __call__(self, graph: fx.Graph):
        compile_range = get_pass_context().compile_range
        assert compile_range in self.ranges, (
            f"Compile range {compile_range} not in {self.ranges}"
        )

    def uuid(self) -> str:
        state = {
            "ranges": self.ranges,
        }
        return InductorPass.hash_dict(state)


def test_compile_ranges():
    vllm_config = VllmConfig(
        scheduler_config=SchedulerConfig(
            max_num_batched_tokens=8192,
        ),
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            compile_ranges_split_points=[8, 32],
            inductor_compile_config={
                "post_grad_custom_post_pass": PostGradPassManagerCheckRanges(
                    [(1, 8), (8, 32), (32, 2049)]
                )
            },
        ),
    )

    with set_current_vllm_config(vllm_config):
        model = TestModel(vllm_config=vllm_config, prefix="").eval().cuda()

    batch_sizes = [1, 16, 48]
    # TestModel has support_torch_compile
    with compilation_counter.expect(
        num_graphs_seen=1,
        num_piecewise_graphs_seen=1,
        num_backend_compilations=3,
        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):
        run_model(vllm_config, model, batch_sizes)
```
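The test invokes `torch.ops.silly.attention`, whose registration is not part of this excerpt (the commit "[Minor] avoid register new custom and just import silly_attn (#28578)" suggests the suite imports a shared helper instead of re-registering it here). Below is a minimal sketch of what such a registration could look like with `torch.library`; the schema and both kernels are illustrative assumptions, not the PR's code:

```python
import torch
from torch.library import Library

# Hypothetical registration of the "silly::attention" op used by TestModel.
# The real test suite provides its own implementation; this only sketches the
# torch.library pattern for an opaque, mutating op that torch.compile splits on.
_silly_lib = Library("silly", "FRAGMENT")  # noqa
_silly_lib.define("attention(Tensor q, Tensor k, Tensor v, Tensor(a!) out) -> ()")


def _attention_impl(
    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor
) -> None:
    # Stand-in "attention": write a simple combination of the inputs into out,
    # giving the op a side effect the compiler must treat as opaque.
    out.copy_(q + k + v)


_silly_lib.impl("attention", _attention_impl, "CUDA")
# No-op meta kernel so the op can be traced through torch.compile.
_silly_lib.impl("attention", lambda q, k, v, out: None, "Meta")
```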
Changes to `CompilerManager`: the PR replaces the per-shape `runtime_shape: int | None` plumbing with a `compile_range: tuple[int, int] | None` in the compiled-graph cache key, the compile context, and the cache load/compile paths.

```diff
@@ -83,7 +83,7 @@ class CompilerManager:
     """

     def __init__(self, compilation_config: CompilationConfig):
-        self.cache: dict[tuple[int | None, int, str], Any] = dict()
+        self.cache: dict[tuple[tuple[int, int] | None, int, str], Any] = dict()
         self.is_cache_updated = False
         self.compilation_config = compilation_config
         self.compiler = make_compiler(compilation_config)
@@ -92,11 +92,11 @@ def compute_hash(self, vllm_config: VllmConfig) -> str:
         return self.compiler.compute_hash(vllm_config)

     @contextmanager
-    def compile_context(self, runtime_shape: int | None = None):
+    def compile_context(self, compile_range: tuple[int, int] | None = None):
         """Provide compilation context for the duration of compilation to set
         any torch global properties we want to scope to a single Inductor
         compilation (e.g. partition rules, pass context)."""
-        with pass_context(runtime_shape):
+        with pass_context(compile_range):
             if self.compilation_config.use_inductor_graph_partition:
                 with inductor_partition_rule_context(
                     self.compilation_config.splitting_ops
@@ -152,26 +152,28 @@ def load(
         graph: fx.GraphModule,
         example_inputs: list[Any],
         graph_index: int,
-        runtime_shape: int | None = None,
+        compile_range: tuple[int, int] | None = None,
     ) -> Callable | None:
-        if (runtime_shape, graph_index, self.compiler.name) not in self.cache:
+        if (compile_range, graph_index, self.compiler.name) not in self.cache:
             return None
-        handle = self.cache[(runtime_shape, graph_index, self.compiler.name)]
+        handle = self.cache[(compile_range, graph_index, self.compiler.name)]
         compiled_graph = self.compiler.load(
-            handle, graph, example_inputs, graph_index, runtime_shape
+            handle, graph, example_inputs, graph_index, compile_range
         )
-        if runtime_shape is None:
+        if compile_range is None:
             logger.debug(
-                "Directly load the %s-th graph for dynamic shape from %s via handle %s",
+                "Directly load the %s-th graph for dynamic compile range "
+                "from %s via handle %s",
                 graph_index,
                 self.compiler.name,
                 handle,
             )
         else:
             logger.debug(
-                "Directly load the %s-th graph for shape %s from %s via handle %s",
+                "Directly load the %s-th graph for compile range %s "
+                "from %s via handle %s",
                 graph_index,
-                str(runtime_shape),
+                str(compile_range),
                 self.compiler.name,
                 handle,
             )
@@ -185,7 +187,7 @@ def compile(
         compilation_config: CompilationConfig,
         graph_index: int = 0,
         num_graphs: int = 1,
-        runtime_shape: int | None = None,
+        compile_range: tuple[int, int] | None = None,
     ) -> Any:
         if graph_index == 0:
             # before compiling the first graph, record the start time
@@ -197,25 +199,24 @@
         compiled_graph = None

         # try to load from the cache
-        compiled_graph = self.load(graph, example_inputs, graph_index, runtime_shape)
+        compiled_graph = self.load(graph, example_inputs, graph_index, compile_range)
         if compiled_graph is not None:
             if graph_index == num_graphs - 1:
                 # after loading the last graph for this shape, record the time.
                 # there can be multiple graphs due to piecewise compilation.
                 now = time.time()
                 elapsed = now - compilation_start_time
                 compilation_config.compilation_time += elapsed
-            if runtime_shape is None:
+            if compile_range is None:
```
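For context, here is a minimal sketch (not vLLM's actual `CompilerManager`) of what keying the cache by a range changes: one compiled artifact now serves every token count inside its range rather than a single specialized shape. The helper names and the inclusive-bound membership check are assumptions, based on the diff above and the "Make ranges inclusive-inclusive" commit.

```python
from typing import Any

# Cache key mirrors the new type in the diff:
# (compile_range, graph_index, compiler_name); None means the fully dynamic graph.
CacheKey = tuple[tuple[int, int] | None, int, str]
cache: dict[CacheKey, Any] = {}


def put(
    compile_range: tuple[int, int] | None, graph_index: int, compiler_name: str, handle: Any
) -> None:
    # One entry per (range, piecewise graph, compiler) instead of per shape.
    cache[(compile_range, graph_index, compiler_name)] = handle


def lookup(num_tokens: int, graph_index: int, compiler_name: str) -> Any | None:
    # Serve any batch size that falls inside a compiled range; fall back to the
    # fully dynamic entry (key None) if no range matches.
    fallback = None
    for (rng, idx, name), handle in cache.items():
        if idx != graph_index or name != compiler_name:
            continue
        if rng is None:
            fallback = handle
        elif rng[0] <= num_tokens <= rng[1]:
            return handle
    return fallback


# With the ranges asserted in the test above:
put((1, 8), 0, "inductor", "small")
put((8, 32), 0, "inductor", "medium")
put((32, 2049), 0, "inductor", "large")
assert lookup(48, 0, "inductor") == "large"
```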
Review comment: Add the current range to cache key and check the number of times the manager gets called (to make sure the bug you found doesn't manifest).
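A minimal sketch of one way to implement the counting the comment asks for: reuse the range-checking pass from the test above but also count invocations, so the test can assert the post-grad pass ran once per compile range. The class name and the final assertion are illustrative assumptions, not part of the PR.

```python
from torch import fx


class CountingRangeCheckPass(PostGradPassManagerCheckRanges):
    def __init__(self, ranges: list[tuple[int, int]]):
        super().__init__(ranges)
        self.num_calls = 0

    def __call__(self, graph: fx.Graph):
        super().__call__(graph)  # still verify the current compile range
        self.num_calls += 1


# After run_model(...), the test could then check:
#     assert check_pass.num_calls == len(check_pass.ranges)
```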