Commit 3198532

[Disagg] Fixes for vllm model impl disagg support (#1066)
1 parent a14b215 commit 3198532

File tree

2 files changed: +20 -3 lines changed

tpu_inference/core/core_tpu.py

Lines changed: 8 additions & 3 deletions

@@ -446,13 +446,15 @@ def __init__(
         executor_fail_callback: Optional[Callable] = None,
     ):
         self.vllm_config = vllm_config
-        self.vllm_config.cache_config.gpu_memory_utilization = (
-            self.vllm_config.cache_config.gpu_memory_utilization - 0.1)

         self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs],
                                               bytes]]()

         self.devices = jax.devices()
+        device_kind = self.devices[0].device_kind
+        if device_kind != 'TPU7x':
+            self.vllm_config.cache_config.gpu_memory_utilization = (
+                self.vllm_config.cache_config.gpu_memory_utilization - 0.1)
         prefill_slice_sizes, decode_slice_sizes, slice_sizes = _get_slice_sizes(
             self.devices)

@@ -597,7 +599,6 @@ def __init__(
         # engine core to be executed, instead we create other instance of
         # engine cores and let them do the work.
         self.vllm_config = vllm_config
-        self.vllm_config.cache_config.gpu_memory_utilization = self.vllm_config.cache_config.gpu_memory_utilization - 0.1

         # We should be taking the input from the client, the code below is forked from
         # vllm.v1.engine.core.EngineCoreProc.

@@ -610,6 +611,10 @@ def __init__(
         self.engines_running = False

         self.devices = jax.devices()
+        device_kind = self.devices[0].device_kind
+        if device_kind != 'TPU7x':
+            self.vllm_config.cache_config.gpu_memory_utilization = (
+                self.vllm_config.cache_config.gpu_memory_utilization - 0.1)
         prefill_slice_sizes, decode_slice_sizes, slice_sizes = _get_slice_sizes(
             self.devices)
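Note: the core_tpu.py change gates the 0.1 gpu_memory_utilization reduction on the device kind reported by JAX, so the extra headroom is only reserved on devices other than TPU7x. Below is a minimal standalone sketch of that pattern, assuming JAX is installed; the helper name and the SimpleNamespace stand-in for vLLM's CacheConfig are illustrative, not part of the commit.

    # Sketch: reserve 0.1 of memory-utilization headroom unless running on TPU7x.
    from types import SimpleNamespace

    import jax


    def adjust_memory_utilization(cache_config, devices=None):
        # Mirrors the commit's check: inspect the first device's kind and only
        # lower gpu_memory_utilization on non-TPU7x hardware.
        devices = devices or jax.devices()
        if devices[0].device_kind != 'TPU7x':
            cache_config.gpu_memory_utilization = (
                cache_config.gpu_memory_utilization - 0.1)
        return cache_config


    if __name__ == '__main__':
        cfg = SimpleNamespace(gpu_memory_utilization=0.9)
        print(adjust_memory_utilization(cfg).gpu_memory_utilization)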

tpu_inference/models/vllm/vllm_model_wrapper.py

Lines changed: 12 additions & 0 deletions

@@ -81,9 +81,21 @@ def __init__(self, vllm_config: VllmConfig, rng: PRNGKey, mesh: Mesh):

     def load_weights(self):
         # Set up to load the model into CPU first.
+        # Cache device slice config since device config cannot be deepcopied
+        modified_slice_config = False
+        if hasattr(
+                self.vllm_config.device_config,
+                'slice') and self.vllm_config.device_config.slice is not None:
+            slice_config = self.vllm_config.device_config.slice
+            modified_slice_config = True
+            self.vllm_config.device_config.slice = None
         vllm_config_for_load = copy.deepcopy(self.vllm_config)
+        if modified_slice_config:
+            self.vllm_config.device_config.slice = slice_config
         assert self.vllm_config.model_config.dtype in TORCH_DTYPE_TO_JAX, "The model_config.dtype must be a PyTorch dtype."
         vllm_config_for_load.device_config.device = "cpu"
+        # Clearing the cached compilation config, otherwise vllm model init will fail
+        vllm_config_for_load.compilation_config.static_forward_context.clear()

         # When expert parallelism is enabled, vLLM loads weight in sharding
         # aware manner. Since tpu-inference has its own sharding logic, this
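Note: in vllm_model_wrapper.py the fix stashes device_config.slice before copy.deepcopy (the slice object cannot be deep-copied), restores it afterwards, and clears the cached static_forward_context before vLLM model init. Below is a minimal sketch of the stash/copy/restore part only, with SimpleNamespace standing in for vLLM's DeviceConfig and a threading.Lock playing the role of the non-copyable slice object; both stand-ins are assumptions for illustration.

    # Sketch: temporarily remove an attribute that breaks copy.deepcopy,
    # take the copy, then restore the attribute on the original object.
    import copy
    import threading
    from types import SimpleNamespace

    device_config = SimpleNamespace(device='tpu', slice=threading.Lock())

    # Stash the non-copyable attribute and null it out on the original.
    saved_slice, device_config.slice = device_config.slice, None
    config_for_load = copy.deepcopy(device_config)
    # Restore the original config once the copy exists; the copy keeps slice=None.
    device_config.slice = saved_slice

    config_for_load.device = 'cpu'
    print(config_for_load.device, device_config.device)  # cpu tpu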
