Send kv events from worker side to scheduler side

hickeyma · hickeyma · commit f2241f4df0b0 · 2025-11-17T10:40:25.000Z
This is required when worker-side operations such as CPU offloading
generate KV cache events. This commit enables these events to be passed
to the scheduler side so that they can be published by the engine.

Signed-off-by: Martin Hickey <martin.hickey@ie.ibm.com>
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -49,7 +49,7 @@
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
     from vllm.config import VllmConfig
-    from vllm.distributed.kv_events import KVCacheEvent
+    from vllm.distributed.kv_events import KVCacheEvent, KVEventBatch
     from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
         KVConnectorPromMetrics,
         KVConnectorStats,
@@ -350,6 +350,12 @@ def get_kv_connector_stats(self) -> Optional["KVConnectorStats"]:
         """
         return None
 
+    def get_kv_connector_kv_cache_events(self) -> Optional["KVEventBatch"]:
+        """
+        Get the KV connector kv cache events collected during the last interval.
+        """
+        return None  # base behaviour: no KV cache events to report
+
     def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None:
         """
         Get the KVConnector handshake metadata for this connector.
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py
@@ -1,20 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING, Any
+import time
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Any, Optional
 
 import torch
 from lmcache.integration.vllm.vllm_v1_adapter import (
     LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl,
 )
 
 from vllm.config import VllmConfig
+from vllm.distributed.kv_events import BlockStored, KVCacheEvent, KVEventBatch
 from vllm.distributed.kv_transfer.kv_connector.v1.base import (
     KVConnectorBase_V1,
     KVConnectorMetadata,
     KVConnectorRole,
 )
 from vllm.logger import init_logger
 from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.outputs import KVConnectorOutput
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
@@ -54,6 +58,8 @@ def __init__(
 
         self._lmcache_engine = cls(vllm_config, role, self)
 
+        self._kv_events: list[KVCacheEvent] = []
+
     # ==============================
     # Worker-side methods
     # ==============================
@@ -151,6 +157,30 @@ def get_block_ids_with_load_errors(self) -> set[int]:
         # Fallback for older versions that don't support this method
         return set()
 
+    def get_kv_connector_kv_cache_events(self) -> Optional["KVEventBatch"]:
+        """
+        Batch the KV events reported by the LMCache engine.
+
+        Each engine event is re-wrapped as a ``BlockStored``; ``None`` is
+        returned when the engine has no events to report.
+        """
+        engine_events = self._lmcache_engine.get_kv_events()
+        if not engine_events:
+            return None
+
+        stored = [
+            BlockStored(
+                block_hashes=ev.block_hashes,
+                parent_block_hash=ev.parent_block_hash,
+                token_ids=ev.token_ids,
+                lora_id=ev.lora_id,
+                block_size=ev.block_size,
+                medium=ev.medium,
+            )
+            for ev in engine_events
+        ]
+        return KVEventBatch(ts=time.time(), events=stored) if stored else None
+
     # ==============================
     # Scheduler-side methods
     # ==============================
@@ -198,6 +228,25 @@ def build_connector_meta(
         """
         return self._lmcache_engine.build_connector_meta(scheduler_output)
 
+    def update_connector_output(self, connector_output: KVConnectorOutput):
+        """
+        Update KVConnector state from worker-side connectors output.
+
+        Events are appended to the pending list rather than assigned, so a
+        batch arriving before ``take_events`` runs is kept and the list owned
+        by ``connector_output`` is not aliased.
+
+        Args:
+            connector_output (KVConnectorOutput): the worker-side
+                connectors output.
+        """
+        kv_events = connector_output.kv_cache_events
+        # Skip a missing batch, an unexpected payload type, or an empty batch.
+        if not isinstance(kv_events, KVEventBatch) or not kv_events.events:
+            return
+        # Copy into our own list; take_events() later empties it.
+        self._kv_events.extend(kv_events.events)
+
     def request_finished(
         self,
         request: "Request",
@@ -214,3 +263,14 @@ def request_finished(
             returned by the engine.
         """
         return self._lmcache_engine.request_finished(request, block_ids)
+
+    def take_events(self) -> Iterable["KVCacheEvent"]:
+        """
+        Take the KV cache events from the connector.
+
+        Yields:
+            New KV cache events since the last call.
+        """
+        # Detach the pending list first so each event is handed out only once.
+        pending, self._kv_events = self._kv_events or [], []
+        yield from pending
diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py
@@ -11,9 +11,11 @@
 from vllm.v1.core.sched.output import SchedulerOutput
 
 if TYPE_CHECKING:
+    from vllm.distributed.kv_events import KVEventBatch
     from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats
 else:
     KVConnectorStats = object
+    KVEventBatch = object
 
 
 class LogprobsLists(NamedTuple):
@@ -119,6 +121,7 @@ class KVConnectorOutput:
     finished_sending: set[str] | None = None
     finished_recving: set[str] | None = None
     kv_connector_stats: KVConnectorStats | None = None
+    kv_cache_events: KVEventBatch | None = None
     # IDs of externally computed KV blocks that failed to load.
     # Requests referencing these blocks should be rescheduled to recompute them
     invalid_block_ids: set[int] = field(default_factory=set)
@@ -134,6 +137,7 @@ def is_empty(self):
             not self.finished_sending
             and not self.finished_recving
             and not self.kv_connector_stats
+            and not self.kv_cache_events
             and not self.invalid_block_ids
         )
 
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -135,10 +135,19 @@ def _get_kv_connector_output(
             output.kv_connector_stats = (
                 KVConnectorModelRunnerMixin.get_kv_connector_stats()
             )
+            output.kv_cache_events = (
+                KVConnectorModelRunnerMixin.get_kv_connector_kv_cache_events()
+            )
             kv_connector.clear_connector_metadata()
 
     @staticmethod
     def get_kv_connector_stats() -> KVConnectorStats | None:
         if has_kv_transfer_group():
             return get_kv_transfer_group().get_kv_connector_stats()
         return None
+
+    @staticmethod
+    def get_kv_connector_kv_cache_events() -> KVConnectorStats | None:
+        if has_kv_transfer_group():  # only forward when a transfer group exists
+            return get_kv_transfer_group().get_kv_connector_kv_cache_events()
+        return None  # NOTE(review): annotation says KVConnectorStats; confirm type