
Commit 82085eb

Fix preemption handling (#524)
This PR fixes a multitude of bugs we had in preemption handling:

- Fixed the output token update of `CachedRequestState`: it was updated twice per iteration, resulting in doubled tokens, which broke preemption when a request was being re-added to the input batch.
- Batch preparation now uses input+output tokens in prefill for preempted sequences (both non-unified and unified attention).
- Preempted sequences now get correctly recognized as prefills after they exceed their original prefill length (e.g. if the prompt was 3 tokens and the sequence generated 1024 tokens before preemption, it would previously get treated as a decode after the first 3 tokens).
- Removed some incorrect assumptions about prefills (namely, that they can have no pre-existing output tokens).

Scenarios with preemptions yield proper accuracy, as can be tested with very low `gpu_memory_utilization` and relatively high `max_num_seqs`:

```
PT_HPU_LAZY_MODE=1 VLLM_SKIP_WARMUP=true lm_eval --model vllm --model_args pretrained=/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct/,enforce_eager=False,dtype=bfloat16,max_num_seqs=128,gpu_memory_utilization=0.05,max_model_len=4096,enable_prefix_caching=True,add_bos_token=false,tensor_parallel_size=1,max_gen_toks=2048 --tasks gsm8k_cot_llama --batch_size auto --trust_remote_code --apply_chat_template --fewshot_as_multiturn --num_fewshot 8

|     Tasks     |Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|---------------|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k_cot_llama|      3|flexible-extract|     8|exact_match|↑  |0.8408|±  |0.0101|
|               |       |strict-match    |     8|exact_match|↑  |0.8415|±  |0.0101|
```

---------

Signed-off-by: Konrad Zawora <kzawora@habana.ai>
1 parent f4aeae8 commit 82085eb
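To make the changes easier to follow, here is a minimal editorial sketch of the core bookkeeping idea behind the fix (not the vllm_gaudi code itself; `PreemptedRequest` and `effective_prefill_len` are hypothetical names): once a request has been fully preempted, the tokens it generated before preemption are folded into its prefill length, so the whole sequence is recomputed as a prefill when it is re-added.

```python
# Editorial sketch only: names are hypothetical, not the actual vllm_gaudi APIs.
from dataclasses import dataclass, field


@dataclass
class PreemptedRequest:
    prompt_token_ids: list[int]
    output_token_ids: list[int] = field(default_factory=list)
    num_computed_tokens: int = 0  # reset to 0 by a full preemption


def effective_prefill_len(req: PreemptedRequest) -> int:
    """Length of the prefill that must be recomputed when the request is re-added.

    In the non-preemption case output_token_ids is empty, so this equals the
    prompt length. After a full preemption (num_computed_tokens == 0), the
    previously emitted tokens are part of the recomputed prefill as well.
    """
    if req.num_computed_tokens == 0:
        return len(req.prompt_token_ids) + len(req.output_token_ids)
    return len(req.prompt_token_ids)


# Example from the commit message: a 3-token prompt that generated 1024 tokens
# before being preempted must be recomputed as a 1027-token prefill, not
# treated as a decode after the first 3 tokens.
req = PreemptedRequest(prompt_token_ids=[1, 2, 3], output_token_ids=list(range(1024)))
assert effective_prefill_len(req) == 1027
```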

2 files changed, +25 -18 lines changed


vllm_gaudi/v1/worker/hpu_input_batch.py

Lines changed: 9 additions & 0 deletions
@@ -256,6 +256,15 @@ def add_request(
         start_idx = num_prompt_tokens
         end_idx = start_idx + len(request.output_token_ids)
         self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids
+        #NOTE(kzawora): In non-preemption scenario,
+        # self.input_batch.num_prompt_tokens[batch_idx] == self.input_batch.num_tokens[batch_idx].
+        # In preemption scenario, we want num_prompt_tokens to also include the tokens emitted before preemption,
+        # as that is used as basis for recomputing prefill.
+        # This also assumes that preemption is complete and reduces num_computed_tokens to 0 and preempted sequences
+        # don't retain any originally used cache blocks.
+        if request.num_computed_tokens == 0:
+            self.num_prompt_tokens[req_index] = num_prompt_tokens + len(request.output_token_ids)
+
         # Number of token ids in token_ids_cpu.
         # NOTE(woosuk): This may include spec decode tokens.
         self.num_tokens[req_index] = request.num_tokens
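As an illustration of the `add_request` change above, this editorial sketch uses plain Python lists in place of the numpy-backed token buffer and per-request counters (an assumption for readability) to show how a re-added, fully preempted request is laid out and where the prompt boundary must point for the prefill to be recomputed correctly.

```python
# Editorial sketch only: plain lists stand in for input_batch.token_ids_cpu
# and the per-request counters used by the real code.

prompt = [101, 102, 103]          # 3 prompt tokens
generated = [7] * 5               # tokens emitted before preemption
num_computed_tokens = 0           # full preemption: no cache blocks retained

# add_request writes the prompt first, then the previously generated outputs.
token_buffer = prompt + generated

# Old behaviour: the prompt boundary stopped at the original prompt, so only
# 3 tokens would be recomputed as prefill.
num_prompt_tokens_old = len(prompt)                     # 3

# Fixed behaviour: with num_computed_tokens == 0, the pre-preemption outputs
# are counted into the prefill that has to be recomputed.
num_prompt_tokens_new = len(prompt) + len(generated)    # 8

assert token_buffer[:num_prompt_tokens_new] == prompt + generated
```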

vllm_gaudi/v1/worker/hpu_model_runner.py

Lines changed: 16 additions & 18 deletions
@@ -1489,7 +1489,6 @@ def _get_prompts_and_decodes(
             num_computed_tokens = self.input_batch.num_computed_tokens_cpu[i]
             num_prompt_tokens = self.input_batch.num_prompt_tokens[i]
             num_scheduled_tokens = scheduler_output.num_scheduled_tokens[req_id]
-
             if num_computed_tokens < num_prompt_tokens and \
                     not self.is_decoder_only(req_id):
                 # This is prompt
@@ -1518,11 +1517,7 @@ def _get_prompts_and_decodes(

             # Must be prompt
             assert num_computed_tokens < num_prompt_tokens
-            num_output_tokens = len(self.requests[req_id].output_token_ids)
-            if not has_kv_transfer_group():
-                #P case num_output_tokens has non 0
-                assert num_output_tokens == 0, \
-                    f'req_id: {req_id}, {num_output_tokens}'
+            # NOTE(kzawora): In preempted sequences, num_output_tokens can be > 0, and still be a valid prefill

             prompt_req_ids.append(req_id)
             prompt_scheduled_tokens.append(num_scheduled_tokens)
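A short editorial sketch of the prompt/decode split that the two hunks above rely on, using plain integers instead of the input batch arrays (the helper name `is_prefill` is an assumption): a request keeps being scheduled as a prefill while its computed tokens have not caught up with its, possibly preemption-extended, prompt length.

```python
# Editorial sketch: the prefill/decode classification criterion, with plain
# ints instead of the model runner's batch arrays.

def is_prefill(num_computed_tokens: int, num_prompt_tokens: int) -> bool:
    # A request is still a prompt (prefill) while its computed tokens haven't
    # caught up with its (possibly preemption-extended) prompt length.
    return num_computed_tokens < num_prompt_tokens

# Non-preempted request: 3-token prompt, first step is a prefill, then decode.
assert is_prefill(num_computed_tokens=0, num_prompt_tokens=3)
assert not is_prefill(num_computed_tokens=3, num_prompt_tokens=3)

# Preempted request: 3-token prompt plus 1024 pre-preemption outputs.
# With num_prompt_tokens extended to 1027, it stays a prefill past token 3
# instead of being mis-classified as a decode.
assert is_prefill(num_computed_tokens=512, num_prompt_tokens=1027)
```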
@@ -1678,26 +1673,29 @@ def _extract_prefill_batch_contents(self, num_prefills, num_decodes, num_schedul

         for batch_idx in range(num_decodes, num_reqs):
             req_id = self.input_batch.req_ids[batch_idx]
-            context_len = self.input_batch.num_computed_tokens_cpu[batch_idx]
-            query_len = num_scheduled_tokens[batch_idx]
+            seq_num_computed_tokens = self.input_batch.num_computed_tokens_cpu[batch_idx]
+            seq_num_scheduled_tokens = num_scheduled_tokens[batch_idx]

-            token_ids = self.input_batch.token_ids_cpu[batch_idx, context_len:context_len + query_len].tolist()
+            token_ids = self.input_batch.token_ids_cpu[batch_idx, seq_num_computed_tokens:seq_num_computed_tokens +
+                                                       seq_num_scheduled_tokens].tolist()

-            num_blocks = round_up(context_len + query_len, self.block_size) // self.block_size
+            num_blocks = round_up(seq_num_computed_tokens + seq_num_scheduled_tokens,
+                                  self.block_size) // self.block_size
             blocks = block_table_cpu_tensor[batch_idx, :num_blocks].tolist()
             if not warmup:
                 blocks = [self.defragmenter.resolve(b) for b in blocks]
-
-            prompt_tokens = self.input_batch.num_prompt_tokens[batch_idx]
-            # TODO: Fix non-prompt case
-            num_output_logits = max(0, context_len + query_len - prompt_tokens + 1)
-            logits_positions = list(range(query_len - num_output_logits, query_len))
+            #NOTE(kzawora): In non-preemption scenario,
+            # self.input_batch.num_prompt_tokens[batch_idx] == self.input_batch.num_tokens[batch_idx].
+            # In preemption scenario num_tokens will also include the tokens emitted before preemption
+            num_prompt_tokens = self.input_batch.num_prompt_tokens[batch_idx]
+            num_output_logits = max(0, seq_num_computed_tokens + seq_num_scheduled_tokens - num_prompt_tokens + 1)
+            logits_positions = list(range(seq_num_scheduled_tokens - num_output_logits, seq_num_scheduled_tokens))

             new_batch_contents = BatchContents(
                 req_ids=[req_id],
                 token_ids=[token_ids],
-                context_lens=[context_len],
-                prompt_lens=[prompt_tokens],
+                context_lens=[seq_num_computed_tokens],
+                prompt_lens=[num_prompt_tokens],
                 blocks=[blocks],
                 logits_positions=[logits_positions],
             )
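A hedged worked example of the `num_output_logits` / `logits_positions` arithmetic introduced in the hunk above, with made-up numbers and a hypothetical helper (`prefill_logits_positions`); the real code reads these values from the input batch and the scheduler output.

```python
# Editorial worked example of the logits-position arithmetic above.

def prefill_logits_positions(num_computed: int, num_scheduled: int, num_prompt: int) -> list[int]:
    # Number of positions in this chunk that fall at or beyond the last prompt
    # token and therefore need logits (sampling) outputs.
    num_output_logits = max(0, num_computed + num_scheduled - num_prompt + 1)
    return list(range(num_scheduled - num_output_logits, num_scheduled))

# Plain prefill: an 8-token prompt processed in one 8-token chunk produces a
# logit only at the last position.
assert prefill_logits_positions(num_computed=0, num_scheduled=8, num_prompt=8) == [7]

# First chunk of a chunked prefill: no position reaches the prompt end yet.
assert prefill_logits_positions(num_computed=0, num_scheduled=4, num_prompt=8) == []

# Preempted request recomputed as prefill: the prompt length now includes the
# pre-preemption outputs (e.g. 3 + 1024 = 1027), so the final 3-token chunk
# again yields exactly one logit position.
assert prefill_logits_positions(num_computed=1024, num_scheduled=3, num_prompt=1027) == [2]
```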
@@ -3331,7 +3329,7 @@ def execute_model(
                 num_tokens = len(token_ids)
                 self.input_batch.token_ids_cpu[i, seq_len:seq_len + num_tokens] = token_ids
                 self.input_batch.num_tokens[i] += len(token_ids)
-                req_state.output_token_ids.extend(token_ids)
+
                 # NOTE(chendi): enable cache based on PR(#20291)
                 # Cache the sampled tokens in the model runner, so that the scheduler
                 # doesn't need to send them back.
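Finally, an editorial sketch of the double-update bug that the removed `req_state.output_token_ids.extend(...)` line caused (the class and variable names here are illustrative, not the runner's actual structures): the same step's sampled tokens were appended to the cached request state twice, so a re-added preempted request replayed twice as many output tokens.

```python
# Editorial sketch of the double-update bug fixed above: two code paths both
# appending the same step's sampled tokens to the cached request state.

class ReqState:
    def __init__(self) -> None:
        self.output_token_ids: list[int] = []

req_state = ReqState()
sampled = [42]                        # one token sampled this iteration

# Path A (kept elsewhere in the runner) records the token once.
req_state.output_token_ids.extend(sampled)

# Path B (the line removed in this hunk) recorded it a second time.
req_state.output_token_ids.extend(sampled)

# The doubled history is what broke preemption: when the request was re-added
# to the input batch, twice as many "output" tokens were replayed as prefill.
assert req_state.output_token_ids == [42, 42]   # should have been [42]
```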
