From 49c9056ea7d0755247b668548892aeae2d0bf73b Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Thu, 6 Nov 2025 09:34:35 +0200
Subject: [PATCH 1/9] Init commit

Signed-off-by: Agata Dobrzyniewicz
---
 vllm_gaudi/extension/features.py         | 1 -
 vllm_gaudi/extension/runtime.py          | 2 ++
 vllm_gaudi/v1/worker/hpu_model_runner.py | 8 ++++++--
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm_gaudi/extension/features.py b/vllm_gaudi/extension/features.py
index 9112ab235..de392d4a9 100644
--- a/vllm_gaudi/extension/features.py
+++ b/vllm_gaudi/extension/features.py
@@ -12,7 +12,6 @@ def get_user_flags():
     flags = [
-        Env('VLLM_USE_V1', boolean),
         Env('VLLM_ENABLE_EXPERIMENTAL_FLAGS', boolean),
         Env('VLLM_EXPONENTIAL_BUCKETING', boolean),
         Env('VLLM_PROMPT_BS_BUCKET_MIN', int),
diff --git a/vllm_gaudi/extension/runtime.py b/vllm_gaudi/extension/runtime.py
index 629a1bcb1..1f267bdce 100644
--- a/vllm_gaudi/extension/runtime.py
+++ b/vllm_gaudi/extension/runtime.py
@@ -63,6 +63,7 @@ def finalize_config():
 
     user_flags = filter_defined(detected, USER_FLAGS)
     experimental_flags = filter_defined(detected, EXPERIMENTAL_FLAGS)
+    experimental_flags = [flag for flag in experimental_flags if flag not in user_flags]
     environment_values = filter_defined(detected, ENVIRONMENT_VALUES)
     feature_values = filter_defined(detected, FEATURE_VALUES)
 
@@ -77,6 +78,7 @@ def finalize_config():
             "From v0.12.0 release using those flags without VLLM_ENABLE_EXPERIMENTAL_FLAGS will trigger a fatal error.")
         logger().warning(footer)
 

+    dump('Environment', environment_values)
     dump('Features', feature_values)
     dump('User flags', user_flags)
diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index d93aae129..00bb49580 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3743,11 +3743,14 @@ def warmup_defragmenter(self):
         logger.info("Defragmenter warmup completed successfully")
 
     def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_batch_seq=0.001):
+        from tqdm import tqdm
+
         total_mem = starting_mem
         idx = 0
         num_candidates = len(buckets)
         captured_all = True
-        for idx, (batch_size, seq_len, num_blocks) in enumerate(reversed(buckets)):
+        developer_settings = get_config().VLLM_ENABLE_EXPERIMENTAL_FLAGS
+        for idx, (batch_size, seq_len, num_blocks) in tqdm(enumerate(reversed(buckets)), desc="Processing warmup"):
             if seq_len > self.max_num_tokens:
                 continue
             # Graph memory usage is proportional to seq dimension in a batch
@@ -3761,7 +3764,8 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
             if graphed_bucket in self.graphed_buckets:
                 continue
             self.graphed_buckets.add(graphed_bucket)
-            self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks)
+            if developer_settings:
+                self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks)
             prompt_cfg, decode_cfg = None, None
             with HabanaMemoryProfiler() as mem_prof:
                 if is_prompt:

From 6c25155c128a656ca7c0557d2b81faac35011d38 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Thu, 6 Nov 2025 15:43:21 +0200
Subject: [PATCH 2/9] Update progress bar

Signed-off-by: Agata Dobrzyniewicz
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 00bb49580..95e052e7e 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3597,7 +3597,7 @@ def log_warmup(self, phase, i, max_i, first_dim, second_dim, third_dim, causal=F
                f"query_len:{second_dim} "
                f"num_blocks:{third_dim} "
                f"free_mem:{free_mem}")
-        logger.info(msg)
+        tqdm.write(msg)
 
     def log_warmup_multimodal(self, phase, i, max_i, batch_size, seq_len, img_args):
         free_mem = format_bytes(HabanaMemoryProfiler.current_free_device_memory())
@@ -3750,7 +3750,9 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
         num_candidates = len(buckets)
         captured_all = True
         developer_settings = get_config().VLLM_ENABLE_EXPERIMENTAL_FLAGS
-        for idx, (batch_size, seq_len, num_blocks) in tqdm(enumerate(reversed(buckets)), desc="Processing warmup"):
+        for idx, (batch_size, seq_len, num_blocks) in tqdm(enumerate(reversed(buckets)),
+                                                           desc="Processing warmup",
+                                                           unit="item"):
             if seq_len > self.max_num_tokens:
                 continue
             # Graph memory usage is proportional to seq dimension in a batch
@@ -3778,6 +3780,9 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
             total_mem += used_mem
             total_batch_seq += batch_seq
 
+            pbar.set_postfix_str(f"{idx}/{num_candidates}")
+            pbar.update(1)
+
         return total_mem, total_batch_seq, captured_all
 
     def warmup_unified_graphs(self, buckets, kv_cache):

From 3873e74deed48574a3c65aa77ecb34be177ac092 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Thu, 6 Nov 2025 15:52:17 +0200
Subject: [PATCH 3/9] Fix

Signed-off-by: Agata Dobrzyniewicz
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 62 ++++++++++++------------
 1 file changed, 30 insertions(+), 32 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 95e052e7e..d450df118 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -8,6 +8,7 @@
 import os
 import sys
 import time
+from tqdm import tqdm
 from dataclasses import dataclass, field, fields
 from typing import (TYPE_CHECKING, Any, Callable, Optional, TypeAlias,
                     Union, cast)
@@ -3743,45 +3744,42 @@ def warmup_defragmenter(self):
         logger.info("Defragmenter warmup completed successfully")
 
     def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_batch_seq=0.001):
-        from tqdm import tqdm
-
         total_mem = starting_mem
         idx = 0
         num_candidates = len(buckets)
         captured_all = True
         developer_settings = get_config().VLLM_ENABLE_EXPERIMENTAL_FLAGS
-        for idx, (batch_size, seq_len, num_blocks) in tqdm(enumerate(reversed(buckets)),
-                                                           desc="Processing warmup",
-                                                           unit="item"):
-            if seq_len > self.max_num_tokens:
-                continue
-            # Graph memory usage is proportional to seq dimension in a batch
-            phase = f"Graph/{'prompt' if is_prompt else 'decode'}"
-            if is_prompt:
-                batch_seq = batch_size * seq_len * num_blocks if num_blocks else batch_size * seq_len
-            else:
-                batch_seq = batch_size
-
-            graphed_bucket = (batch_size, seq_len, num_blocks, is_prompt)
-            if graphed_bucket in self.graphed_buckets:
-                continue
-            self.graphed_buckets.add(graphed_bucket)
-            if developer_settings:
-                self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks)
-            prompt_cfg, decode_cfg = None, None
-            with HabanaMemoryProfiler() as mem_prof:
+        with tqdm(total=num_candidates, desc="Processing warmup", unit="item"):
+            for idx, (batch_size, seq_len, num_blocks) in enumerate(reversed(buckets)):
+                if seq_len > self.max_num_tokens:
+                    continue
+                # Graph memory usage is proportional to seq dimension in a batch
+                phase = f"Graph/{'prompt' if is_prompt else 'decode'}"
                 if is_prompt:
-                    prompt_cfg = (batch_size, seq_len, num_blocks)
+                    batch_seq = batch_size * seq_len * num_blocks if num_blocks else batch_size * seq_len
                 else:
-                    decode_cfg = (batch_size, 1, num_blocks)
-                self._prepare_dummy_scenario(prompt_cfg, decode_cfg)
-            # TODO(kzawora): align_workers
-            used_mem = mem_prof.consumed_device_memory
-            total_mem += used_mem
-            total_batch_seq += batch_seq
-
-            pbar.set_postfix_str(f"{idx}/{num_candidates}")
-            pbar.update(1)
+                    batch_seq = batch_size
+
+                graphed_bucket = (batch_size, seq_len, num_blocks, is_prompt)
+                if graphed_bucket in self.graphed_buckets:
+                    continue
+                self.graphed_buckets.add(graphed_bucket)
+                if developer_settings:
+                    self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks)
+                prompt_cfg, decode_cfg = None, None
+                with HabanaMemoryProfiler() as mem_prof:
+                    if is_prompt:
+                        prompt_cfg = (batch_size, seq_len, num_blocks)
+                    else:
+                        decode_cfg = (batch_size, 1, num_blocks)
+                    self._prepare_dummy_scenario(prompt_cfg, decode_cfg)
+                # TODO(kzawora): align_workers
+                used_mem = mem_prof.consumed_device_memory
+                total_mem += used_mem
+                total_batch_seq += batch_seq
+
+                pbar.set_postfix_str(f"{idx}/{num_candidates}")
+                pbar.update(1)
 
         return total_mem, total_batch_seq, captured_all

From 91b2bebe9af5db0b83644d6f0bba980d5e64a8d9 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Thu, 6 Nov 2025 16:04:33 +0200
Subject: [PATCH 4/9] Fix and ua progress bar

Signed-off-by: Agata Dobrzyniewicz
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index d450df118..770a88daf 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3749,12 +3749,13 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
         num_candidates = len(buckets)
         captured_all = True
         developer_settings = get_config().VLLM_ENABLE_EXPERIMENTAL_FLAGS
-        with tqdm(total=num_candidates, desc="Processing warmup", unit="item"):
+        phase = {'Prompt' if is_prompt else 'Decode'}
+        desc = phase + " warmup processing: "
+        with tqdm(total=num_candidates, desc=desc, unit="item") as pbar:
             for idx, (batch_size, seq_len, num_blocks) in enumerate(reversed(buckets)):
                 if seq_len > self.max_num_tokens:
                     continue
                 # Graph memory usage is proportional to seq dimension in a batch
-                phase = f"Graph/{'prompt' if is_prompt else 'decode'}"
                 if is_prompt:
                     batch_seq = batch_size * seq_len * num_blocks if num_blocks else batch_size * seq_len
                 else:
@@ -3786,13 +3787,16 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
     def warmup_unified_graphs(self, buckets, kv_cache):
         idx = 0
         num_candidates = len(buckets)
-        for idx, (query, shared_ctx, unique_ctx, is_causal) in enumerate(reversed(buckets)):
-            unified_cfg = (query, shared_ctx, unique_ctx, is_causal)
-            if unified_cfg in self.graphed_buckets:
-                continue
-            self.graphed_buckets.add(unified_cfg)
-            self.log_warmup("Unified CFG", idx, num_candidates, query, shared_ctx, unique_ctx, is_causal)
-            self._prepare_dummy_unified_scenario(unified_cfg)
+        with tqdm(total=num_candidates, desc="Unified Attention warmup", unit="item") as pbar:
+            for idx, (query, shared_ctx, unique_ctx, is_causal) in enumerate(reversed(buckets)):
+                unified_cfg = (query, shared_ctx, unique_ctx, is_causal)
+                if unified_cfg in self.graphed_buckets:
+                    continue
+                self.graphed_buckets.add(unified_cfg)
+                self.log_warmup("Unified CFG", idx, num_candidates, query, shared_ctx, unique_ctx, is_causal)
+                self._prepare_dummy_unified_scenario(unified_cfg)
+                pbar.set_postfix_str(f"{idx}/{num_candidates}")
+                pbar.update(1)
 
     def _add_dummy_request(self,
                            requests,

From 0dea976de1f7947e1f959d12448a0236a647e924 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Thu, 6 Nov 2025 16:08:35 +0200
Subject: [PATCH 5/9] Upsie lol

Signed-off-by: Agata Dobrzyniewicz
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 770a88daf..60cc801eb 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3749,8 +3749,8 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
         num_candidates = len(buckets)
         captured_all = True
         developer_settings = get_config().VLLM_ENABLE_EXPERIMENTAL_FLAGS
-        phase = {'Prompt' if is_prompt else 'Decode'}
-        desc = phase + " warmup processing: "
+        phase = 'Prompt' if is_prompt else 'Decode'
+        desc = f'{phase} warmup processing: '
         with tqdm(total=num_candidates, desc=desc, unit="item") as pbar:
             for idx, (batch_size, seq_len, num_blocks) in enumerate(reversed(buckets)):
                 if seq_len > self.max_num_tokens:

From 73c305e0b60f6d80d07d98993ecc1b1d14044ba0 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Thu, 6 Nov 2025 16:37:53 +0200
Subject: [PATCH 6/9] Another one thank you

Signed-off-by: Agata Dobrzyniewicz
---
 vllm_gaudi/extension/runtime.py          | 2 +-
 vllm_gaudi/v1/worker/hpu_model_runner.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/extension/runtime.py b/vllm_gaudi/extension/runtime.py
index 1f267bdce..e60233c54 100644
--- a/vllm_gaudi/extension/runtime.py
+++ b/vllm_gaudi/extension/runtime.py
@@ -63,7 +63,7 @@ def finalize_config():
 
     user_flags = filter_defined(detected, USER_FLAGS)
     experimental_flags = filter_defined(detected, EXPERIMENTAL_FLAGS)
-    experimental_flags = [flag for flag in experimental_flags if flag not in user_flags]
+    experimental_flags = {k: v for k, v in experimental_flags.items() if k not in user_flags}
     environment_values = filter_defined(detected, ENVIRONMENT_VALUES)
     feature_values = filter_defined(detected, FEATURE_VALUES)
 
diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 60cc801eb..e781a12b3 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3787,13 +3787,15 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
     def warmup_unified_graphs(self, buckets, kv_cache):
         idx = 0
         num_candidates = len(buckets)
+        developer_settings = get_config().VLLM_ENABLE_EXPERIMENTAL_FLAGS
         with tqdm(total=num_candidates, desc="Unified Attention warmup", unit="item") as pbar:
             for idx, (query, shared_ctx, unique_ctx, is_causal) in enumerate(reversed(buckets)):
                 unified_cfg = (query, shared_ctx, unique_ctx, is_causal)
                 if unified_cfg in self.graphed_buckets:
                     continue
                 self.graphed_buckets.add(unified_cfg)
-                self.log_warmup("Unified CFG", idx, num_candidates, query, shared_ctx, unique_ctx, is_causal)
+                if developer_settings:
+                    self.log_warmup("Unified CFG", idx, num_candidates, query, shared_ctx, unique_ctx, is_causal)
                 self._prepare_dummy_unified_scenario(unified_cfg)
                 pbar.set_postfix_str(f"{idx}/{num_candidates}")
                 pbar.update(1)

From f2e3541248cdf6dbcd3bd76716442ba24c48dc8d Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Thu, 6 Nov 2025 16:56:19 +0200
Subject: [PATCH 7/9] precommit

Signed-off-by: Agata Dobrzyniewicz
---
 vllm_gaudi/extension/runtime.py          | 1 -
 vllm_gaudi/v1/worker/hpu_model_runner.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm_gaudi/extension/runtime.py b/vllm_gaudi/extension/runtime.py
index e60233c54..7f56f1053 100644
--- a/vllm_gaudi/extension/runtime.py
+++ b/vllm_gaudi/extension/runtime.py
@@ -78,7 +78,6 @@ def finalize_config():
             "From v0.12.0 release using those flags without VLLM_ENABLE_EXPERIMENTAL_FLAGS will trigger a fatal error.")
         logger().warning(footer)
 
-
     dump('Environment', environment_values)
     dump('Features', feature_values)
diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 45d26ed20..f0de8e9f3 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3768,7 +3768,7 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
                     continue
                 self.graphed_buckets.add(graphed_bucket)
                 if developer_settings:
-                    self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks)
+                    self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks)
                 prompt_cfg, decode_cfg = None, None
                 with HabanaMemoryProfiler() as mem_prof:
                     if is_prompt:

From b3cf22ac6010c16a08b59e7adf6d04b83fdefb8b Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Fri, 14 Nov 2025 10:40:35 +0200
Subject: [PATCH 8/9] Change flag name

Signed-off-by: Agata Dobrzyniewicz
---
 vllm_gaudi/extension/features.py         | 2 +-
 vllm_gaudi/extension/runtime.py          | 4 ++--
 vllm_gaudi/v1/worker/hpu_model_runner.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm_gaudi/extension/features.py b/vllm_gaudi/extension/features.py
index fd6275524..d657fc279 100644
--- a/vllm_gaudi/extension/features.py
+++ b/vllm_gaudi/extension/features.py
@@ -12,7 +12,7 @@ def get_user_flags():
     flags = [
-        Env('VLLM_ENABLE_EXPERIMENTAL_FLAGS', boolean),
+        Env('VLLM_DEVELOPER_MODE', boolean),
         Env('VLLM_EXPONENTIAL_BUCKETING', boolean),
         Env('VLLM_PROMPT_BS_BUCKET_MIN', int),
         Env('VLLM_PROMPT_BS_BUCKET_STEP', int),
diff --git a/vllm_gaudi/extension/runtime.py b/vllm_gaudi/extension/runtime.py
index 7f56f1053..f800843b5 100644
--- a/vllm_gaudi/extension/runtime.py
+++ b/vllm_gaudi/extension/runtime.py
@@ -67,7 +67,7 @@ def finalize_config():
     environment_values = filter_defined(detected, ENVIRONMENT_VALUES)
     feature_values = filter_defined(detected, FEATURE_VALUES)
 
-    if len(experimental_flags) > 0 and not detected.VLLM_ENABLE_EXPERIMENTAL_FLAGS:
+    if len(experimental_flags) > 0 and not detected.VLLM_DEVELOPER_MODE:
         asterisks = 48 * '*'
         header = f"{asterisks} Warning! {asterisks}"
         footer = '*' * len(header)
@@ -75,7 +75,7 @@ def finalize_config():
         logger().warning(
             f"Following environment variables are considered experimental: {', '.join(experimental_flags)}")
         logger().warning(
-            "From v0.12.0 release using those flags without VLLM_ENABLE_EXPERIMENTAL_FLAGS will trigger a fatal error.")
+            "From v0.12.0 release using those flags without VLLM_DEVELOPER_MODE will trigger a fatal error.")
         logger().warning(footer)
 
     dump('Environment', environment_values)
diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 11e5a6880..de51c390b 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -3761,7 +3761,7 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
         idx = 0
         num_candidates = len(buckets)
         captured_all = True
-        developer_settings = get_config().VLLM_ENABLE_EXPERIMENTAL_FLAGS
+        developer_settings = get_config().VLLM_DEVELOPER_MODE
         phase = 'Prompt' if is_prompt else 'Decode'
         desc = f'{phase} warmup processing: '
         with tqdm(total=num_candidates, desc=desc, unit="item") as pbar:
@@ -3800,7 +3800,7 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
     def warmup_unified_graphs(self, buckets, kv_cache):
         idx = 0
         num_candidates = len(buckets)
-        developer_settings = get_config().VLLM_ENABLE_EXPERIMENTAL_FLAGS
+        developer_settings = get_config().VLLM_DEVELOPER_MODE
         with tqdm(total=num_candidates, desc="Unified Attention warmup", unit="item") as pbar:
             for idx, (query, shared_ctx, unique_ctx, is_causal) in enumerate(reversed(buckets)):
                 unified_cfg = (query, shared_ctx, unique_ctx, is_causal)

From 06cae201b89485df05609ec5ff3b3856d5671d29 Mon Sep 17 00:00:00 2001
From: Agata Dobrzyniewicz
Date: Fri, 14 Nov 2025 10:43:44 +0200
Subject: [PATCH 9/9] readme

Signed-off-by: Agata Dobrzyniewicz
---
 docs/configuration/env_vars.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md
index 204f93215..286043987 100644
--- a/docs/configuration/env_vars.md
+++ b/docs/configuration/env_vars.md
@@ -23,7 +23,9 @@ This document lists the supported diagnostic and profiling, as well as performan
 | `VLLM_EXPONENTIAL_BUCKETING` | Enables exponential bucket spacing instead of linear spacing. | `true` |
 | `VLLM_BUCKETING_FROM_FILE` | Enables reading bucket configuration from file | `None` |
 
-## Experimental Parameters
+## Developer Mode Parameters
+
+To enter developer mode, set `VLLM_DEVELOPER_MODE`:
 
 | Parameter name | Description | Default value |
 | ------------------ | ------------------------ | ------------- |
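For illustration, a minimal sketch of how the new flag could be exercised once this series is applied; the model name and the `LLM` entry point below are placeholders rather than part of the patches, and the flag itself is simply a boolean environment variable read through `get_user_flags()`:

```python
import os

# Assumption: VLLM_DEVELOPER_MODE has to be set before vLLM's Gaudi configuration
# is finalized, since get_user_flags() reads it from the environment at startup.
os.environ["VLLM_DEVELOPER_MODE"] = "true"

from vllm import LLM  # any vLLM entry point works; used here purely as an example

llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # hypothetical model choice
```

With the flag set, the per-bucket `log_warmup` messages are emitted (via `tqdm.write`) alongside the warmup progress bars; without it, only the progress bars are shown.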