8 | 8 | import os |
9 | 9 | import sys |
10 | 10 | import time |
| 11 | +from tqdm import tqdm |
11 | 12 | from dataclasses import dataclass, field, fields |
12 | 13 | from typing import (TYPE_CHECKING, Any, Callable, Optional, TypeAlias, Union, cast) |
13 | 14 |
@@ -3615,7 +3616,7 @@ def log_warmup(self, phase, i, max_i, first_dim, second_dim, third_dim, causal=F |
3615 | 3616 | f"query_len:{second_dim} " |
3616 | 3617 | f"num_blocks:{third_dim} " |
3617 | 3618 | f"free_mem:{free_mem}") |
3618 | | - logger.info(msg) |
| 3619 | + tqdm.write(msg) |
3619 | 3620 |
3620 | 3621 | def log_warmup_multimodal(self, phase, i, max_i, batch_size, seq_len, img_args): |
3621 | 3622 | free_mem = format_bytes(HabanaMemoryProfiler.current_free_device_memory()) |
@@ -3765,45 +3766,57 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat |
3765 | 3766 | idx = 0 |
3766 | 3767 | num_candidates = len(buckets) |
3767 | 3768 | captured_all = True |
3768 | | - for idx, (batch_size, seq_len, num_blocks) in enumerate(reversed(buckets)): |
3769 | | - if seq_len > self.max_num_tokens: |
3770 | | - continue |
3771 | | - # Graph memory usage is proportional to seq dimension in a batch |
3772 | | - phase = f"Graph/{'prompt' if is_prompt else 'decode'}" |
3773 | | - if is_prompt: |
3774 | | - batch_seq = batch_size * seq_len * num_blocks if num_blocks else batch_size * seq_len |
3775 | | - else: |
3776 | | - batch_seq = batch_size |
3777 | | - |
3778 | | - graphed_bucket = (batch_size, seq_len, num_blocks, is_prompt) |
3779 | | - if graphed_bucket in self.graphed_buckets: |
3780 | | - continue |
3781 | | - self.graphed_buckets.add(graphed_bucket) |
3782 | | - self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks) |
3783 | | - prompt_cfg, decode_cfg = None, None |
3784 | | - with HabanaMemoryProfiler() as mem_prof: |
| 3769 | + developer_settings = get_config().VLLM_DEVELOPER_MODE |
| 3770 | + phase = 'Prompt' if is_prompt else 'Decode' |
| 3771 | + desc = f'{phase} warmup processing: ' |
| 3772 | + with tqdm(total=num_candidates, desc=desc, unit="item") as pbar: |
| 3773 | + for idx, (batch_size, seq_len, num_blocks) in enumerate(reversed(buckets)): |
| 3774 | + if seq_len > self.max_num_tokens: |
| 3775 | + continue |
| 3776 | + # Graph memory usage is proportional to seq dimension in a batch |
3785 | 3777 | if is_prompt: |
3786 | | - prompt_cfg = (batch_size, seq_len, num_blocks) |
| 3778 | + batch_seq = batch_size * seq_len * num_blocks if num_blocks else batch_size * seq_len |
3787 | 3779 | else: |
3788 | | - decode_cfg = (batch_size, 1, num_blocks) |
3789 | | - self._prepare_dummy_scenario(prompt_cfg, decode_cfg) |
3790 | | - # TODO(kzawora): align_workers |
3791 | | - used_mem = mem_prof.consumed_device_memory |
3792 | | - total_mem += used_mem |
3793 | | - total_batch_seq += batch_seq |
| 3780 | + batch_seq = batch_size |
| 3781 | + |
| 3782 | + graphed_bucket = (batch_size, seq_len, num_blocks, is_prompt) |
| 3783 | + if graphed_bucket in self.graphed_buckets: |
| 3784 | + continue |
| 3785 | + self.graphed_buckets.add(graphed_bucket) |
| 3786 | + if developer_settings: |
| 3787 | + self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks) |
| 3788 | + prompt_cfg, decode_cfg = None, None |
| 3789 | + with HabanaMemoryProfiler() as mem_prof: |
| 3790 | + if is_prompt: |
| 3791 | + prompt_cfg = (batch_size, seq_len, num_blocks) |
| 3792 | + else: |
| 3793 | + decode_cfg = (batch_size, 1, num_blocks) |
| 3794 | + self._prepare_dummy_scenario(prompt_cfg, decode_cfg) |
| 3795 | + # TODO(kzawora): align_workers |
| 3796 | + used_mem = mem_prof.consumed_device_memory |
| 3797 | + total_mem += used_mem |
| 3798 | + total_batch_seq += batch_seq |
| 3799 | + |
| 3800 | + pbar.set_postfix_str(f"{idx + 1}/{num_candidates}") |
| 3801 | + pbar.update(1) |
3794 | 3802 |
3795 | 3803 | return total_mem, total_batch_seq, captured_all |
3796 | 3804 |
3797 | 3805 | def warmup_unified_graphs(self, buckets, kv_cache): |
3798 | 3806 | idx = 0 |
3799 | 3807 | num_candidates = len(buckets) |
3800 | | - for idx, (query, shared_ctx, unique_ctx, is_causal) in enumerate(reversed(buckets)): |
3801 | | - unified_cfg = (query, shared_ctx, unique_ctx, is_causal) |
3802 | | - if unified_cfg in self.graphed_buckets: |
3803 | | - continue |
3804 | | - self.graphed_buckets.add(unified_cfg) |
3805 | | - self.log_warmup("Unified CFG", idx, num_candidates, query, shared_ctx, unique_ctx, is_causal) |
3806 | | - self._prepare_dummy_unified_scenario(unified_cfg) |
| 3808 | + developer_settings = get_config().VLLM_DEVELOPER_MODE |
| 3809 | + with tqdm(total=num_candidates, desc="Unified Attention warmup", unit="item") as pbar: |
| 3810 | + for idx, (query, shared_ctx, unique_ctx, is_causal) in enumerate(reversed(buckets)): |
| 3811 | + unified_cfg = (query, shared_ctx, unique_ctx, is_causal) |
| 3812 | + if unified_cfg in self.graphed_buckets: |
| 3813 | + continue |
| 3814 | + self.graphed_buckets.add(unified_cfg) |
| 3815 | + if developer_settings: |
| 3816 | + self.log_warmup("Unified CFG", idx, num_candidates, query, shared_ctx, unique_ctx, is_causal) |
| 3817 | + self._prepare_dummy_unified_scenario(unified_cfg) |
| 3818 | + pbar.set_postfix_str(f"{idx + 1}/{num_candidates}") |
| 3819 | + pbar.update(1) |
3807 | 3820 |
3808 | 3821 | def _add_dummy_request(self, |
3809 | 3822 | requests, |
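For context: the change pairs a tqdm progress bar with tqdm.write() so that warmup log lines print above the live bar instead of breaking it, and it gates the per-bucket lines behind a developer-mode flag. Below is a minimal, self-contained sketch of that pattern only; the bucket list, the developer_mode flag, and the sleep call are illustrative stand-ins, not vLLM code.

import time

from tqdm import tqdm

# Illustrative stand-ins (not vLLM objects): a few (batch_size, query_len)
# buckets and a flag playing the role of the developer-mode switch.
buckets = [(bs, seq) for bs in (1, 2, 4) for seq in (128, 256, 512)]
developer_mode = True

with tqdm(total=len(buckets), desc="Decode warmup", unit="item") as pbar:
    for idx, (batch_size, query_len) in enumerate(reversed(buckets)):
        if developer_mode:
            # tqdm.write() prints above the active bar rather than through it,
            # which is why the patch swaps logger.info(msg) for tqdm.write(msg).
            tqdm.write(f"[Warmup][{idx + 1}/{len(buckets)}] "
                       f"batch_size:{batch_size} query_len:{query_len}")
        time.sleep(0.05)  # placeholder for the real graph-capture work
        pbar.set_postfix_str(f"{idx + 1}/{len(buckets)}")
        pbar.update(1)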