
Commit 3163498

UX fix: hide warmup logs (#539)
If VLLM_ENABLE_EXPERIMENTAL_FLAGS is set to 0 or not set, warmup logging stays hidden and only a progress bar is shown. Enabling this flag brings back the old logs. Additionally, the VLLM_USE_V1 flag is removed, and user flags are no longer treated as experimental.

Signed-off-by: Agata Dobrzyniewicz <adobrzyniewicz@habana.ai>
Co-authored-by: Michał Kuligowski <michal.kuligowski@intel.com>
1 parent e3510f4 commit 3163498

File tree

4 files changed (+52, -36 lines changed)

docs/configuration/env_vars.md (3 additions, 1 deletion)

@@ -23,7 +23,9 @@ This document lists the supported diagnostic and profiling, as well as performan
 | `VLLM_EXPONENTIAL_BUCKETING` | Enables exponential bucket spacing instead of linear spacing. | `true` |
 | `VLLM_BUCKETING_FROM_FILE` | Enables reading bucket configuration from file | `None` |
 
-## Experimental Parameters
+## Developer Mode Parameters
+
+To enter developer mode use `VLLM_DEVELOPER_MODE`:
 
 | Parameter name | Description | Default value |
 | ------------------ | ------------------------ | ------------- |
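As a quick usage note (not part of the diff): developer mode is toggled by the `VLLM_DEVELOPER_MODE` environment variable. Below is a minimal sketch of enabling it from Python before vLLM initializes; the accepted truthy values are an assumption based on the boolean parser in the diff.

import os

# Sketch only: turn on developer mode so the verbose per-bucket warmup logs
# are printed again instead of only the progress bar.
os.environ["VLLM_DEVELOPER_MODE"] = "1"  # "1"/"true" assumed to be accepted

# ... then create the vLLM engine / start the server as usual.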

vllm_gaudi/extension/features.py (1 addition, 1 deletion)

@@ -12,7 +12,7 @@
 
 def get_user_flags():
     flags = [
-        Env('VLLM_ENABLE_EXPERIMENTAL_FLAGS', boolean),
+        Env('VLLM_DEVELOPER_MODE', boolean),
         Env('VLLM_EXPONENTIAL_BUCKETING', boolean),
         Env('VLLM_PROMPT_BS_BUCKET_MIN', int),
         Env('VLLM_PROMPT_BS_BUCKET_STEP', int),
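For context, each `Env(name, converter)` entry above pairs an environment-variable name with a parser. The snippet below is a hypothetical stand-in that illustrates that pattern; the real `Env` and `boolean` helpers in vllm_gaudi/extension may differ.

import os
from dataclasses import dataclass
from typing import Any, Callable, Optional

def boolean(value: str) -> bool:
    # Hypothetical parser: treat common truthy strings as True.
    return value.strip().lower() in ("1", "true", "yes", "on")

@dataclass
class Env:
    # Hypothetical stand-in for the Env descriptor used by get_user_flags().
    name: str
    convert: Callable[[str], Any]

    def read(self) -> Optional[Any]:
        raw = os.environ.get(self.name)
        return None if raw is None else self.convert(raw)

# Example: Env('VLLM_DEVELOPER_MODE', boolean).read() -> True, False, or None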

vllm_gaudi/extension/runtime.py (3 additions, 2 deletions)

@@ -63,18 +63,19 @@ def finalize_config():
 
     user_flags = filter_defined(detected, USER_FLAGS)
     experimental_flags = filter_defined(detected, EXPERIMENTAL_FLAGS)
+    experimental_flags = {k: v for k, v in experimental_flags.items() if k not in user_flags}
     environment_values = filter_defined(detected, ENVIRONMENT_VALUES)
     feature_values = filter_defined(detected, FEATURE_VALUES)
 
-    if len(experimental_flags) > 0 and not detected.VLLM_ENABLE_EXPERIMENTAL_FLAGS:
+    if len(experimental_flags) > 0 and not detected.VLLM_DEVELOPER_MODE:
         asterisks = 48 * '*'
         header = f"{asterisks} Warning! {asterisks}"
         footer = '*' * len(header)
         logger().warning(header)
         logger().warning(
             f"Following environment variables are considered experimental: {', '.join(experimental_flags)}")
         logger().warning(
-            "From v0.12.0 release using those flags without VLLM_ENABLE_EXPERIMENTAL_FLAGS will trigger a fatal error.")
+            "From v0.12.0 release using those flags without VLLM_DEVELOPER_MODE will trigger a fatal error.")
         logger().warning(footer)
 
     dump('Environment', environment_values)
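To make the guard above concrete, here is a small self-contained illustration of the filtering and warning logic; the flag names and values are invented for the example and are not the actual vLLM flag sets.

user_flags = {"VLLM_EXPONENTIAL_BUCKETING": True}
detected_experimental = {
    "VLLM_EXPONENTIAL_BUCKETING": True,  # also a user flag, so it is filtered out
    "VLLM_SOME_INTERNAL_KNOB": 4,        # hypothetical experimental-only flag
}
developer_mode = False  # stands in for detected.VLLM_DEVELOPER_MODE

experimental_flags = {k: v for k, v in detected_experimental.items() if k not in user_flags}

if len(experimental_flags) > 0 and not developer_mode:
    print("Warning! Following environment variables are considered experimental: "
          + ", ".join(experimental_flags))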

vllm_gaudi/v1/worker/hpu_model_runner.py (45 additions, 32 deletions)

@@ -8,6 +8,7 @@
 import os
 import sys
 import time
+from tqdm import tqdm
 from dataclasses import dataclass, field, fields
 from typing import (TYPE_CHECKING, Any, Callable, Optional, TypeAlias, Union, cast)
 
@@ -3615,7 +3616,7 @@ def log_warmup(self, phase, i, max_i, first_dim, second_dim, third_dim, causal=F
               f"query_len:{second_dim} "
               f"num_blocks:{third_dim} "
               f"free_mem:{free_mem}")
-        logger.info(msg)
+        tqdm.write(msg)
 
     def log_warmup_multimodal(self, phase, i, max_i, batch_size, seq_len, img_args):
         free_mem = format_bytes(HabanaMemoryProfiler.current_free_device_memory())
@@ -3765,45 +3766,57 @@ def warmup_graphs(self, buckets, is_prompt, kv_caches, starting_mem=0, total_bat
         idx = 0
         num_candidates = len(buckets)
         captured_all = True
-        for idx, (batch_size, seq_len, num_blocks) in enumerate(reversed(buckets)):
-            if seq_len > self.max_num_tokens:
-                continue
-            # Graph memory usage is proportional to seq dimension in a batch
-            phase = f"Graph/{'prompt' if is_prompt else 'decode'}"
-            if is_prompt:
-                batch_seq = batch_size * seq_len * num_blocks if num_blocks else batch_size * seq_len
-            else:
-                batch_seq = batch_size
-
-            graphed_bucket = (batch_size, seq_len, num_blocks, is_prompt)
-            if graphed_bucket in self.graphed_buckets:
-                continue
-            self.graphed_buckets.add(graphed_bucket)
-            self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks)
-            prompt_cfg, decode_cfg = None, None
-            with HabanaMemoryProfiler() as mem_prof:
+        developer_settings = get_config().VLLM_DEVELOPER_MODE
+        phase = 'Prompt' if is_prompt else 'Decode'
+        desc = f'{phase} warmup processing: '
+        with tqdm(total=num_candidates, desc=desc, unit="item") as pbar:
+            for idx, (batch_size, seq_len, num_blocks) in enumerate(reversed(buckets)):
+                if seq_len > self.max_num_tokens:
+                    continue
+                # Graph memory usage is proportional to seq dimension in a batch
                 if is_prompt:
-                    prompt_cfg = (batch_size, seq_len, num_blocks)
+                    batch_seq = batch_size * seq_len * num_blocks if num_blocks else batch_size * seq_len
                 else:
-                    decode_cfg = (batch_size, 1, num_blocks)
-                self._prepare_dummy_scenario(prompt_cfg, decode_cfg)
-            # TODO(kzawora): align_workers
-            used_mem = mem_prof.consumed_device_memory
-            total_mem += used_mem
-            total_batch_seq += batch_seq
+                    batch_seq = batch_size
+
+                graphed_bucket = (batch_size, seq_len, num_blocks, is_prompt)
+                if graphed_bucket in self.graphed_buckets:
+                    continue
+                self.graphed_buckets.add(graphed_bucket)
+                if developer_settings:
+                    self.log_warmup(phase, idx, num_candidates, batch_size, seq_len, num_blocks)
+                prompt_cfg, decode_cfg = None, None
+                with HabanaMemoryProfiler() as mem_prof:
+                    if is_prompt:
+                        prompt_cfg = (batch_size, seq_len, num_blocks)
+                    else:
+                        decode_cfg = (batch_size, 1, num_blocks)
+                    self._prepare_dummy_scenario(prompt_cfg, decode_cfg)
+                # TODO(kzawora): align_workers
+                used_mem = mem_prof.consumed_device_memory
+                total_mem += used_mem
+                total_batch_seq += batch_seq
+
+                pbar.set_postfix_str(f"{idx}/{num_candidates}")
+                pbar.update(1)
 
         return total_mem, total_batch_seq, captured_all
 
     def warmup_unified_graphs(self, buckets, kv_cache):
         idx = 0
         num_candidates = len(buckets)
-        for idx, (query, shared_ctx, unique_ctx, is_causal) in enumerate(reversed(buckets)):
-            unified_cfg = (query, shared_ctx, unique_ctx, is_causal)
-            if unified_cfg in self.graphed_buckets:
-                continue
-            self.graphed_buckets.add(unified_cfg)
-            self.log_warmup("Unified CFG", idx, num_candidates, query, shared_ctx, unique_ctx, is_causal)
-            self._prepare_dummy_unified_scenario(unified_cfg)
+        developer_settings = get_config().VLLM_DEVELOPER_MODE
+        with tqdm(total=num_candidates, desc="Unified Attention warmup", unit="item") as pbar:
+            for idx, (query, shared_ctx, unique_ctx, is_causal) in enumerate(reversed(buckets)):
+                unified_cfg = (query, shared_ctx, unique_ctx, is_causal)
+                if unified_cfg in self.graphed_buckets:
+                    continue
+                self.graphed_buckets.add(unified_cfg)
+                if developer_settings:
+                    self.log_warmup("Unified CFG", idx, num_candidates, query, shared_ctx, unique_ctx, is_causal)
+                self._prepare_dummy_unified_scenario(unified_cfg)
+                pbar.set_postfix_str(f"{idx}/{num_candidates}")
+                pbar.update(1)
 
     def _add_dummy_request(self,
                            requests,
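The warmup loops now drive a tqdm progress bar and only print per-bucket details when developer mode is enabled, using tqdm.write so log lines do not corrupt the bar. Below is a minimal standalone sketch of that pattern; the loop body is a placeholder, not the actual warmup work.

import time
from tqdm import tqdm

items = range(20)
developer_mode = True  # stand-in for get_config().VLLM_DEVELOPER_MODE

with tqdm(total=len(items), desc="Prompt warmup processing: ", unit="item") as pbar:
    for i in items:
        time.sleep(0.05)  # placeholder for graph capture / dummy scenario work
        if developer_mode:
            tqdm.write(f"Warming up bucket {i + 1}/{len(items)}")  # prints without breaking the bar
        pbar.set_postfix_str(f"{i}/{len(items)}")
        pbar.update(1)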
