Commit a2c77c6

Port: add VLLM_DISABLE_MARK_SCALES_AS_CONST (#522)
Add VLLM_DISABLE_MARK_SCALES_AS_CONST to avoid excessive graph compilation. V0 code base: https://github.com/HabanaAI/vllm-fork/blob/habana_main/vllm/worker/hpu_model_runner.py#L1228-L1230

Signed-off-by: zhenzhao <zhenzhao@habana.ai>
Co-authored-by: Michał Kuligowski <michal.kuligowski@intel.com>
1 parent f7050a9 · commit a2c77c6

File tree

1 file changed: +3 −1 lines changed

vllm_gaudi/v1/worker/hpu_model_runner.py

Lines changed: 3 additions & 1 deletion
@@ -3429,6 +3429,7 @@ def load_model(self) -> None:
         with HabanaMemoryProfiler() as m_inc:
             from neural_compressor.torch.quantization import (FP8Config, convert, prepare)
             config = FP8Config.from_json_file(os.getenv("QUANT_CONFIG", ""))
+            disable_mark_scales_as_const = os.getenv("VLLM_DISABLE_MARK_SCALES_AS_CONST", "false") in ("1", "true")
             self._inc_preprocess()
             if config.measure:
                 self.model = prepare(self.model, config)
@@ -3437,7 +3438,8 @@ def load_model(self) -> None:
             else:
                 raise ValueError("Unknown quantization config mode,"
                                  "please validate quantization config file")
-            htcore.hpu_initialize(self.model, mark_only_scales_as_const=True)
+            if not disable_mark_scales_as_const:
+                htcore.hpu_initialize(self.model, mark_only_scales_as_const=True)
             self.inc_initialized_successfully = True
             self.model_memory_usage = m_inc.consumed_device_memory
             logger.info("Preparing model with INC took %.4f GB", self.model_memory_usage / float(2**30))
