Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 21 additions & 17 deletions vllm/v1/worker/gpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2791,31 +2791,35 @@
rank_mapping,
)

self.model = self._compile_model(self.model)
if hasattr(self, "drafter"):
self.drafter.model = self._compile_model(self.drafter.model)

Check failure on line 2796 in vllm/v1/worker/gpu_model_runner.py

View workflow job for this annotation

GitHub Actions / pre-commit

"NgramProposer" has no attribute "model" [attr-defined]

Check failure on line 2796 in vllm/v1/worker/gpu_model_runner.py

View workflow job for this annotation

GitHub Actions / pre-commit

"NgramProposer" has no attribute "model" [attr-defined]

Check failure on line 2796 in vllm/v1/worker/gpu_model_runner.py

View workflow job for this annotation

GitHub Actions / pre-commit

"NgramProposer" has no attribute "model" [attr-defined]

Check failure on line 2796 in vllm/v1/worker/gpu_model_runner.py

View workflow job for this annotation

GitHub Actions / pre-commit

"NgramProposer" has no attribute "model" [attr-defined]

def _compile_model(self, model: nn.Module) -> nn.Module:
    """Compile or wrap ``model`` according to the compilation config.

    Behavior (as established by the visible branches):
    - ``DYNAMO_AS_IS`` + dynamo support: compile the model in place with
      ``torch.compile`` (full-graph) using the configured backend and
      return it.
    - Otherwise, cudagraph behavior is delegated to vLLM's wrappers:
      * full cudagraphs without dual-batch-overlap -> ``CUDAGraphWrapper``
      * dual-batch-overlap enabled -> ``UBatchWrapper`` (FULL or NONE
        cudagraph mode depending on the config)
    - If no branch applies, the model is returned unchanged and the skip
      is logged.

    Args:
        model: the model (target model or drafter model) to compile/wrap.

    Returns:
        The compiled or wrapped model; callers must rebind their
        reference to the returned value.
    """
    if (
        self.compilation_config.level == \
        CompilationLevel.DYNAMO_AS_IS and supports_dynamo()
    ):
        backend = self.compilation_config.init_backend(self.vllm_config)
        compilation_counter.dynamo_as_is_count += 1
        model.compile(fullgraph=True, backend=backend)
        return model

    # For other compilation levels, cudagraph behavior is controlled by
    # CUDAGraphWrapper and CudagraphDispatcher of vllm.
    # NOTE(review): this path would also apply full cudagraphs to a draft
    # model passed in by the caller — confirm that is intended (drafters
    # have historically been forced to piecewise; see #23679).
    full = self.compilation_config.cudagraph_mode.has_full_cudagraphs()
    dual_batch_overlap = self.parallel_config.enable_dbo

    # Wrap the model with the full cudagraph wrapper if needed.
    if full and not dual_batch_overlap:
        return CUDAGraphWrapper(model, self.vllm_config,
                                CUDAGraphMode.FULL)
    elif dual_batch_overlap:
        mode = CUDAGraphMode.FULL if full else CUDAGraphMode.NONE
        return UBatchWrapper(model, self.vllm_config, mode, self.device)

    logger.info(
        "Incompatible compilation config - skipping model compilation")
    return model

def reload_weights(self) -> None:
assert getattr(self, "model", None) is not None, \
Expand Down
Loading