
Commit 9d1c474

[LoRA][1/N]Remove LoRA extra vocab (#28382)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
1 parent: 8c32c6e · commit: 9d1c474

65 files changed: +197 -754 lines
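
The same edit repeats across the files shown below (the commit touches 65 files in total): each model stops reserving extra embedding rows for LoRA-added tokens, so VocabParallelEmbedding, ParallelLMHead, and LogitsProcessor are sized by config.vocab_size alone. For reference, here is a minimal standalone sketch of the sizing arithmetic being deleted; the helper name and the numbers are illustrative, only the formula mirrors the removed lora_vocab block.

    def padded_vocab_size(
        vocab_size: int, lora_extra_vocab_size: int, max_loras: int | None
    ) -> int:
        """Pre-commit behaviour: reserve extra embedding rows for LoRA tokens."""
        lora_vocab = lora_extra_vocab_size * (max_loras or 1)
        return vocab_size + lora_vocab

    # e.g. a 32 000-token model with 256 extra slots per LoRA and up to 4 LoRAs:
    print(padded_vocab_size(32_000, 256, 4))  # 33024 rows in embed_tokens / lm_head
    # After this commit, both tables are simply config.vocab_size (32000) rows wide.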

vllm/model_executor/models/apertus.py

Lines changed: 5 additions & 25 deletions
@@ -49,7 +49,6 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE,
     ParallelLMHead,
     VocabParallelEmbedding,
 )
@@ -346,24 +345,18 @@ def __init__(
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config

         self.config = config
         self.quant_config = quant_config
-        lora_vocab = (
-            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
-            if lora_config
-            else 0
-        )
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
+
         if get_pp_group().is_first_rank or (
             config.tie_word_embeddings and get_pp_group().is_last_rank
         ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=quant_config,
             )
         else:
@@ -518,9 +511,7 @@ def __init__(
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
         self.config = config
-        self.lora_config = lora_config

         self.model = self._init_model(
             vllm_config=vllm_config,
@@ -529,20 +520,9 @@ def __init__(
         )

         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=(
-                    DEFAULT_VOCAB_PADDING_SIZE
-                    # We need bigger padding if using lora for kernel
-                    # compatibility
-                    if not lora_config
-                    else lora_config.lora_vocab_padding_size
-                ),
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
@@ -551,7 +531,7 @@ def __init__(

             logit_scale = getattr(config, "logit_scale", 1.0)
             self.logits_processor = LogitsProcessor(
-                self.unpadded_vocab_size, config.vocab_size, logit_scale
+                config.vocab_size, scale=logit_scale
             )
         else:
             self.lm_head = PPMissingLayer()
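
The apertus.py hunks above show the full per-model shape of the change: the decoder keeps a plain config.vocab_size, the embedding and LM head lose their org_num_embeddings / padding_size bookkeeping, and the logits processor is built from config.vocab_size (with scale= when the model defines a logit scale). Below is a toy, pure-PyTorch stand-in for the resulting __init__ pattern; nn.Embedding and nn.Linear stand in for vLLM's VocabParallelEmbedding and ParallelLMHead, and the class and config names are illustrative, not vLLM API.

    import torch
    from torch import nn

    class ToyConfig:
        vocab_size = 32_000
        hidden_size = 64
        logit_scale = 1.0

    class ToyCausalLM(nn.Module):
        """Toy mirror of the post-commit pattern: no LoRA vocab padding."""

        def __init__(self, config: ToyConfig) -> None:
            super().__init__()
            # Previously: self.vocab_size = config.vocab_size + lora_vocab
            self.vocab_size = config.vocab_size
            self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size)
            # Previously sized by unpadded_vocab_size with org_num_embeddings and
            # a LoRA-specific padding_size; now just config.vocab_size wide.
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
            self.logit_scale = getattr(config, "logit_scale", 1.0)

        def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
            hidden = self.embed_tokens(input_ids)            # stand-in "model"
            return self.lm_head(hidden) * self.logit_scale   # scaled logits

    model = ToyCausalLM(ToyConfig())
    logits = model(torch.tensor([[1, 2, 3]]))
    assert logits.shape[-1] == ToyConfig.vocab_size  # head width == vocab_size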

vllm/model_executor/models/arcee.py

Lines changed: 2 additions & 8 deletions
@@ -23,7 +23,6 @@
 from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE,
     ParallelLMHead,
     VocabParallelEmbedding,
 )
@@ -200,7 +199,6 @@ def __init__(
         self.quant_config = quant_config
         self.config = config
         self.vocab_size = config.vocab_size
-        self.org_vocab_size = config.vocab_size

         # Word embeddings (parallelized if using pipeline parallel)
         if get_pp_group().is_first_rank or (
@@ -209,7 +207,6 @@ def __init__(
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=quant_config,
             )
         else:
@@ -383,13 +380,10 @@ def __init__(self, *, vllm_config, prefix: str = "") -> None:
         if get_pp_group().is_last_rank:
             # Determine vocabulary size (including any LoRA extra tokens
             # for padded LM head)
-            self.unpadded_vocab_size = config.vocab_size

             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
                 quant_config=vllm_config.quant_config,
                 bias=getattr(config, "lm_head_bias", False),
                 prefix=f"{prefix}.lm_head",
@@ -399,7 +393,7 @@ def __init__(self, *, vllm_config, prefix: str = "") -> None:
                 self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
             logit_scale = getattr(config, "logit_scale", 1.0)
             self.logits_processor = LogitsProcessor(
-                self.unpadded_vocab_size, config.vocab_size, logit_scale
+                config.vocab_size, scale=logit_scale
             )
         else:
             # Placeholder for lm_head on non-last ranks

vllm/model_executor/models/arctic.py

Lines changed: 2 additions & 4 deletions
@@ -490,10 +490,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             self.lm_head.weight = self.model.embed_tokens.weight
         self.num_experts = config.num_local_experts
         self.num_experts_per_tok = config.num_experts_per_tok
-        self.unpadded_vocab_size = config.vocab_size
-        self.logits_processor = LogitsProcessor(
-            self.unpadded_vocab_size, config.vocab_size
-        )
+
+        self.logits_processor = LogitsProcessor(config.vocab_size)
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors
         )

vllm/model_executor/models/aria.py

Lines changed: 2 additions & 6 deletions
@@ -547,18 +547,14 @@ def __init__(
         self.pad_token_id = (
             self.config.pad_token_id if self.config.pad_token_id is not None else -1
         )
-        self.unpadded_vocab_size = config.text_config.vocab_size
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            self.vocab_size,
             config.text_config.hidden_size,
-            org_num_embeddings=self.language_model.org_vocab_size,
             quant_config=quant_config,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
         logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(
-            self.unpadded_vocab_size, self.vocab_size, logit_scale
-        )
+        self.logits_processor = LogitsProcessor(self.vocab_size, scale=logit_scale)

     def _parse_and_validate_image_input(
         self, **kwargs: object

vllm/model_executor/models/baichuan.py

Lines changed: 2 additions & 2 deletions
@@ -402,9 +402,9 @@ def __init__(
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
-        self.lora_config = lora_config
+
         self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config
         self.model = BaiChuanModel(

vllm/model_executor/models/bailing_moe.py

Lines changed: 0 additions & 2 deletions
@@ -581,10 +581,8 @@ def __init__(
         config = vllm_config.model_config.hf_config.get_text_config()
         vllm_config.model_config.hf_config = config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config

         self.config = config
-        self.lora_config = lora_config
         self.quant_config = quant_config
         self.max_position_embeddings = config.max_position_embeddings
         self.model = BailingMoeModel(

vllm/model_executor/models/bamba.py

Lines changed: 6 additions & 24 deletions
@@ -30,7 +30,6 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE,
     ParallelLMHead,
     VocabParallelEmbedding,
 )
@@ -284,21 +283,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         model_config = vllm_config.model_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config

         self.config = config
-        lora_vocab = (
-            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
-            if lora_config
-            else 0
-        )
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size

         self.embed_tokens = VocabParallelEmbedding(
             self.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
         )

         def get_layer(prefix: str):
@@ -478,7 +470,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         self.vllm_config = vllm_config
         self.model_config = vllm_config.model_config
-        lora_config = vllm_config.lora_config
+
         scheduler_config = vllm_config.scheduler_config
         self.quant_config = vllm_config.quant_config

@@ -488,24 +480,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.model = BambaModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
-            org_num_embeddings=config.vocab_size,
-            padding_size=DEFAULT_VOCAB_PADDING_SIZE
-            # We need bigger padding if using lora for kernel
-            # compatibility
-            if not lora_config
-            else lora_config.lora_vocab_padding_size,
             prefix=maybe_prefix(prefix, "lm_head"),
         )

-        self.logits_processor = LogitsProcessor(
-            self.unpadded_vocab_size, config.vocab_size
-        )
+        self.logits_processor = LogitsProcessor(config.vocab_size)

         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors

vllm/model_executor/models/chameleon.py

Lines changed: 3 additions & 5 deletions
@@ -963,19 +963,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.model = ChameleonModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
         )
-        self.unpadded_vocab_size = config.vocab_size
+
         self.lm_head = ParallelLMHead(
-            self.unpadded_vocab_size,
+            config.vocab_size,
             config.hidden_size,
             prefix=maybe_prefix(prefix, "lm_head"),
         )
         if config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight

         logit_scale = getattr(config, "logit_scale", 1.0)
-        self.logits_processor = LogitsProcessor(
-            self.unpadded_vocab_size, config.vocab_size, logit_scale
-        )
+        self.logits_processor = LogitsProcessor(config.vocab_size, scale=logit_scale)
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors
         )

vllm/model_executor/models/chatglm.py

Lines changed: 1 addition & 2 deletions
@@ -433,10 +433,9 @@ def __init__(
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = config
-        self.lora_config = lora_config
         self.multimodal_config = multimodal_config

         self.quant_config = quant_config

vllm/model_executor/models/commandr.py

Lines changed: 6 additions & 13 deletions
@@ -288,17 +288,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
         self.quant_config = quant_config

         self.config = config
-        lora_vocab = (
-            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
-            if lora_config
-            else 0
-        )
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
+
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size, config.hidden_size
         )
@@ -424,17 +419,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
+
         self.config = config
         # currently all existing command R models have `tie_word_embeddings`
         # enabled
         assert config.tie_word_embeddings
-        self.unpadded_vocab_size = config.vocab_size
-        if lora_config:
-            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+
         self.quant_config = quant_config
         self.logits_processor = LogitsProcessor(
-            self.unpadded_vocab_size, config.vocab_size, scale=config.logit_scale
+            config.vocab_size, scale=config.logit_scale
         )
         self.model = CohereModel(
             vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
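
commandr.py is the tied-embeddings variant of the same cleanup: the assertion on tie_word_embeddings stays, the unpadded_vocab_size bookkeeping goes away, and LogitsProcessor is built from config.vocab_size with the config-defined logit_scale. Continuing the toy pure-PyTorch stand-in from above (illustrative names, not vLLM API), the tied case looks roughly like this:

    import torch
    from torch import nn

    class TiedToyLM(nn.Module):
        """Toy mirror of the tied-embedding case (cf. the commandr.py hunks)."""

        def __init__(self, vocab_size: int = 1_000, hidden_size: int = 32,
                     logit_scale: float = 0.0625) -> None:
            super().__init__()
            self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
            self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
            # Tie the head to the embedding, as the chameleon/arctic hunks do with
            # `self.lm_head.weight = self.model.embed_tokens.weight`.
            self.lm_head.weight = self.embed_tokens.weight
            # Post-commit: the scale comes straight from config.logit_scale and
            # there is no separate unpadded_vocab_size argument.
            self.logit_scale = logit_scale

        def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
            return self.lm_head(self.embed_tokens(input_ids)) * self.logit_scale

    m = TiedToyLM()
    assert m.lm_head.weight.data_ptr() == m.embed_tokens.weight.data_ptr()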
