 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    DEFAULT_VOCAB_PADDING_SIZE,
     ParallelLMHead,
     VocabParallelEmbedding,
 )
@@ -346,24 +345,18 @@ def __init__(
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
 
         self.config = config
         self.quant_config = quant_config
-        lora_vocab = (
-            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
-            if lora_config
-            else 0
-        )
-        self.vocab_size = config.vocab_size + lora_vocab
-        self.org_vocab_size = config.vocab_size
+
+        self.vocab_size = config.vocab_size
+
         if get_pp_group().is_first_rank or (
             config.tie_word_embeddings and get_pp_group().is_last_rank
         ):
             self.embed_tokens = VocabParallelEmbedding(
                 self.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
                 quant_config=quant_config,
             )
         else:
@@ -518,9 +511,7 @@ def __init__(
         super().__init__()
         config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
         self.config = config
-        self.lora_config = lora_config
 
         self.model = self._init_model(
             vllm_config=vllm_config,
@@ -529,20 +520,9 @@ def __init__(
         )
 
         if get_pp_group().is_last_rank:
-            self.unpadded_vocab_size = config.vocab_size
-            if lora_config:
-                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
             self.lm_head = ParallelLMHead(
-                self.unpadded_vocab_size,
+                config.vocab_size,
                 config.hidden_size,
-                org_num_embeddings=config.vocab_size,
-                padding_size=(
-                    DEFAULT_VOCAB_PADDING_SIZE
-                    # We need bigger padding if using lora for kernel
-                    # compatibility
-                    if not lora_config
-                    else lora_config.lora_vocab_padding_size
-                ),
                 quant_config=quant_config,
                 prefix=maybe_prefix(prefix, "lm_head"),
             )
@@ -551,7 +531,7 @@ def __init__(
 
             logit_scale = getattr(config, "logit_scale", 1.0)
             self.logits_processor = LogitsProcessor(
-                self.unpadded_vocab_size, config.vocab_size, logit_scale
+                config.vocab_size, scale=logit_scale
             )
         else:
             self.lm_head = PPMissingLayer()