@@ -639,6 +639,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
639639 if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec" :
640640 # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
641641 res = "seed-coder"
642+ if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890" :
643+ # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
644+ res = "kimi-k2"
642645
643646 if res is None :
644647 logger .warning ("\n " )
@@ -3379,6 +3382,60 @@ class DeepseekV2Model(Model):
33793382 model_arch = gguf .MODEL_ARCH .DEEPSEEK2
33803383
def set_vocab(self):
    """Write the tokenizer vocabulary for this model to ``self.gguf_writer``.

    Checkpoints whose ``vocab_size`` hyperparameter equals 163840 are treated
    as Kimi-K2: the tokenizer is loaded through ``transformers``, the BPE
    merges list is rebuilt from the tokenizer's mergeable ranks, and the
    token list, token types and merges are written explicitly. Every other
    checkpoint is delegated to ``self._set_vocab_gpt2()``.

    Raises:
        KeyError: if ``self.hparams`` has no ``"vocab_size"`` entry.
    """
    # NOTE(review): Kimi-K2 is recognised purely by its vocabulary size; any
    # other checkpoint with the same size would take this path as well.
    kimi_k2_vocab_size = 163840

    vocab_size = self.hparams["vocab_size"]
    if vocab_size != kimi_k2_vocab_size:
        self._set_vocab_gpt2()
        return

    # Imported lazily so that only the Kimi-K2 path requires transformers.
    from transformers import AutoTokenizer

    # SECURITY: trust_remote_code=True lets transformers execute Python code
    # shipped inside the model directory. Only convert checkpoints you trust.
    tokenizer = AutoTokenizer.from_pretrained(
        self.dir_model, trust_remote_code=True
    )
    tokpre = self.get_vocab_base_pre(tokenizer)

    # Build merges list using the approach similar to HunYuanMoE.
    # presumably `_mergeable_ranks` is a tiktoken-style bytes -> rank mapping;
    # verify against the tokenizer implementation shipped with the model.
    mergeable_ranks = tokenizer.model._mergeable_ranks
    merges = []
    vocab = {}
    for token, rank in mergeable_ranks.items():
        vocab[QwenModel.token_bytes_to_string(token)] = rank
        if len(token) == 1:
            # A length-1 token cannot be the result of joining two pieces.
            continue
        # presumably re-splits `token` using only merges ranked below `rank`
        # -- TODO confirm against QwenModel.bpe.
        merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
        if len(merged) == 2:
            merges.append(
                " ".join(map(QwenModel.token_bytes_to_string, merged))
            )

    # Build token list.
    special_tokens = tokenizer.special_tokens
    # Built once: the loop below runs `vocab_size` times and previously
    # scanned `special_tokens.values()` linearly on every iteration.
    special_ids = set(special_tokens.values())
    # id -> token string; on a duplicate string the special-token id wins
    # because `special_tokens` is unpacked last.
    reverse_vocab = {
        id_: encoded_tok
        for encoded_tok, id_ in {**vocab, **special_tokens}.items()
    }

    tokens: list[str] = []
    toktypes: list[int] = []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            # Ids the tokenizer does not define get an UNUSED placeholder so
            # the token list always has exactly `vocab_size` entries.
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.UNUSED)
        elif i in special_ids:
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.NORMAL)

    self.gguf_writer.add_tokenizer_model("gpt2")
    self.gguf_writer.add_tokenizer_pre(tokpre)
    self.gguf_writer.add_token_list(tokens)
    self.gguf_writer.add_token_types(toktypes)
    self.gguf_writer.add_token_merges(merges)

    # Merges were written explicitly above, so SpecialVocab is told not to
    # load them itself.
    special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
    special_vocab.add_to_gguf(self.gguf_writer)
33833440
33843441 def set_gguf_parameters (self ):
0 commit comments