Commit 94e98c2
convert_hf_to_gguf for Kimi-K2-Instruct
Adapts mainline `PR14653` for the tokenizer while keeping proper MLA tensors. Tested with this workflow: DeepSeek's fp8_cast_bf16.py, run on triton-cpu, upcasts the fp8 safetensors to bf16 safetensors; this convert_hf_to_gguf.py then converts the result.
1 parent 45fae1a commit 94e98c2
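For context on the upcast step: the fp8 checkpoints are stored with per-block scales (as with DeepSeek-V3), and fp8_cast_bf16.py de-quantizes each weight with its stored inverse scale before this converter ever sees the model. Below is a minimal PyTorch sketch of that step, assuming DeepSeek's usual 128×128 block layout and a per-weight `weight_scale_inv` tensor; the real logic is fp8_cast_bf16.py's Triton kernel (hence triton-cpu when no GPU is available), so the tensor names and block size here are assumptions, not the script verbatim.

import torch

def dequant_fp8_block(weight_fp8: torch.Tensor,
                      scale_inv: torch.Tensor,
                      block: int = 128) -> torch.Tensor:
    """Upcast one block-scaled fp8 weight to bf16 (assumed layout)."""
    # Each (block x block) tile shares one stored inverse scale, so
    # broadcast the scale grid back over the tiles and multiply.
    w = weight_fp8.to(torch.float32)
    s = scale_inv.to(torch.float32)
    s = s.repeat_interleave(block, dim=0)[: w.shape[0]]
    s = s.repeat_interleave(block, dim=1)[:, : w.shape[1]]
    return (w * s).to(torch.bfloat16)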

2 files changed: +58 additions, −0 deletions

convert_hf_to_gguf.py

Lines changed: 57 additions & 0 deletions
@@ -639,6 +639,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
             # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
             res = "seed-coder"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
+            res = "kimi-k2"

         if res is None:
             logger.warning("\n")
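The chkhsh literals compared above fingerprint a tokenizer's pre-tokenization behavior: the script encodes a fixed probe text and hashes the resulting token ids, so two tokenizers match only if they split text identically. A minimal sketch of that scheme follows; the probe string is a stand-in (the real scripts use a specific built-in chktxt), so it will not reproduce the registered hashes verbatim.

from hashlib import sha256

from transformers import AutoTokenizer

def pretokenizer_fingerprint(model_dir: str, probe_text: str) -> str:
    # Hash the repr of the token-id list: any change in pre-tokenization
    # rules changes the ids, and therefore the fingerprint.
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ids = tokenizer.encode(probe_text)
    return sha256(str(ids).encode()).hexdigest()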
@@ -3379,6 +3382,60 @@ class DeepseekV2Model(Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

     def set_vocab(self):
+
+        if self.hparams["vocab_size"] == 163840:  # Kimi-K2 model
+            from transformers import AutoTokenizer
+
+            tokenizer = AutoTokenizer.from_pretrained(
+                self.dir_model, trust_remote_code=True
+            )
+            tokpre = self.get_vocab_base_pre(tokenizer)
+
+            # Build merges list using the approach similar to HunYuanMoE
+            merges = []
+            vocab = {}
+            mergeable_ranks = tokenizer.model._mergeable_ranks
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:
+                    merges.append(
+                        " ".join(map(QwenModel.token_bytes_to_string, merged))
+                    )
+
+            # Build token list
+            vocab_size = self.hparams["vocab_size"]
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {
+                id_: encoded_tok
+                for encoded_tok, id_ in {**vocab, **special_tokens}.items()
+            }
+            tokens: list[str] = []
+            toktypes: list[int] = []
+
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+            self.gguf_writer.add_token_merges(merges)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
             self._set_vocab_gpt2()

     def set_gguf_parameters(self):
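The merge-reconstruction loop above relies on QwenModel.bpe: for each multi-byte token it replays byte-pair merging over the token's bytes but refuses any merge at or above the token's own rank, so the two surviving parts are exactly the pair whose merge produced the token. A standalone sketch of that recovery trick follows; the helper mirrors how QwenModel.bpe is used here, and the toy ranks are invented for illustration.

def bpe(mergeable_ranks: dict[bytes, int], token: bytes,
        max_rank: int | None = None) -> list[bytes]:
    # Greedily apply the lowest-ranked merge available, skipping any merge
    # whose rank is >= max_rank; with max_rank = rank(token), the final
    # pair that would recreate `token` is left unmerged.
    parts = [bytes([b]) for b in token]
    while True:
        min_idx, min_rank = None, None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx, min_rank = i, rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts

# Toy vocabulary: single bytes first, then the merged tokens "ab" and "abc".
ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}
for token, rank in ranks.items():
    if len(token) == 1:
        continue
    merged = bpe(ranks, token, max_rank=rank)
    if len(merged) == 2:
        print(token, "<-", merged)  # b'ab' <- [b'a', b'b'];  b'abc' <- [b'ab', b'c']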

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
     {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890", },
 ]
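With the entry registered (including its pinned "chkhsh"), the update script is what regenerates branches like the one added to get_vocab_base_pre in the first hunk. A hedged sketch of re-running it from Python, assuming the script's usual single-argument invocation with a Hugging Face token; treat the exact CLI as an assumption and check the script's source.

import subprocess
import sys

def regenerate_pretokenizer_branches(hf_token: str) -> None:
    # Re-runs the update script, which fetches each listed tokenizer,
    # recomputes its fingerprint, and rewrites get_vocab_base_pre.
    subprocess.run(
        [sys.executable, "convert_hf_to_gguf_update.py", hf_token],
        check=True,
    )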
