Commit 76782e1

liyang committed

Refactor JinaCLIP vision mmproj mapping to use tensor_mapping table

1 parent 6d7cc17 commit 76782e1

File tree

3 files changed: +51 additions, -96 deletions

convert_hf_to_gguf.py
gguf-py/gguf/constants.py
gguf-py/gguf/tensor_mapping.py


convert_hf_to_gguf.py

Lines changed: 11 additions & 96 deletions

```diff
@@ -1531,7 +1531,9 @@ class MmprojModel(ModelBase):
     preprocessor_config: dict[str, Any]
     global_config: dict[str, Any]
 
-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+    # Prefer explicit "layers" (e.g. JinaCLIP),
+    # keep legacy keys for other models.
+    n_block_keys = ["layers", "n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
 
     has_vision_encoder: bool = True  # by default
     has_audio_encoder: bool = False
```
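The ordering matters because the converter's `find_hparam`-style lookup takes the first key present in the config, so putting "layers" in front lets JinaCLIP's config resolve its block count without disturbing models that use the legacy keys. A toy sketch of that first-match-wins lookup (the `pick_block_count` helper is hypothetical, not from this commit):

```python
# Hypothetical helper sketching the first-match-wins lookup over
# n_block_keys, in the spirit of the converter's find_hparam().
def pick_block_count(config: dict, keys: list[str]) -> int:
    for key in keys:
        if key in config:
            return config[key]
    raise KeyError(f"none of {keys} present in config")

n_block_keys = ["layers", "n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]

# A JinaCLIP-style vision config exposes "layers"; legacy configs still resolve.
print(pick_block_count({"layers": 24, "width": 1024}, n_block_keys))  # 24
print(pick_block_count({"num_hidden_layers": 32}, n_block_keys))      # 32
```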
```diff
@@ -6805,6 +6807,11 @@ def __init__(self, *args, **kwargs):
         with open(config_path, encoding="utf-8") as f:
             self.vision_config = json.load(f)
 
+    def get_vision_config(self) -> dict[str, Any] | None:
+        # For JinaCLIPVisionModel, the top-level AutoConfig dict is already
+        # the vision-only configuration.
+        return self.global_config
+
     def set_vocab(self):
         # Vision encoder doesn't need vocabulary
         pass
@@ -6862,73 +6869,10 @@ def set_gguf_parameters(self):
     def _strip_vm_prefix(self, name: str) -> str:
         return name[len('vision_model.'):] if name.startswith('vision_model.') else name
 
-    def _map_block_tensor(self, layer: int, rest: str, data_torch: Tensor, name: str) -> list[tuple[str, Tensor]] | None:
-        parts = rest.split('.')
-        # layer norms
-        if rest.startswith('norm1.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ln1.{suffix}', data_torch)]
-        if rest.startswith('norm2.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ln2.{suffix}', data_torch)]
-        if rest.startswith('attn.inner_attn_ln.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.attn_ln.{suffix}', data_torch)]
-
-        if rest == 'attn.q_bias':
-            return [(f'v.blk.{layer}.attn_q.bias', data_torch)]
-        if rest == 'attn.v_bias':
-            return [(f'v.blk.{layer}.attn_v.bias', data_torch)]
-
-        if rest.startswith('attn.q_proj.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.attn_q.{suffix}', data_torch)]
-        if rest.startswith('attn.k_proj.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.attn_k.{suffix}', data_torch)]
-        if rest.startswith('attn.v_proj.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.attn_v.{suffix}', data_torch)]
-        if rest.startswith('attn.proj.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.attn_out.{suffix}', data_torch)]
-
-        # MLP
-        if rest.startswith('mlp.w1.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_gate.{suffix}', data_torch)]
-        if rest.startswith('mlp.w2.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_up.{suffix}', data_torch)]
-        if rest.startswith('mlp.w3.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_down.{suffix}', data_torch)]
-        if rest.startswith('mlp.ffn_ln.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_norm.{suffix}', data_torch)]
-        if rest.startswith('mlp.fc1.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_up.{suffix}', data_torch)]
-        if rest.startswith('mlp.fc2.'):
-            suffix = parts[-1]
-            return [(f'v.blk.{layer}.ffn_down.{suffix}', data_torch)]
-        return None
-
     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
-        """Prefer base table-driven mapping; keep Jina-specific targets if already mapped; fallback to legacy mapper."""
-        # Already a GGUF target name (e.g., "v.*" or "mm.*"): return as-is
         if name.startswith('v.') or name.startswith('mm.'):
             return name
-        # Try the base mapping first
-        try:
-            return super().map_tensor_name(name, try_suffixes=try_suffixes)
-        except Exception:
-            # Fallback to legacy Jina-specific mapper for any remaining edge keys
-            if hasattr(self, "_map_jinaclip_tensor_name"):
-                mapped = self._map_jinaclip_tensor_name(name)  # type: ignore[attr-defined]
-                if mapped:
-                    return mapped
-            return name
+        return super().map_tensor_name(name, try_suffixes=try_suffixes)
 
     def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
         yielded_any = False
```
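With the legacy `_map_block_tensor` and the try/except fallback removed, every non-GGUF source name must resolve through the base class's table lookup, which is backed by gguf-py's `TensorNameMap`. A minimal sketch of that resolution path, assuming this tree's gguf-py is importable and that the vision tensors are registered under `gguf.MODEL_ARCH.MMPROJ` (the block count here is a placeholder):

```python
# Minimal sketch (assumptions noted above): resolve JinaCLIP source names
# through the same table-driven mapping map_tensor_name now relies on.
import gguf

name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, 24)  # 24 = placeholder block count

for src in ("blocks.3.attn.q_proj.weight", "blocks.3.mlp.w2.weight", "norm.bias"):
    print(src, "->", name_map.get_name(src, try_suffixes=(".weight", ".bias")))

# Expected, per the tensor_mapping.py entries added in this commit:
#   blocks.3.attn.q_proj.weight -> v.blk.3.attn_q.weight
#   blocks.3.mlp.w2.weight      -> v.blk.3.ffn_up.weight
#   norm.bias                   -> v.post_ln.bias
```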
```diff
@@ -6967,39 +6911,10 @@ def _should_be_f32(self, gguf_name: str) -> bool:
         return any(p in gguf_name for p in patterns)
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
-        src = name
-        if src.startswith('v.') or src.startswith('mm.'):
-            return [(src, data_torch)]
-
-        # Drop 'vision_model.' prefix if present
-        src_no_vm = self._strip_vm_prefix(src)
-
-        # Top-level direct mappings — use gguf constants directly for canonical names
-        if src_no_vm == 'cls_token':
-            base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_CLS]
-            return [(base, data_torch)]
-        if src_no_vm.startswith('patch_embed.proj.'):
-            suffix = src_no_vm.split('.')[-1]
-            base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
-            return [(f'{base}.{suffix}', data_torch)]
-        if src_no_vm == 'pos_embed':
+        # keep only pos_embed special case (no .weight suffix); all other tensors use table-driven mapping
+        if name == 'pos_embed':
             pos_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_POS] + '.weight'
             return [(pos_name, data_torch)]
-        if src_no_vm.startswith('norm.'):
-            suffix = src_no_vm.split('.')[-1]
-            base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_POST_NORM]
-            return [(f'{base}.{suffix}', data_torch)]
-
-        if src_no_vm.startswith('blocks.'):
-            parts = src_no_vm.split('.')
-            if len(parts) >= 3 and parts[1].isdigit():
-                layer = int(parts[1])
-                rest = '.'.join(parts[2:])
-                mapped = self._map_block_tensor(layer, rest, data_torch, name)
-                if mapped is not None:
-                    return mapped
 
         try:
             return [(self.map_tensor_name(name), data_torch)]
```
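`pos_embed` stays special because the checkpoint stores it as a bare tensor with no `.weight` suffix, so the suffix-trying table lookup has nothing to strip; the override instead appends `.weight` to the canonical position-embedding name. A small sketch of the same construction, assuming gguf-py's `TENSOR_NAMES` table as used in the hunk above:

```python
# Sketch of the remaining special case: the bare "pos_embed" tensor maps to
# the canonical position-embedding name with ".weight" appended by hand.
import gguf

pos_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_POS] + '.weight'
print(pos_name)  # expected: v.position_embd.weight
```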

gguf-py/gguf/constants.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -634,9 +634,13 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_ATTN_O = auto()
     V_ENC_ATTN_O_NORM = auto()
     V_ENC_POST_ATTN_NORM = auto()
+    V_ENC_ATTN_LN = auto()
     V_ENC_FFN_UP = auto()
     V_ENC_FFN_GATE = auto()
     V_ENC_FFN_DOWN = auto()
+    V_ENC_FFN_NORM = auto()
+    V_ENC_ATTN_Q_BIAS = auto()
+    V_ENC_ATTN_V_BIAS = auto()
     V_LAYER_SCALE_1 = auto()
     V_LAYER_SCALE_2 = auto()
     V_PRE_NORM = auto()
@@ -1002,9 +1006,13 @@ class MODEL_TENSOR(IntEnum):
    MODEL_TENSOR.V_ENC_ATTN_O: "v.blk.{bid}.attn_out",
    MODEL_TENSOR.V_ENC_ATTN_O_NORM: "v.blk.{bid}.attn_out_norm",
    MODEL_TENSOR.V_ENC_POST_ATTN_NORM: "v.blk.{bid}.ln2",
+   MODEL_TENSOR.V_ENC_ATTN_LN: "v.blk.{bid}.attn_ln",
    MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
    MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
    MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
+   MODEL_TENSOR.V_ENC_FFN_NORM: "v.blk.{bid}.ffn_norm",
+   MODEL_TENSOR.V_ENC_ATTN_Q_BIAS: "v.blk.{bid}.attn_q.bias",
+   MODEL_TENSOR.V_ENC_ATTN_V_BIAS: "v.blk.{bid}.attn_v.bias",
    MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
    MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
    MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
@@ -1080,9 +1088,13 @@ class MODEL_TENSOR(IntEnum):
        MODEL_TENSOR.V_ENC_ATTN_O,
        MODEL_TENSOR.V_ENC_ATTN_O_NORM,
        MODEL_TENSOR.V_ENC_POST_ATTN_NORM,
+       MODEL_TENSOR.V_ENC_ATTN_LN,
        MODEL_TENSOR.V_ENC_FFN_UP,
        MODEL_TENSOR.V_ENC_FFN_GATE,
        MODEL_TENSOR.V_ENC_FFN_DOWN,
+       MODEL_TENSOR.V_ENC_FFN_NORM,
+       MODEL_TENSOR.V_ENC_ATTN_Q_BIAS,
+       MODEL_TENSOR.V_ENC_ATTN_V_BIAS,
        MODEL_TENSOR.V_LAYER_SCALE_1,
        MODEL_TENSOR.V_LAYER_SCALE_2,
        MODEL_TENSOR.V_PRE_NORM,
```
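Each new enum member resolves to its canonical per-block name through the `TENSOR_NAMES` table in the middle hunk. A quick sanity-check sketch using only names visible in this diff:

```python
# Sanity-check sketch: format the canonical names added by this commit.
import gguf

for t in (gguf.MODEL_TENSOR.V_ENC_ATTN_LN,
          gguf.MODEL_TENSOR.V_ENC_FFN_NORM,
          gguf.MODEL_TENSOR.V_ENC_ATTN_Q_BIAS,
          gguf.MODEL_TENSOR.V_ENC_ATTN_V_BIAS):
    print(gguf.TENSOR_NAMES[t].format(bid=0))

# v.blk.0.attn_ln
# v.blk.0.ffn_norm
# v.blk.0.attn_q.bias
# v.blk.0.attn_v.bias
```

Note that the two bias entries bake `.bias` into the target name: JinaCLIP stores `q_bias` and `v_bias` as standalone tensors with no suffix of their own, so the suffix-trying lookup would otherwise have nothing to append.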

gguf-py/gguf/tensor_mapping.py

Lines changed: 28 additions & 0 deletions

```diff
@@ -1202,6 +1202,7 @@ class TensorNameMap:
            "model.vision_tower.embeddings.cls_token",  # Intern-S1
            "vision_model.class_embedding",  # llama 4
            "model.vision.patch_embedding.cls_embedding",  # cogvlm
+           "cls_token",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
@@ -1215,6 +1216,7 @@ class TensorNameMap:
            "visual.patch_embed.proj",  # qwen2vl
            "vision_tower.patch_embed.proj",  # kimi-vl
            "model.vision.patch_embedding.proj",  # cogvlm
+           "patch_embed.proj",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_ENC_EMBD_POS: (
@@ -1243,6 +1245,7 @@ class TensorNameMap:
            "vision_encoder.transformer.layers.{bid}.attention.wq",  # pixtral
            "visual.blocks.{bid}.attn.q",  # qwen2vl, generated
            "vision_tower.encoder.blocks.{bid}.wq",  # kimi-vl, generated
+           "blocks.{bid}.attn.q_proj",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1260,6 +1263,7 @@ class TensorNameMap:
            "vision_encoder.transformer.layers.{bid}.attention.wk",  # pixtral
            "visual.blocks.{bid}.attn.k",  # qwen2vl, generated
            "vision_tower.encoder.blocks.{bid}.wk",  # kimi-vl, generated
+           "blocks.{bid}.attn.k_proj",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1277,6 +1281,7 @@ class TensorNameMap:
            "vision_encoder.transformer.layers.{bid}.attention.wv",  # pixtral
            "visual.blocks.{bid}.attn.v",  # qwen2vl, generated
            "vision_tower.encoder.blocks.{bid}.wv",  # kimi-vl, generated
+           "blocks.{bid}.attn.v_proj",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1291,6 +1296,7 @@ class TensorNameMap:
            "visual.blocks.{bid}.norm1",  # qwen2vl
            "vision_tower.encoder.blocks.{bid}.norm0",  # kimi-vl (norm0/norm1)
            "model.vision.transformer.layers.{bid}.input_layernorm",  # cogvlm
+           "blocks.{bid}.norm1",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1306,6 +1312,7 @@ class TensorNameMap:
            "visual.blocks.{bid}.attn.proj",  # qwen2vl
            "vision_tower.encoder.blocks.{bid}.wo",  # kimi-vl
            "model.vision.transformer.layers.{bid}.attention.dense",  # cogvlm
+           "blocks.{bid}.attn.proj",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1320,6 +1327,11 @@ class TensorNameMap:
            "visual.blocks.{bid}.norm2",  # qwen2vl
            "vision_tower.encoder.blocks.{bid}.norm1",  # kimi-vl (norm0/norm1)
            "model.vision.transformer.layers.{bid}.post_attention_layernorm",  # cogvlm
+           "blocks.{bid}.norm2",  # JinaCLIP v2 vision
+       ),
+
+       MODEL_TENSOR.V_ENC_ATTN_LN: (
+           "blocks.{bid}.attn.inner_attn_ln",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1335,12 +1347,14 @@ class TensorNameMap:
            "visual.blocks.{bid}.mlp.linear_fc1",  # qwen3vl
            "vision_tower.encoder.blocks.{bid}.mlp.fc0",  # kimi-vl (fc0/fc1)
            "model.vision.transformer.layers.{bid}.mlp.fc1",  # cogvlm
+           "blocks.{bid}.mlp.w2",  # JinaCLIP v2 vision (up)
        ),
 
        MODEL_TENSOR.V_ENC_FFN_GATE: (
            "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj",  # pixtral-hf
            "vision_encoder.transformer.layers.{bid}.feed_forward.w1",  # pixtral
            "visual.blocks.{bid}.mlp.gate_proj",  # qwen2.5vl
+           "blocks.{bid}.mlp.w1",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_ENC_FFN_DOWN: (
@@ -1356,6 +1370,11 @@ class TensorNameMap:
            "visual.blocks.{bid}.mlp.linear_fc2",  # qwen3vl
            "vision_tower.encoder.blocks.{bid}.mlp.fc1",  # kimi-vl (fc0/fc1)
            "model.vision.transformer.layers.{bid}.mlp.fc2",  # cogvlm
+           "blocks.{bid}.mlp.w3",  # JinaCLIP v2 vision (down)
+       ),
+
+       MODEL_TENSOR.V_ENC_FFN_NORM: (
+           "blocks.{bid}.mlp.ffn_ln",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1368,6 +1387,14 @@ class TensorNameMap:
            "model.vision_tower.encoder.layer.{bid}.lambda_2",  # Intern-S1
        ),
 
+       MODEL_TENSOR.V_ENC_ATTN_Q_BIAS: (
+           "blocks.{bid}.attn.q_bias",  # JinaCLIP v2 vision
+       ),
+
+       MODEL_TENSOR.V_ENC_ATTN_V_BIAS: (
+           "blocks.{bid}.attn.v_bias",  # JinaCLIP v2 vision
+       ),
+
        MODEL_TENSOR.V_PRE_NORM: (
            "vision_tower.vision_model.pre_layrnorm",
            "vision_tower.ln_pre",  # pixtral-hf
@@ -1381,6 +1408,7 @@ class TensorNameMap:
            "vision_model.layernorm_post",  # llama4
            "visual.merger.ln_q",  # qwen2vl
            "vision_tower.encoder.final_layernorm",  # kimi-vl
+           "norm",  # JinaCLIP v2 vision
        ),
 
        MODEL_TENSOR.V_MM_INP_PROJ: (
```
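`TensorNameMap` expands each `{bid}` template once per block at construction time, so conversion-time lookups are plain dict hits. A rough sketch of that expansion restricted to two of the JinaCLIP rows added here (illustrative only; the real constructor also merges per-arch tables and handles `.weight`/`.bias` suffixes on lookup):

```python
# Illustrative expansion of {bid} templates into a flat lookup table,
# mirroring what TensorNameMap builds from the rows added in this file.
block_rows = {
    "v.blk.{bid}.attn_ln":  ("blocks.{bid}.attn.inner_attn_ln",),
    "v.blk.{bid}.ffn_norm": ("blocks.{bid}.mlp.ffn_ln",),
}

n_blocks = 24  # placeholder; the converter reads this from the "layers" config key
mapping: dict[str, str] = {}
for target, sources in block_rows.items():
    for bid in range(n_blocks):
        for src in sources:
            mapping[src.format(bid=bid)] = target.format(bid=bid)

assert mapping["blocks.7.mlp.ffn_ln"] == "v.blk.7.ffn_norm"
```

Worth noting for the MLP rows: JinaCLIP's w1/w2/w3 map to ffn_gate/ffn_up/ffn_down respectively, as the "(up)" and "(down)" comments in the hunks above call out.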
