From 1fffbd331dd4c5b00170cb11cac4b4a7402017f8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 9 Nov 2025 19:51:12 +0100 Subject: [PATCH 1/2] mtmd: fix patch_size initialized to random value in audio models --- tools/mtmd/clip.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index d1423b67f9865..42f04318d1629 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2683,6 +2683,9 @@ struct clip_model_loader { } } else if (is_audio) { get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins); + // some hparams are unused, but still need to set to avoid issues + hparams.image_size = 0; + hparams.patch_size = 1; } else { GGML_ASSERT(false && "unknown modality"); From 4ac2c2093afb669b6050fa8e2213f0938c94752b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 9 Nov 2025 22:03:53 +0100 Subject: [PATCH 2/2] add default hparams --- tools/mtmd/clip.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 42f04318d1629..1d78f5954ed66 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -160,13 +160,13 @@ enum patch_merge_type { }; struct clip_hparams { - int32_t image_size; - int32_t patch_size; - int32_t n_embd; - int32_t n_ff; - int32_t projection_dim; - int32_t n_head; - int32_t n_layer; + int32_t image_size = 0; + int32_t patch_size = 0; + int32_t n_embd = 0; + int32_t n_ff = 0; + int32_t projection_dim = 0; + int32_t n_head = 0; + int32_t n_layer = 0; // idefics3 int32_t image_longest_edge = 0; int32_t image_min_pixels = -1;