Skip to content

Commit 47d098e

Browse files
committed
Update Llava15ChatHandler to accept use_gpu, image_min_tokens, and image_max_tokens.
The `image_min_tokens` parameter can now be passed to Qwen3VLChatHandler to support bbox grounding tasks. Added validation to ensure max tokens are not less than min tokens.
1 parent a32aa9e commit 47d098e

File tree

1 file changed

+17
-3
lines changed

1 file changed

+17
-3
lines changed

llama_cpp/llama_chat_format.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2792,10 +2792,13 @@ class Llava15ChatHandler:
27922792
"{% endif %}"
27932793
)
27942794

2795-
def __init__(self, clip_model_path: str, verbose: bool = True):
2795+
def __init__(self, clip_model_path: str, verbose: bool = True, use_gpu: bool = True, image_min_tokens: int = -1, image_max_tokens: int = -1):
27962796
import llama_cpp.mtmd_cpp as mtmd_cpp
27972797

27982798
self.clip_model_path = clip_model_path
2799+
self.image_min_tokens = image_min_tokens
2800+
self.image_max_tokens = image_max_tokens
2801+
self.use_gpu = use_gpu
27992802
self.verbose = verbose
28002803
self._mtmd_cpp = mtmd_cpp
28012804
self._exit_stack = ExitStack()
@@ -2815,10 +2818,16 @@ def _init_mtmd_context(self, llama_model: llama.Llama):
28152818

28162819
# Get default parameters
28172820
mctx_params = self._mtmd_cpp.mtmd_context_params_default()
2818-
mctx_params.use_gpu = True # TODO: Make this configurable
2821+
mctx_params.use_gpu = self.use_gpu
28192822
mctx_params.print_timings = self.verbose
28202823
mctx_params.n_threads = llama_model.n_threads
28212824
mctx_params.flash_attn_type = self._mtmd_cpp.clip_flash_attn_type.CLIP_FLASH_ATTN_TYPE_AUTO
2825+
if self.image_min_tokens > 0:
2826+
mctx_params.image_min_tokens = self.image_min_tokens
2827+
if self.image_max_tokens > 0:
2828+
mctx_params.image_max_tokens = self.image_max_tokens
2829+
if (self.image_max_tokens < self.image_min_tokens) and self.image_max_tokens > 0:
2830+
raise ValueError(f"image_max_pixels {self.image_max_tokens} is less than image_min_pixels {self.image_min_tokens}")
28222831

28232832
# Initialize mtmd context
28242833
self.mtmd_ctx = self._mtmd_cpp.mtmd_init_from_file(
@@ -3791,6 +3800,7 @@ def __init__(
37913800
self,
37923801
force_reasoning: bool = False,
37933802
add_vision_id: bool = True,
3803+
image_min_tokens: int = -1,
37943804
**kwargs,
37953805
):
37963806
"""
@@ -3801,11 +3811,15 @@ def __init__(
38013811
- add_vision_id (bool):
38023812
- True (default): Count all the images. Recommended for multi-image.
38033813
- False: Doesn't count the images. Can save tokens with single-image.
3814+
- image_min_tokens (int):
3815+
It only takes effect when the value is greater than zero. the default value is -1 (i.e., using the default parameters in the model's preprocessor_config.json).
3816+
Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
38043817
"""
38053818
self.force_reasoning = force_reasoning
38063819
self.add_vision_id = add_vision_id
3820+
self.image_min_tokens = image_min_tokens
38073821

3808-
super().__init__(**kwargs)
3822+
super().__init__(image_min_tokens=self.image_min_tokens, **kwargs)
38093823

38103824
def __call__(self, **kwargs):
38113825
self.extra_template_arguments["force_reasoning"] = self.force_reasoning

0 commit comments

Comments
 (0)