
Commit 3cc61a5

[WWB] Add eagle3 pipeline (#2812)
## Description

Add eagle3 pipeline support (speculative decoding with a draft model).

Ticket: CVS-170888

## Checklist:
- [ ] Tests have been updated or added to cover the new code
- [ ] This patch fully addresses the ticket.
- [ ] I have made corresponding changes to the documentation

Co-authored-by: Chen Peter <peter.chen@intel.com>
1 parent 5979e0d commit 3cc61a5

File tree

3 files changed: +80 additions, -6 deletions

tools/who_what_benchmark/whowhatbench/model_loaders.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -1,3 +1,4 @@
+from pathlib import Path
 import logging
 import json
 import torch
@@ -99,6 +100,17 @@ def load_text_genai_pipeline(model_dir, device="CPU", ov_config=None, **kwargs):
             ov_adapter = openvino_genai.Adapter(adapter)
             adapter_config.add(ov_adapter, alpha)
 
+    draft_model_path = kwargs.get("draft_model", '')
+    if draft_model_path:
+        if not Path(draft_model_path).exists():
+            raise RuntimeError(f"Error: Draft model path does not exist: {draft_model_path}")
+        draft_device = kwargs.get("draft_device", None) or device
+        draft_model_load_kwargs = (
+            {"scheduler_config": get_scheduler_config_genai(kwargs["draft_cb_config"])}
+            if kwargs["draft_cb_config"] is not None else {}
+        )
+        ov_config["draft_model"] = openvino_genai.draft_model(draft_model_path, draft_device.upper(), **draft_model_load_kwargs)
+
     is_continuous_batching = kwargs.get("cb_config", None) is not None
 
     if is_continuous_batching:
```
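
For context, the `draft_model` entry added to `ov_config` here is forwarded to the `LLMPipeline` constructor later in the loader, which is how openvino_genai switches a pipeline into speculative decoding. A minimal standalone sketch of the same wiring (model paths are placeholders, not from this commit):

```python
import openvino_genai

main_dir = "./llama-3-8b-ov"             # placeholder: target model IR folder
draft_dir = "./llama-3-eagle3-draft-ov"  # placeholder: draft model IR folder

# Supplying a draft_model property switches the pipeline to speculative decoding;
# openvino_genai.draft_model() wraps the draft IR for that purpose.
pipe = openvino_genai.LLMPipeline(
    main_dir,
    "CPU",
    draft_model=openvino_genai.draft_model(draft_dir, "CPU"),
)

# num_assistant_tokens sets how many tokens the draft model proposes per step.
print(pipe.generate("What is OpenVINO?", max_new_tokens=64, num_assistant_tokens=5))
```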

tools/who_what_benchmark/whowhatbench/text_evaluator.py

Lines changed: 10 additions & 3 deletions
```diff
@@ -36,7 +36,9 @@ def __init__(
         seqs_per_request=None,
         use_chat_template=None,
         long_prompt=False,
-        empty_adapters=False
+        empty_adapters=False,
+        num_assistant_tokens=0,
+        assistant_confidence_threshold=0.0
     ) -> None:
         assert (
             base_model is not None or gt_data is not None
@@ -53,6 +55,8 @@ def __init__(
         self.seqs_per_request = seqs_per_request
         self.generation_fn = gen_answer_fn
         self.use_chat_template = use_chat_template
+        self.num_assistant_tokens = num_assistant_tokens
+        self.assistant_confidence_threshold = assistant_confidence_threshold
         if self.generation_config is not None:
             assert self.seqs_per_request is not None
         self.empty_adapters = empty_adapters
@@ -135,7 +139,8 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
         return res
 
     def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
-        def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question, use_chat_template=False, empty_adapters=False):
+        def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question, use_chat_template=False, empty_adapters=False,
+                               num_assistant_tokens=0, assistant_confidence_threshold=0.0):
             is_awq = getattr(model, "is_awq", None) is not None
             device = "cpu"
             if hasattr(model, "device"):
@@ -196,7 +201,9 @@ def default_gen_answer(model, tokenizer, prompt, max_new_tokens, crop_question,
                     self.max_new_tokens,
                     self._crop_question,
                     self.use_chat_template,
-                    empty_adapters=self.empty_adapters
+                    self.empty_adapters,
+                    self.num_assistant_tokens,
+                    self.assistant_confidence_threshold
                 )
             )
         else:
```
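
Note the call-site change in the final hunk: `empty_adapters` moves from a keyword to a positional argument so the two new values can be appended in order, which means any custom `gen_answer_fn` handed to the evaluator must now accept the extended signature. A minimal sketch of a conforming callback (the stub body is illustrative, not from this commit):

```python
# Stub generation callback matching the extended signature the evaluator now
# calls; a real callback would run the model, as genai_gen_text in wwb.py does.
def gen_answer_fn(model, tokenizer, prompt, max_new_tokens, crop_question,
                  use_chat_template=False, empty_adapters=False,
                  num_assistant_tokens=0, assistant_confidence_threshold=0.0):
    # Both speculative knobs default to "off", so callbacks that ignore them
    # keep their previous behavior.
    return f"answer to: {prompt}"
```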

tools/who_what_benchmark/whowhatbench/wwb.py

Lines changed: 58 additions & 3 deletions
```diff
@@ -232,6 +232,35 @@ def parse_args():
         "If the base/target model is a local path, gguf-file should be just the filename (e.g., 'model.gguf'). "
         "If the base/target model is a HuggingFace model ID, gguf-file should be a relative path.",
     )
+    parser.add_argument(
+        "--draft-model",
+        default=None,
+        help="Path to draft model folder including IR files for Speculative decoding generation.",
+    )
+    parser.add_argument(
+        "--draft-device",
+        type=str,
+        default=None,
+        help="Inference device for Speculative decoding of draft model, e.g. 'CPU', 'GPU'.",
+    )
+    parser.add_argument(
+        "--draft-cb-config",
+        type=str,
+        default=None,
+        help="Path to file with Continuous Batching Scheduler settings or dict for Speculative decoding of draft model",
+    )
+    parser.add_argument(
+        "--num-assistant-tokens",
+        type=int,
+        default=None,
+        help="Config option num_assistant_tokens for Speculative decoding and Prompt Lookup decoding.",
+    )
+    parser.add_argument(
+        "--assistant-confidence-threshold",
+        type=float,
+        default=None,
+        help="Config option assistant_confidence_threshold for Speculative decoding.",
+    )
 
     return parser.parse_args()
```
```diff
@@ -387,16 +416,26 @@ def diff_strings(a: str, b: str, *, use_loguru_colors: bool = False) -> str:
     return "".join(output)
 
 
-def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False, empty_adapters=False):
+def genai_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False, empty_adapters=False,
+                   num_assistant_tokens=0, assistant_confidence_threshold=0.0):
     kwargs = {}
     if empty_adapters:
         import openvino_genai
         kwargs["adapters"] = openvino_genai.AdapterConfig()
 
-    return model.generate(question, do_sample=False, max_new_tokens=max_new_tokens, apply_chat_template=use_chat_template, **kwargs)
+    return model.generate(
+        question,
+        do_sample=False,
+        max_new_tokens=max_new_tokens,
+        apply_chat_template=use_chat_template,
+        num_assistant_tokens=num_assistant_tokens,
+        assistant_confidence_threshold=assistant_confidence_threshold,
+        **kwargs,
+    )
 
 
-def llamacpp_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False):
+def llamacpp_gen_text(model, tokenizer, question, max_new_tokens, skip_question, use_chat_template=False, num_assistant_tokens=0,
+                      assistant_confidence_threshold=0.0):
     if use_chat_template:
         output = model.create_chat_completion(messages=[{"role": "user", "content": question}], max_tokens=max_new_tokens, temperature=0.0)
         text = output["choices"][0]["message"]["content"]
```
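
Forwarding zeros through `generate` is safe because they match the `GenerationConfig` defaults, so non-speculative pipelines behave exactly as before. A quick check (attribute names are the documented openvino_genai ones; printed values assume current genai defaults):

```python
import openvino_genai

cfg = openvino_genai.GenerationConfig()
# Both speculative knobs start "off", so passing 0 / 0.0 is a no-op.
print(cfg.num_assistant_tokens)            # expected: 0
print(cfg.assistant_confidence_threshold)  # expected: 0.0

# Setting one of them selects the speculation mode:
cfg.num_assistant_tokens = 5                # static: fixed-length draft proposals
# cfg.assistant_confidence_threshold = 0.4  # dynamic: confidence-driven length
```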
```diff
@@ -523,6 +562,14 @@ def create_evaluator(base_model, args):
             gen_answer_fn=gen_answer_fn,
             use_chat_template=use_chat_template,
             long_prompt=args.long_prompt,
+            num_assistant_tokens=(
+                int(args.num_assistant_tokens)
+                if args.num_assistant_tokens is not None else 0
+            ),
+            assistant_confidence_threshold=(
+                float(args.assistant_confidence_threshold)
+                if args.assistant_confidence_threshold is not None else 0.0
+            ),
         )
     elif task == "text-to-image":
         return EvaluatorCLS(
```
```diff
@@ -725,6 +772,14 @@ def main():
         kwargs["embeds_normalize"] = args.embeds_normalize
         kwargs["embeds_padding_side"] = args.embeds_padding_side
 
+    if args.draft_model is not None:
+        kwargs["draft_model"] = args.draft_model
+        kwargs["draft_device"] = args.draft_device
+        draft_cb_config = None
+        if args.draft_cb_config is not None:
+            draft_cb_config = read_cb_config(args.draft_cb_config)
+        kwargs["draft_cb_config"] = draft_cb_config
+
     if args.gt_data and os.path.exists(args.gt_data):
         evaluator = create_evaluator(None, args)
     else:
```
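
`--draft-cb-config` is parsed by the same `read_cb_config` helper as `--cb-config`, and `model_loaders.py` hands the resulting dict to `get_scheduler_config_genai`. A minimal sketch of such a settings file, assuming the keys mirror `openvino_genai.SchedulerConfig` attributes (`cache_size` is one such field):

```python
import json

# Hypothetical minimal scheduler settings for the draft model; key names are
# assumed to match openvino_genai.SchedulerConfig attributes.
with open("draft_scheduler.json", "w") as f:
    json.dump({"cache_size": 1}, f)

# Then, for example:
#   wwb ... --draft-model ./draft-ov --draft-cb-config draft_scheduler.json
```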
