45 changes: 40 additions & 5 deletions tools/who_what_benchmark/tests/test_cli_embeddings.py
@@ -1,14 +1,21 @@
import subprocess # nosec B404
import sys
import pytest
import shutil
import logging
from test_cli_image import run_wwb
from pathlib import Path
from test_cli_image import run_wwb, get_similarity


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def remove_artifacts(artifacts_path: Path, file_type="outputs"):
logger.info(f"Remove {file_type}")
shutil.rmtree(artifacts_path)


@pytest.mark.parametrize(
("model_id", "model_type"),
[
@@ -21,6 +28,7 @@
def test_embeddings_basic(model_id, model_type, tmp_path):
GT_FILE = tmp_path / "gt.csv"
MODEL_PATH = tmp_path / model_id.replace("/", "_")
SIMILARITY_THRESHOLD = 0.99

result = subprocess.run(["optimum-cli", "export",
"openvino", "-m", model_id,
@@ -47,8 +55,9 @@ def test_embeddings_basic(model_id, model_type, tmp_path):
"--hf",
])

outputs_path = tmp_path / "optimum"
# test Optimum
run_wwb([
outputs = run_wwb([
"--target-model",
MODEL_PATH,
"--num-samples",
@@ -59,10 +68,24 @@
"CPU",
"--model-type",
model_type,
"--output",
outputs_path,
])

assert (outputs_path / "target").exists()
assert (outputs_path / "target.csv").exists()
assert (outputs_path / "metrics_per_question.csv").exists()
assert (outputs_path / "metrics.csv").exists()
assert "Metrics for model" in outputs

similarity = get_similarity(outputs)
assert similarity >= SIMILARITY_THRESHOLD

remove_artifacts(outputs_path)

outputs_path = tmp_path / "genai"
# test GenAI
run_wwb([
outputs = run_wwb([
"--target-model",
MODEL_PATH,
"--num-samples",
@@ -75,13 +98,22 @@ def test_embeddings_basic(model_id, model_type, tmp_path):
model_type,
"--genai",
"--output",
tmp_path,
outputs_path,
])

assert (outputs_path / "target").exists()
assert (outputs_path / "target.csv").exists()
assert (outputs_path / "metrics_per_question.csv").exists()
assert (outputs_path / "metrics.csv").exists()
assert "Metrics for model" in outputs

similarity = get_similarity(outputs)
assert similarity >= SIMILARITY_THRESHOLD

# test w/o models
run_wwb([
"--target-data",
tmp_path / "target.csv",
outputs_path / "target.csv",
"--num-samples",
"1",
"--gt-data",
@@ -92,3 +124,6 @@ def test_embeddings_basic(model_id, model_type, tmp_path):
model_type,
"--genai",
])

remove_artifacts(outputs_path)
remove_artifacts(MODEL_PATH, "model")
112 changes: 58 additions & 54 deletions tools/who_what_benchmark/tests/test_cli_reranking.py
@@ -4,7 +4,7 @@
import shutil
import logging
import tempfile
from test_cli_image import run_wwb
from test_cli_image import run_wwb, get_similarity
from pathlib import Path


@@ -13,40 +13,36 @@
tmp_dir = tempfile.mkdtemp()


OV_RERANK_MODELS = {
("cross-encoder/ms-marco-TinyBERT-L2-v2", "text-classification"),
("Qwen/Qwen3-Reranker-0.6B", "text-generation"),
}
def download_model(model_id, task, tmp_path):
MODEL_PATH = Path(tmp_path, model_id.replace("/", "_"))
subprocess.run(["optimum-cli", "export", "openvino", "--model", model_id, MODEL_PATH, "--task", task],
capture_output=True,
text=True)
return MODEL_PATH


def setup_module():
for model_info in OV_RERANK_MODELS:
model_id = model_info[0]
task = model_info[1]
MODEL_PATH = Path(tmp_dir, model_id.replace("/", "_"))
subprocess.run(["optimum-cli", "export", "openvino", "--model", model_id, MODEL_PATH, "--task", task, "--trust-remote-code"],
capture_output=True,
text=True)
def remove_artifacts(artifacts_path: Path, file_type="outputs"):
Collaborator comment: Use file_type or remove it from the arguments.

Suggested change:
def remove_artifacts(artifacts_path: Path, file_type="outputs"):
def remove_artifacts(artifacts_path: Path):

shutil.rmtree(artifacts_path)


def teardown_module():
logger.info("Remove models")
shutil.rmtree(tmp_dir)


@pytest.mark.parametrize(("model_info"), OV_RERANK_MODELS)
def test_reranking_genai(model_info, tmp_path):
if sys.platform == 'darwin':
pytest.xfail("Ticket 175534")

@pytest.mark.wwb_rerank
@pytest.mark.parametrize(
("model_id", "model_task", "threshold"),
[
("cross-encoder/ms-marco-TinyBERT-L2-v2", "text-classification", 0.6),
("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "text-classification", 0.6),
("Qwen/Qwen3-Reranker-0.6B", "text-generation", 0.6),
],
)
@pytest.mark.xfail(sys.platform == 'darwin', reason="Hangs. Ticket 175534", run=False)
def test_reranking_optimum(model_id, model_task, threshold, tmp_path):
GT_FILE = Path(tmp_dir) / "gt.csv"
model_id = model_info[0]
MODEL_PATH = Path(tmp_dir) / model_id.replace("/", "_")
MODEL_PATH = download_model(model_id, model_task, tmp_path)

# test GenAI
# Collect reference with HF model
run_wwb([
"--base-model",
MODEL_PATH,
model_id,
"--num-samples",
"1",
"--gt-data",
@@ -55,25 +51,17 @@ def test_reranking_genai(model_info, tmp_path):
"CPU",
"--model-type",
"text-reranking",
"--genai"
"--hf",
])

assert GT_FILE.exists()
assert Path(tmp_dir, "reference").exists()


@pytest.mark.parametrize(
("model_info"), OV_RERANK_MODELS
)
@pytest.mark.xfail(sys.platform == 'darwin', reason="Hangs. Ticket 175534", run=False)
def test_reranking_optimum(model_info, tmp_path):
GT_FILE = Path(tmp_dir) / "gt.csv"
model_id = model_info[0]
MODEL_PATH = Path(tmp_dir, model_id.replace("/", "_"))

# Collect reference with HF model
run_wwb([
"--base-model",
model_id,
outputs_path = tmp_path / "optimum"
# test Optimum
outputs_optimum = run_wwb([
"--target-model",
MODEL_PATH,
"--num-samples",
"1",
"--gt-data",
@@ -82,14 +70,24 @@ def test_reranking_optimum(model_info, tmp_path):
"CPU",
"--model-type",
"text-reranking",
"--hf",
"--output",
outputs_path,
])

assert GT_FILE.exists()
assert Path(tmp_dir, "reference").exists()
assert (outputs_path / "target").exists()
assert (outputs_path / "target.csv").exists()
assert (outputs_path / "metrics_per_question.csv").exists()
assert (outputs_path / "metrics.csv").exists()
assert "Metrics for model" in outputs_optimum

# test Optimum
outpus = run_wwb([
similarity = get_similarity(outputs_optimum)
assert similarity >= threshold

remove_artifacts(outputs_path)

outputs_path = tmp_path / "genai"
# test GenAI
outputs_genai = run_wwb([
"--target-model",
MODEL_PATH,
"--num-samples",
@@ -100,20 +98,23 @@ def test_reranking_optimum(model_info, tmp_path):
"CPU",
"--model-type",
"text-reranking",
"--genai",
"--output",
tmp_path,
outputs_path,
])
assert (outputs_path / "target").exists()
assert (outputs_path / "target.csv").exists()
assert (outputs_path / "metrics_per_question.csv").exists()
assert (outputs_path / "metrics.csv").exists()
assert "Metrics for model" in outputs_genai

assert (tmp_path / "target").exists()
assert (tmp_path / "target.csv").exists()
assert (tmp_path / "metrics_per_question.csv").exists()
assert (tmp_path / "metrics.csv").exists()
assert "Metrics for model" in outpus
similarity = get_similarity(outputs_genai)
assert similarity >= threshold

# test w/o models
run_wwb([
"--target-data",
tmp_path / "target.csv",
outputs_path / "target.csv",
"--num-samples",
"1",
"--gt-data",
@@ -124,3 +125,6 @@ def test_reranking_optimum(model_info, tmp_path):
"text-reranking",
"--genai"
])

remove_artifacts(outputs_path)
remove_artifacts(MODEL_PATH, "model")
12 changes: 7 additions & 5 deletions tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
@@ -7,6 +7,7 @@
from PIL import Image
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from sentence_transformers import SentenceTransformer, util
@@ -189,9 +190,10 @@ def evaluate(self, data_gold, data_prediction):
with open(prediction, 'rb') as f:
prediction_data = np.load(f)

cos_sim = F.cosine_similarity(torch.from_numpy(gold_data), torch.from_numpy(prediction_data))
metric_per_passages.append(cos_sim.detach().numpy())
metric_per_gen.append(torch.mean(cos_sim).item())
cos_sim_all = cosine_similarity(gold_data, prediction_data)
cos_sim = np.diag(cos_sim_all)
metric_per_passages.append(cos_sim)
metric_per_gen.append(np.mean(cos_sim))

metric_dict = {"similarity": np.mean(metric_per_gen)}
return metric_dict, {"similarity": metric_per_gen, "similarity_per_passages": metric_per_passages}
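
Side note on the change above: a minimal standalone sketch (hypothetical 3x4 arrays, not project data) of why the diagonal of sklearn's pairwise cosine_similarity reproduces the row-wise comparison the previous torch-based code computed:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

gold = np.array([[1.0, 0.0, 0.0, 0.0],
                 [0.0, 1.0, 0.0, 0.0],
                 [0.5, 0.5, 0.0, 0.0]])
pred = np.array([[1.0, 0.1, 0.0, 0.0],
                 [0.0, 0.9, 0.1, 0.0],
                 [0.5, 0.4, 0.1, 0.0]])

# cosine_similarity returns the full gold-by-pred matrix; the diagonal keeps
# only the (i, i) pairs, i.e. each gold row against its own prediction row,
# which is what F.cosine_similarity computed row-wise before this change.
row_wise = np.diag(cosine_similarity(gold, pred))
print(row_wise, row_wise.mean())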
@@ -222,11 +224,11 @@ def evaluate(self, data_gold, data_prediction):
scores_diff = self.MISSING_DOCUMENT_PENALTY
if document_idx in prediction_scores:
scores_diff = abs(gold_score - prediction_scores[document_idx])
per_query_text.append(scores_diff)
per_query_text.append(scores_diff.item())
Copilot AI comment (Nov 12, 2025): Calling .item() on scores_diff assumes it's a tensor, but scores_diff is set to self.MISSING_DOCUMENT_PENALTY when the document is missing, which is likely a numeric constant (int/float). This will raise an AttributeError. Only call .item() when scores_diff comes from prediction_scores[document_idx].

Suggested change:
per_query_text.append(scores_diff.item())
if isinstance(scores_diff, torch.Tensor):
    per_query_text.append(scores_diff.item())
else:
    per_query_text.append(scores_diff)
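
A brief standalone illustration of the failure mode described above (the penalty value is hypothetical; only the tensor-vs-float distinction matters):

import torch

MISSING_DOCUMENT_PENALTY = 1.0  # assume a plain Python float, as the comment suggests
scores_diff = MISSING_DOCUMENT_PENALTY
# scores_diff.item() here would raise AttributeError: 'float' object has no attribute 'item'

scores_diff = abs(torch.tensor(0.9) - torch.tensor(0.4))
print(scores_diff.item())  # .item() is only available on tensors (and numpy scalars)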

metric_per_query.append(per_query_text)
dist = np.linalg.norm(per_query_text)
similarity_per_query.append(1 / (1 + dist))

metric_dict = {"similarity": np.mean(similarity_per_query)}
return metric_dict, {"similarity": similarity_per_query, "per_text_score_list": metric_per_query}
return metric_dict, {"similarity": similarity_per_query, "per_text_scores_diff": metric_per_query}
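
For the per-query similarity above, a small standalone sketch (hypothetical score differences) of how the L2 norm of the per-document differences is folded into a 0-1 similarity:

import numpy as np

per_query_text = [0.05, 0.10, 0.00]  # hypothetical |gold - predicted| rerank score gaps

dist = np.linalg.norm(per_query_text)  # Euclidean norm of the differences
similarity = 1 / (1 + dist)  # 1.0 when all scores match, approaches 0 as they diverge
print(similarity)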
6 changes: 3 additions & 3 deletions tools/who_what_benchmark/whowhatbench/wwb.py
@@ -658,7 +658,7 @@ def print_embeds_results(evaluator):
)
logger.info(f"Top-{i+1} example:")
logger.info("## Passages num:\n%s\n", len(e["passages"]))
logger.info("## Similarity:\n%s\n", e["similarity"])
logger.info(f"## Similarity:\n{e['similarity']:.5}\n")


def read_cb_config(path):
@@ -687,8 +687,8 @@ def print_rag_results(evaluator):
logger.info(f"Top-{i+1} example:")
logger.info("## Query:\n%s\n", e["query"])
logger.info("## Passages num:\n%s\n", len(e["passages"]))
logger.info("## Similarity:\n%s\n", e["similarity"])
logger.info("## Top_n scores:\n%s\n", e["per_text_score_list"])
logger.info(f"## Similarity:\n{e['similarity']:.5}\n")
logger.info("## Difference in scores pre texts:\n%s\n", e['per_text_scores_diff'])
Copilot AI comment (Nov 12, 2025): Typo in log message: 'pre texts' should be 'per texts'.

Suggested change:
logger.info("## Difference in scores pre texts:\n%s\n", e['per_text_scores_diff'])
logger.info("## Difference in scores per texts:\n%s\n", e['per_text_scores_diff'])


def main():
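
On the logging change above: the :.5 spec in the new f-strings is plain Python general formatting with five significant digits, not five decimal places, e.g.:

print(f"{0.987654321:.5}")  # -> 0.98765
print(f"{0.5:.5}")          # -> 0.5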