diff --git a/tools/who_what_benchmark/tests/test_cli_embeddings.py b/tools/who_what_benchmark/tests/test_cli_embeddings.py
index 9b793224c8..e7ff65ebbd 100644
--- a/tools/who_what_benchmark/tests/test_cli_embeddings.py
+++ b/tools/who_what_benchmark/tests/test_cli_embeddings.py
@@ -1,14 +1,21 @@
 import subprocess  # nosec B404
 import sys
 import pytest
+import shutil
 import logging
-from test_cli_image import run_wwb
+from pathlib import Path
+from test_cli_image import run_wwb, get_similarity
 
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
+def remove_artifacts(artifacts_path: Path, file_type="outputs"):
+    logger.info(f"Remove {file_type}")
+    shutil.rmtree(artifacts_path)
+
+
 @pytest.mark.parametrize(
     ("model_id", "model_type"),
     [
@@ -21,6 +28,7 @@ def test_embeddings_basic(model_id, model_type, tmp_path):
     GT_FILE = tmp_path / "gt.csv"
     MODEL_PATH = tmp_path / model_id.replace("/", "_")
+    SIMILARITY_THRESHOLD = 0.99
 
     result = subprocess.run(["optimum-cli", "export",
                              "openvino", "-m", model_id,
@@ -47,8 +55,9 @@ def test_embeddings_basic(model_id, model_type, tmp_path):
         "--hf",
     ])
 
+    outputs_path = tmp_path / "optimum"
     # test Optimum
-    run_wwb([
+    outputs = run_wwb([
         "--target-model",
         MODEL_PATH,
         "--num-samples",
@@ -59,10 +68,24 @@ def test_embeddings_basic(model_id, model_type, tmp_path):
         "CPU",
         "--model-type",
         model_type,
+        "--output",
+        outputs_path,
     ])
+    assert (outputs_path / "target").exists()
+    assert (outputs_path / "target.csv").exists()
+    assert (outputs_path / "metrics_per_question.csv").exists()
+    assert (outputs_path / "metrics.csv").exists()
+    assert "Metrics for model" in outputs
+
+    similarity = get_similarity(outputs)
+    assert similarity >= SIMILARITY_THRESHOLD
+
+    remove_artifacts(outputs_path)
 
+    outputs_path = tmp_path / "genai"
     # test GenAI
-    run_wwb([
+    outputs = run_wwb([
         "--target-model",
         MODEL_PATH,
         "--num-samples",
@@ -75,13 +98,22 @@ def test_embeddings_basic(model_id, model_type, tmp_path):
         model_type,
         "--genai",
         "--output",
-        tmp_path,
+        outputs_path,
     ])
+    assert (outputs_path / "target").exists()
+    assert (outputs_path / "target.csv").exists()
+    assert (outputs_path / "metrics_per_question.csv").exists()
+    assert (outputs_path / "metrics.csv").exists()
+    assert "Metrics for model" in outputs
+
+    similarity = get_similarity(outputs)
+    assert similarity >= SIMILARITY_THRESHOLD
+
     # test w/o models
     run_wwb([
         "--target-data",
-        tmp_path / "target.csv",
+        outputs_path / "target.csv",
         "--num-samples",
         "1",
         "--gt-data",
@@ -92,3 +124,6 @@ def test_embeddings_basic(model_id, model_type, tmp_path):
         model_type,
         "--genai",
     ])
+
+    remove_artifacts(outputs_path)
+    remove_artifacts(MODEL_PATH, "model")
diff --git a/tools/who_what_benchmark/tests/test_cli_reranking.py b/tools/who_what_benchmark/tests/test_cli_reranking.py
index 990b31ddde..ca5f5eb298 100644
--- a/tools/who_what_benchmark/tests/test_cli_reranking.py
+++ b/tools/who_what_benchmark/tests/test_cli_reranking.py
@@ -4,7 +4,7 @@
 import shutil
 import logging
 import tempfile
-from test_cli_image import run_wwb
+from test_cli_image import run_wwb, get_similarity
 from pathlib import Path
 
 
@@ -13,40 +13,36 @@
 tmp_dir = tempfile.mkdtemp()
 
 
-OV_RERANK_MODELS = {
-    ("cross-encoder/ms-marco-TinyBERT-L2-v2", "text-classification"),
-    ("Qwen/Qwen3-Reranker-0.6B", "text-generation"),
-}
+def download_model(model_id, task, tmp_path):
+    MODEL_PATH = Path(tmp_path, model_id.replace("/", "_"))
+    subprocess.run(["optimum-cli", "export", "openvino", "--model", model_id, MODEL_PATH, "--task", task],
+                   capture_output=True,
+                   text=True)
+    return MODEL_PATH
 
 
-def setup_module():
-    for model_info in OV_RERANK_MODELS:
-        model_id = model_info[0]
-        task = model_info[1]
-        MODEL_PATH = Path(tmp_dir, model_id.replace("/", "_"))
-        subprocess.run(["optimum-cli", "export", "openvino", "--model", model_id, MODEL_PATH, "--task", task, "--trust-remote-code"],
-                       capture_output=True,
-                       text=True)
+def remove_artifacts(artifacts_path: Path, file_type="outputs"):
+    shutil.rmtree(artifacts_path)
 
 
-def teardown_module():
-    logger.info("Remove models")
-    shutil.rmtree(tmp_dir)
-
-
-@pytest.mark.parametrize(("model_info"), OV_RERANK_MODELS)
-def test_reranking_genai(model_info, tmp_path):
-    if sys.platform == 'darwin':
-        pytest.xfail("Ticket 175534")
-
+@pytest.mark.wwb_rerank
+@pytest.mark.parametrize(
+    ("model_id", "model_task", "threshold"),
+    [
+        ("cross-encoder/ms-marco-TinyBERT-L2-v2", "text-classification", 0.6),
+        ("tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "text-classification", 0.6),
+        ("Qwen/Qwen3-Reranker-0.6B", "text-generation", 0.6),
+    ],
+)
+@pytest.mark.xfail(sys.platform == 'darwin', reason="Hangs. Ticket 175534", run=False)
+def test_reranking_optimum(model_id, model_task, threshold, tmp_path):
     GT_FILE = Path(tmp_dir) / "gt.csv"
-    model_id = model_info[0]
-    MODEL_PATH = Path(tmp_dir) / model_id.replace("/", "_")
+    MODEL_PATH = download_model(model_id, model_task, tmp_path)
 
-    # test GenAI
+    # Collect reference with HF model
     run_wwb([
         "--base-model",
-        MODEL_PATH,
+        model_id,
         "--num-samples",
         "1",
         "--gt-data",
@@ -55,25 +51,17 @@ def test_reranking_genai(model_info, tmp_path):
         "CPU",
         "--model-type",
         "text-reranking",
-        "--genai"
+        "--hf",
     ])
+    assert GT_FILE.exists()
     assert Path(tmp_dir, "reference").exists()
 
-
-@pytest.mark.parametrize(
-    ("model_info"), OV_RERANK_MODELS
-)
-@pytest.mark.xfail(sys.platform == 'darwin', reason="Hangs. Ticket 175534", run=False)
-def test_reranking_optimum(model_info, tmp_path):
-    GT_FILE = Path(tmp_dir) / "gt.csv"
-    model_id = model_info[0]
-    MODEL_PATH = Path(tmp_dir, model_id.replace("/", "_"))
-
-    # Collect reference with HF model
-    run_wwb([
-        "--base-model",
-        model_id,
+    outputs_path = tmp_path / "optimum"
+    # test Optimum
+    outputs_optimum = run_wwb([
+        "--target-model",
+        MODEL_PATH,
         "--num-samples",
         "1",
         "--gt-data",
@@ -82,14 +70,24 @@ def test_reranking_optimum(model_info, tmp_path):
         "CPU",
         "--model-type",
         "text-reranking",
-        "--hf",
+        "--output",
+        outputs_path,
     ])
-    assert GT_FILE.exists()
-    assert Path(tmp_dir, "reference").exists()
+    assert (outputs_path / "target").exists()
+    assert (outputs_path / "target.csv").exists()
+    assert (outputs_path / "metrics_per_question.csv").exists()
+    assert (outputs_path / "metrics.csv").exists()
+    assert "Metrics for model" in outputs_optimum
 
-    # test Optimum
-    outpus = run_wwb([
+    similarity = get_similarity(outputs_optimum)
+    assert similarity >= threshold
+
+    remove_artifacts(outputs_path)
+
+    outputs_path = tmp_path / "genai"
+    # test GenAI
+    outputs_genai = run_wwb([
         "--target-model",
         MODEL_PATH,
         "--num-samples",
@@ -100,20 +98,23 @@ def test_reranking_optimum(model_info, tmp_path):
         "CPU",
         "--model-type",
         "text-reranking",
+        "--genai",
         "--output",
-        tmp_path,
+        outputs_path,
     ])
+    assert (outputs_path / "target").exists()
+    assert (outputs_path / "target.csv").exists()
+    assert (outputs_path / "metrics_per_question.csv").exists()
+    assert (outputs_path / "metrics.csv").exists()
+    assert "Metrics for model" in outputs_genai
 
-    assert (tmp_path / "target").exists()
-    assert (tmp_path / "target.csv").exists()
-    assert (tmp_path / "metrics_per_question.csv").exists()
-    assert (tmp_path / "metrics.csv").exists()
-    assert "Metrics for model" in outpus
+    similarity = get_similarity(outputs_genai)
+    assert similarity >= threshold
 
     # test w/o models
     run_wwb([
         "--target-data",
-        tmp_path / "target.csv",
+        outputs_path / "target.csv",
         "--num-samples",
         "1",
         "--gt-data",
@@ -124,3 +125,6 @@ def test_reranking_optimum(model_info, tmp_path):
         "text-reranking",
         "--genai"
     ])
+
+    remove_artifacts(outputs_path)
+    remove_artifacts(MODEL_PATH, "model")
diff --git a/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py b/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
index bf7923beba..8009dc8c34 100644
--- a/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
+++ b/tools/who_what_benchmark/whowhatbench/whowhat_metrics.py
@@ -7,6 +7,7 @@
 from PIL import Image
 import torch
 import torch.nn.functional as F
+from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
 
 from sentence_transformers import SentenceTransformer, util
@@ -189,9 +190,10 @@ def evaluate(self, data_gold, data_prediction):
             with open(prediction, 'rb') as f:
                 prediction_data = np.load(f)
 
-            cos_sim = F.cosine_similarity(torch.from_numpy(gold_data), torch.from_numpy(prediction_data))
-            metric_per_passages.append(cos_sim.detach().numpy())
-            metric_per_gen.append(torch.mean(cos_sim).item())
+            cos_sim_all = cosine_similarity(gold_data, prediction_data)
+            cos_sim = np.diag(cos_sim_all)
+            metric_per_passages.append(cos_sim)
+            metric_per_gen.append(np.mean(cos_sim))
 
         metric_dict = {"similarity": np.mean(metric_per_gen)}
         return metric_dict, {"similarity": metric_per_gen, "similarity_per_passages": metric_per_passages}
@@ -222,11 +224,11 @@ def evaluate(self, data_gold, data_prediction):
                 scores_diff = self.MISSING_DOCUMENT_PENALTY
                 if document_idx in prediction_scores:
                     scores_diff = abs(gold_score - prediction_scores[document_idx])
-                per_query_text.append(scores_diff)
+                per_query_text.append(scores_diff.item())
 
             metric_per_query.append(per_query_text)
             dist = np.linalg.norm(per_query_text)
             similarity_per_query.append(1 / (1 + dist))
 
         metric_dict = {"similarity": np.mean(similarity_per_query)}
-        return metric_dict, {"similarity": similarity_per_query, "per_text_score_list": metric_per_query}
+        return metric_dict, {"similarity": similarity_per_query, "per_text_scores_diff": metric_per_query}
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index 73b975b439..2bc113c757 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -658,7 +658,7 @@ def print_embeds_results(evaluator):
         )
         logger.info(f"Top-{i+1} example:")
         logger.info("## Passages num:\n%s\n", len(e["passages"]))
-        logger.info("## Similarity:\n%s\n", e["similarity"])
+        logger.info(f"## Similarity:\n{e['similarity']:.5}\n")
 
 
 def read_cb_config(path):
@@ -687,8 +687,8 @@ def print_rag_results(evaluator):
         logger.info(f"Top-{i+1} example:")
         logger.info("## Query:\n%s\n", e["query"])
         logger.info("## Passages num:\n%s\n", len(e["passages"]))
-        logger.info("## Similarity:\n%s\n", e["similarity"])
-        logger.info("## Top_n scores:\n%s\n", e["per_text_score_list"])
+        logger.info(f"## Similarity:\n{e['similarity']:.5}\n")
+        logger.info("## Difference in scores per text:\n%s\n", e['per_text_scores_diff'])
 
 
 def main():