@@ -8,7 +8,7 @@ vLLM Judge uses a single `evaluate()` method that adapts to your needs:

 ```python
 result = await judge.evaluate(
-    response="...",    # What to evaluate
+    content="...",     # What to evaluate
     criteria="...",    # What to evaluate for
     # Optional parameters to control evaluation
 )
@@ -23,13 +23,13 @@ The simplest form - just provide text and criteria:
 ```python
 # Basic evaluation
 result = await judge.evaluate(
-    response="The Earth is the third planet from the Sun.",
+    content="The Earth is the third planet from the Sun.",
     criteria="scientific accuracy"
 )

 # Multiple criteria
 result = await judge.evaluate(
-    response="Dear customer, thank you for your feedback...",
+    content="Dear customer, thank you for your feedback...",
     criteria="professionalism, empathy, and clarity"
 )
 ```
@@ -51,14 +51,14 @@ Control the scoring range:
 ```python
 # 5-point scale
 result = await judge.evaluate(
-    response="The product works as advertised.",
+    content="The product works as advertised.",
     criteria="review helpfulness",
     scale=(1, 5)
 )

 # 100-point scale for fine-grained scoring
 result = await judge.evaluate(
-    response=essay_text,
+    content=essay_text,
     criteria="writing quality",
     scale=(0, 100)
 )
@@ -70,7 +70,7 @@ Provide evaluation guidance as text:

 ```python
 result = await judge.evaluate(
-    response="I hate this product!",
+    content="I hate this product!",
     criteria="sentiment analysis",
     rubric="Classify as 'positive', 'neutral', or 'negative' based on emotional tone"
 )
@@ -83,7 +83,7 @@ Define specific score meanings:

 ```python
 result = await judge.evaluate(
-    response=code_snippet,
+    content=code_snippet,
     criteria="code quality",
     scale=(1, 10),
     rubric={
@@ -104,7 +104,7 @@ Compare two responses by providing a dictionary:
 ```python
 # Compare two responses
 result = await judge.evaluate(
-    response={
+    content={
         "a": "Python is great for beginners due to its simple syntax.",
         "b": "Python's intuitive syntax makes it ideal for newcomers."
     },
@@ -114,7 +114,7 @@ result = await judge.evaluate(

 # With additional context
 result = await judge.evaluate(
-    response={
+    content={
         "a": customer_response_1,
         "b": customer_response_2
     },
@@ -131,7 +131,7 @@ Add context to improve evaluation accuracy:

 ```python
 result = await judge.evaluate(
-    response="Just use the default settings.",
+    content="Just use the default settings.",
     criteria="helpfulness",
     context="User asked how to configure advanced security settings"
 )
@@ -144,7 +144,7 @@ Guide the evaluation with examples:

 ```python
 result = await judge.evaluate(
-    response="Your code has a bug on line 5.",
+    content="Your code has a bug on line 5.",
     criteria="constructive feedback quality",
     scale=(1, 10),
     examples=[
@@ -169,7 +169,7 @@ Take full control of the evaluator's persona:
 ```python
 # Expert evaluator
 result = await judge.evaluate(
-    response=medical_advice,
+    content=medical_advice,
     criteria="medical accuracy and safety",
     system_prompt="""You are a licensed medical professional reviewing
     health information for accuracy and potential harm. Be extremely
@@ -178,7 +178,7 @@ result = await judge.evaluate(

 # Specific domain expert
 result = await judge.evaluate(
-    response=legal_document,
+    content=legal_document,
     criteria="legal compliance",
     system_prompt="""You are a corporate lawyer specializing in GDPR
     compliance. Evaluate for regulatory adherence."""
@@ -193,7 +193,7 @@ When you provide a scale, you get numeric scoring:

 ```python
 result = await judge.evaluate(
-    response="Great product!",
+    content="Great product!",
     criteria="review quality",
     scale=(1, 5)
 )
@@ -208,7 +208,7 @@ Without a scale but with category rubric:

 ```python
 result = await judge.evaluate(
-    response="This might be considered offensive.",
+    content="This might be considered offensive.",
     criteria="content moderation",
     rubric="Classify as 'safe', 'warning', or 'unsafe'"
 )
@@ -223,7 +223,7 @@ For yes/no evaluations:

 ```python
 result = await judge.evaluate(
-    response=user_message,
+    content=user_message,
     criteria="spam detection",
     rubric="Determine if this is 'spam' or 'not spam'"
 )
@@ -237,7 +237,7 @@ You can request both classification and scoring:

 ```python
 result = await judge.evaluate(
-    response=essay,
+    content=essay,
     criteria="academic quality",
     rubric="""
     Grade the essay:
@@ -263,7 +263,7 @@ result = await judge.evaluate(
 async def qa_check(response: str, threshold: float = 7.0):
     """Check if response meets quality threshold."""
     result = await judge.evaluate(
-        response=response,
+        content=response,
         criteria="helpfulness, accuracy, and professionalism",
         scale=(1, 10)
     )
@@ -283,7 +283,7 @@ async def qa_check(response: str, threshold: float = 7.0):
 async def compare_models(prompt: str, response_a: str, response_b: str):
     """Compare two model responses."""
     result = await judge.evaluate(
-        response={"a": response_a, "b": response_b},
+        content={"a": response_a, "b": response_b},
         criteria="helpfulness, accuracy, and clarity",
         context=f"User prompt: {prompt}"
     )
@@ -310,7 +310,7 @@ async def comprehensive_evaluation(content: str):
     results = {}
     for aspect, criteria in aspects.items():
         result = await judge.evaluate(
-            response=content,
+            content=content,
             criteria=criteria,
             scale=(1, 10)
         )
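Taken together, the hunks above are a pure keyword rename: every call site now passes the text to evaluate as `content=` instead of `response=`, while the other parameters shown in these docs (`criteria`, `scale`, `rubric`, `context`, `examples`, `system_prompt`) are untouched. A minimal before/after sketch of a caller, assuming a `judge` instance already constructed per the library's setup docs (its construction is not part of this diff), and `check_accuracy` being a hypothetical wrapper used only for illustration:

```python
import asyncio

async def check_accuracy(judge, text: str):
    # Old keyword (before this change):
    # result = await judge.evaluate(response=text, criteria="scientific accuracy")

    # New keyword (after this change):
    result = await judge.evaluate(content=text, criteria="scientific accuracy")
    return result

# Example invocation; the `judge` object itself comes from the library's own setup docs:
# asyncio.run(check_accuracy(judge, "The Earth is the third planet from the Sun."))
```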