|
2 | 2 | "cells": [ |
3 | 3 | { |
4 | 4 | "cell_type": "code", |
5 | | - "execution_count": 5, |
| 5 | + "execution_count": 1, |
6 | 6 | "metadata": {}, |
7 | 7 | "outputs": [], |
8 | 8 | "source": [ |
|
31 | 31 | }, |
32 | 32 | { |
33 | 33 | "cell_type": "code", |
34 | | - "execution_count": 7, |
| 34 | + "execution_count": 2, |
35 | 35 | "metadata": {}, |
36 | 36 | "outputs": [], |
37 | 37 | "source": [ |
|
40 | 40 | }, |
41 | 41 | { |
42 | 42 | "cell_type": "code", |
43 | | - "execution_count": 8, |
| 43 | + "execution_count": 3, |
44 | 44 | "metadata": {}, |
45 | 45 | "outputs": [], |
46 | 46 | "source": [ |
|
50 | 50 | }, |
51 | 51 | { |
52 | 52 | "cell_type": "code", |
53 | | - "execution_count": 9, |
| 53 | + "execution_count": 4, |
54 | 54 | "metadata": {}, |
55 | 55 | "outputs": [ |
56 | 56 | { |
57 | 57 | "data": { |
58 | 58 | "text/plain": [ |
59 | | - "{'decision': False,\n", |
60 | | - " 'reasoning': 'The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.',\n", |
| 59 | + "{'decision': 'PASS',\n", |
| 60 | + " 'reasoning': 'The content maintains a professional tone by clearly stating the intention to bump the version and asking for confirmation.',\n", |
61 | 61 | " 'score': None,\n", |
62 | 62 | " 'metadata': {'model': 'qwen2',\n", |
63 | | - " 'raw_response': '{\\n \"decision\": false,\\n \"reasoning\": \"The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.\",\\n \"score\": null\\n}'}}" |
| 63 | + " 'raw_response': '{\\n \"decision\": \"PASS\",\\n \"reasoning\": \"The content maintains a professional tone by clearly stating the intention to bump the version and asking for confirmation.\",\\n \"score\": null\\n}'}}" |
64 | 64 | ] |
65 | 65 | }, |
66 | | - "execution_count": 9, |
| 66 | + "execution_count": 4, |
67 | 67 | "metadata": {}, |
68 | 68 | "output_type": "execute_result" |
69 | 69 | } |
|
74 | 74 | }, |
75 | 75 | { |
76 | 76 | "cell_type": "code", |
77 | | - "execution_count": 10, |
| 77 | + "execution_count": 5, |
78 | 78 | "metadata": {}, |
79 | 79 | "outputs": [ |
80 | 80 | { |
81 | 81 | "data": { |
82 | 82 | "text/plain": [ |
83 | | - "{'decision': 5,\n", |
84 | | - " 'reasoning': 'The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.',\n", |
85 | | - " 'score': 5.0,\n", |
| 83 | + "{'decision': 'False',\n", |
| 84 | + " 'reasoning': 'The content lacks a professional tone as it is phrased as a question without providing context or justification for the version bump.',\n", |
| 85 | + " 'score': 0.2,\n", |
86 | 86 | " 'metadata': {'model': 'qwen2',\n", |
87 | | - " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.\",\\n \"score\": 5\\n}'}}" |
| 87 | + " 'raw_response': '{\\n \"decision\": \"False\",\\n \"reasoning\": \"The content lacks a professional tone as it is phrased as a question without providing context or justification for the version bump.\",\\n \"score\": 0.2\\n}'}}" |
88 | 88 | ] |
89 | 89 | }, |
90 | | - "execution_count": 10, |
| 90 | + "execution_count": 5, |
91 | 91 | "metadata": {}, |
92 | 92 | "output_type": "execute_result" |
93 | 93 | } |
94 | 94 | ], |
95 | 95 | "source": [ |
96 | 96 | "res = await judge.evaluate(content=\"I want to bump the version to 1.0.1, is it a good idea?\",\n", |
97 | 97 | " criteria=\"Check the professional tone.\",\n", |
98 | | - " rubric=\"Assign a score between 0 and 10 based on the professional tone. 0 is the worst and 10 is the best.\")\n", |
| 98 | + " rubric=\"Assign a score between 0 and 1 based on the professional tone. 0 is the worst and 1 is the best.\")\n", |
99 | 99 | "res.model_dump()" |
100 | 100 | ] |
101 | 101 | }, |
102 | 102 | { |
103 | 103 | "cell_type": "code", |
104 | | - "execution_count": 11, |
| 104 | + "execution_count": 8, |
105 | 105 | "metadata": {}, |
106 | 106 | "outputs": [ |
107 | 107 | { |
108 | 108 | "data": { |
109 | 109 | "text/plain": [ |
110 | | - "{'decision': 5,\n", |
111 | | - " 'reasoning': 'The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.',\n", |
112 | | - " 'score': 5.0,\n", |
| 110 | + "{'decision': 'True',\n", |
| 111 | + " 'reasoning': 'The response is clear and to the point, maintaining a professional tone by asking for a decision on the version bump.',\n", |
| 112 | + " 'score': 1.0,\n", |
113 | 113 | " 'metadata': {'model': 'qwen2',\n", |
114 | | - " 'raw_response': '{\\n \"decision\": 5,\\n \"reasoning\": \"The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.\",\\n \"score\": 5\\n}'}}" |
| 114 | + " 'raw_response': '{\\n \"decision\": \"True\",\\n \"reasoning\": \"The response is clear and to the point, maintaining a professional tone by asking for a decision on the version bump.\",\\n \"score\": 1\\n}'}}" |
115 | 115 | ] |
116 | 116 | }, |
117 | | - "execution_count": 11, |
| 117 | + "execution_count": 8, |
118 | 118 | "metadata": {}, |
119 | 119 | "output_type": "execute_result" |
120 | 120 | } |
|
124 | 124 | " criteria=\"Check the professional tone.\",\n", |
125 | 125 | " rubric={\n", |
126 | 126 | " 0: \"The response is not professional.\",\n", |
127 | | - " 5: \"The response is somewhat professional.\",\n", |
128 | | - " 10: \"The response is very professional.\"\n", |
129 | | - " })\n", |
| 127 | + " 0.5: \"The response is somewhat professional.\",\n", |
| 128 | + " 1: \"The response is very professional.\"\n", |
| 129 | + " },\n", |
| 130 | + " scale=(0, 1)\n", |
| 131 | + " )\n", |
130 | 132 | "res.model_dump()" |
131 | 133 | ] |
132 | 134 | }, |
133 | 135 | { |
134 | 136 | "cell_type": "code", |
135 | | - "execution_count": 12, |
| 137 | + "execution_count": 9, |
136 | 138 | "metadata": {}, |
137 | 139 | "outputs": [], |
138 | 140 | "source": [ |
|
146 | 148 | }, |
147 | 149 | { |
148 | 150 | "cell_type": "code", |
149 | | - "execution_count": 13, |
| 151 | + "execution_count": 10, |
150 | 152 | "metadata": {}, |
151 | 153 | "outputs": [ |
152 | 154 | { |
153 | 155 | "data": { |
154 | 156 | "text/plain": [ |
155 | 157 | "{'decision': 'moderate',\n", |
156 | | - " 'reasoning': 'The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.',\n", |
| 158 | + " 'reasoning': 'The content is a question about version bumping and lacks formal structure or context, which makes it slightly informal. However, it does not contain any unprofessional language or tone.',\n", |
157 | 159 | " 'score': 5.0,\n", |
158 | 160 | " 'metadata': {'model': 'qwen2',\n", |
159 | | - " 'raw_response': '{\\n \"decision\": \"moderate\",\\n \"reasoning\": \"The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.\",\\n \"score\": 5\\n}'}}" |
| 161 | + " 'raw_response': '{\\n \"decision\": \"moderate\",\\n \"reasoning\": \"The content is a question about version bumping and lacks formal structure or context, which makes it slightly informal. However, it does not contain any unprofessional language or tone.\",\\n \"score\": 5\\n}'}}" |
160 | 162 | ] |
161 | 163 | }, |
162 | | - "execution_count": 13, |
| 164 | + "execution_count": 10, |
163 | 165 | "metadata": {}, |
164 | 166 | "output_type": "execute_result" |
165 | 167 | } |
|
172 | 174 | }, |
173 | 175 | { |
174 | 176 | "cell_type": "code", |
175 | | - "execution_count": 14, |
| 177 | + "execution_count": 11, |
176 | 178 | "metadata": {}, |
177 | 179 | "outputs": [ |
178 | 180 | { |
179 | 181 | "data": { |
180 | 182 | "text/plain": [ |
181 | 183 | "{'decision': 'non-professional',\n", |
182 | | - " 'reasoning': 'The response uses informal and expletive language, which is not appropriate for a professional context.',\n", |
183 | | - " 'score': 1.0,\n", |
| 184 | + " 'reasoning': \"The phrase 'Holy shit, this is a great!' is informal and contains an exclamation, which does not meet the criteria for a professional tone.\",\n", |
| 185 | + " 'score': 2.0,\n", |
184 | 186 | " 'metadata': {'model': 'qwen2',\n", |
185 | | - " 'raw_response': '{\\n \"decision\": \"non-professional\",\\n \"reasoning\": \"The response uses informal and expletive language, which is not appropriate for a professional context.\",\\n \"score\": 1\\n}'}}" |
| 187 | + " 'raw_response': '{\\n \"decision\": \"non-professional\",\\n \"reasoning\": \"The phrase \\'Holy shit, this is a great!\\' is informal and contains an exclamation, which does not meet the criteria for a professional tone.\",\\n \"score\": 2\\n}'}}" |
186 | 188 | ] |
187 | 189 | }, |
188 | | - "execution_count": 14, |
| 190 | + "execution_count": 11, |
189 | 191 | "metadata": {}, |
190 | 192 | "output_type": "execute_result" |
191 | 193 | } |
|
198 | 200 | }, |
199 | 201 | { |
200 | 202 | "cell_type": "code", |
201 | | - "execution_count": 15, |
| 203 | + "execution_count": 12, |
202 | 204 | "metadata": {}, |
203 | 205 | "outputs": [ |
204 | 206 | { |
205 | 207 | "data": { |
206 | 208 | "text/plain": [ |
207 | | - "{'decision': True,\n", |
208 | | - " 'reasoning': 'The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.',\n", |
| 209 | + "{'decision': 'PASS',\n", |
| 210 | + " 'reasoning': 'The statement is accurate and complete as it correctly identifies Paris as the capital of France.',\n", |
209 | 211 | " 'score': None,\n", |
210 | 212 | " 'metadata': {'model': 'qwen2',\n", |
211 | | - " 'raw_response': '{\\n \"decision\": true,\\n \"reasoning\": \"The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.\",\\n \"score\": null\\n}',\n", |
| 213 | + " 'raw_response': '{\\n \"decision\": \"PASS\",\\n \"reasoning\": \"The statement is accurate and complete as it correctly identifies Paris as the capital of France.\",\\n \"score\": null\\n}',\n", |
212 | 214 | " 'template_vars': {'input': 'What is the capital of France?'},\n", |
213 | 215 | " 'template_engine': 'format'}}" |
214 | 216 | ] |
215 | 217 | }, |
216 | | - "execution_count": 15, |
| 218 | + "execution_count": 12, |
217 | 219 | "metadata": {}, |
218 | 220 | "output_type": "execute_result" |
219 | 221 | } |
|
0 commit comments