diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..8e0dbd5f9 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "[mdx]": { + "editor.formatOnSave": false + } +} diff --git a/README.md b/README.md index 4cbe0750c..775fa584f 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,16 @@ -# Mintlify Starter Kit +# LangWatch and LangEvals Documentation -Click on `Use this template` to copy the Mintlify starter kit. The starter kit contains examples including - -- Guide pages -- Navigation -- Customizations -- API Reference pages -- Use of popular components +This is the documentation repository for the [LangWatch](https://github.com/langwatch/langwatch) and [LangEvals](https://github.com/langwatch/langevals) projects. ### Development -Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. To install, use the following command +Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. To install, use the following command: ``` npm i -g mintlify ``` -Run the following command at the root of your documentation (where mint.json is) +Run the following command: ``` mintlify dev @@ -24,9 +18,5 @@ mintlify dev ### Publishing Changes -Install our Github App to auto propagate changes from your repo to your deployment. Changes will be deployed to production automatically after pushing to the default branch. Find the link to install on your dashboard. - -#### Troubleshooting +Install our Github App to auto propagate changes from your repo to your deployment. Changes will be deployed to production automatically after pushing to the default branch. Find the link to install on your dashboard. -- Mintlify dev isn't running - Run `mintlify install` it'll re-install dependencies. 
-- Page loads as a 404 - Make sure you are running in a folder with `mint.json` diff --git a/api-reference/annotations/create-annotation-trace.mdx b/api-reference/annotations/create-annotation-trace.mdx new file mode 100644 index 000000000..bfb58ada6 --- /dev/null +++ b/api-reference/annotations/create-annotation-trace.mdx @@ -0,0 +1,4 @@ +--- +title: 'Create annotation for single trace' +openapi: 'POST /api/annotations/trace/{id}' +--- diff --git a/api-reference/annotations/delete-annotation.mdx b/api-reference/annotations/delete-annotation.mdx new file mode 100644 index 000000000..571b633c1 --- /dev/null +++ b/api-reference/annotations/delete-annotation.mdx @@ -0,0 +1,5 @@ +--- +title: 'Delete single annotation' +openapi: 'DELETE /api/annotations/{id}' +--- + diff --git a/api-reference/annotations/get-all-annotations-trace.mdx b/api-reference/annotations/get-all-annotations-trace.mdx new file mode 100644 index 000000000..d29463fcf --- /dev/null +++ b/api-reference/annotations/get-all-annotations-trace.mdx @@ -0,0 +1,4 @@ +--- +title: 'Get annotations for single trace' +openapi: 'GET /api/annotations/trace/{id}' +--- diff --git a/api-reference/annotations/get-annotation.mdx b/api-reference/annotations/get-annotation.mdx new file mode 100644 index 000000000..e3216097f --- /dev/null +++ b/api-reference/annotations/get-annotation.mdx @@ -0,0 +1,4 @@ +--- +title: 'Get annotations' +openapi: 'GET /api/annotations' +--- diff --git a/api-reference/annotations/get-single-annotation.mdx b/api-reference/annotations/get-single-annotation.mdx new file mode 100644 index 000000000..da42bf778 --- /dev/null +++ b/api-reference/annotations/get-single-annotation.mdx @@ -0,0 +1,4 @@ +--- +title: 'Get single annotation' +openapi: 'GET /api/annotations/{id}' +--- diff --git a/api-reference/annotations/overview.mdx b/api-reference/annotations/overview.mdx new file mode 100644 index 000000000..8c42de72e --- /dev/null +++ b/api-reference/annotations/overview.mdx @@ -0,0 +1,26 @@ +--- +title: 'Overview' +description: 'Annotations are used to annotate traces with additional information' +--- + +## Intro + +With the Annotations API, you can annotate traces with additional information. This is useful when you want to attach extra context to a trace, such as a comment or a thumbs up/down reaction. + +## Authentication + +To make a call to the Annotations API, you will need to pass your LangWatch API key in the `X-Auth-Token` header. Your API key can be found on the setup page under settings.
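For example, listing the annotations in your project is a plain HTTP call; the sketch below uses Python's `requests`, and the API key shown is a placeholder for your own.

```python
import requests

LANGWATCH_ENDPOINT = "https://app.langwatch.ai"
API_KEY = "your-api-key"  # placeholder: use the API key from your project settings

# List all annotations for the project, authenticating via the X-Auth-Token header
response = requests.get(
    f"{LANGWATCH_ENDPOINT}/api/annotations",
    headers={"X-Auth-Token": API_KEY},
)
response.raise_for_status()

for annotation in response.json():
    print(annotation["id"], annotation.get("isThumbsUp"), annotation.get("comment"))
```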
+ + +#### Allowed Methods + +- `GET /api/annotations` - Get a list of annotations +- `GET /api/annotations/:id` - Get a single annotation +- `DELETE /api/annotations/:id` - Delete a single annotation +- `PATCH /api/annotations/:id` - Update a single annotation +- `GET /api/annotations/trace/:id` - Get the annotations for a single trace +- `POST /api/annotations/trace/:id` - Create an annotation for a single trace + + + + diff --git a/api-reference/annotations/patch-annotation.mdx b/api-reference/annotations/patch-annotation.mdx new file mode 100644 index 000000000..4e04358f9 --- /dev/null +++ b/api-reference/annotations/patch-annotation.mdx @@ -0,0 +1,5 @@ +--- +title: 'Patch single annotation' +openapi: 'PATCH /api/annotations/{id}' +--- + diff --git a/api-reference/datasets/post-dataset-entries.mdx b/api-reference/datasets/post-dataset-entries.mdx new file mode 100644 index 000000000..0357260c6 --- /dev/null +++ b/api-reference/datasets/post-dataset-entries.mdx @@ -0,0 +1,4 @@ +--- +title: 'Add entries to a dataset' +openapi: 'POST /api/dataset/{slug}/entries' +--- diff --git a/api-reference/endpoint/get.mdx b/api-reference/endpoint/get.mdx deleted file mode 100644 index 56aa09ec1..000000000 --- a/api-reference/endpoint/get.mdx +++ /dev/null @@ -1,4 +0,0 @@ ---- -title: 'Get Plants' -openapi: 'GET /plants' ---- diff --git a/api-reference/openapi.json b/api-reference/openapi.json deleted file mode 100644 index b1509be04..000000000 --- a/api-reference/openapi.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "openapi": "3.0.1", - "info": { - "title": "OpenAPI Plant Store", - "description": "A sample API that uses a plant store as an example to demonstrate features in the OpenAPI specification", - "license": { - "name": "MIT" - }, - "version": "1.0.0" - }, - "servers": [ - { - "url": "http://sandbox.mintlify.com" - } - ], - "security": [ - { - "bearerAuth": [] - } - ], - "paths": { - "/plants": { - "get": { - "description": "Returns all plants from the system that the user has access to", - "parameters": [ - { - "name": "limit", - "in": "query", - "description": "The maximum number of results to return", - "schema": { - "type": "integer", - "format": "int32" - } - } - ], - "responses": { - "200": { - "description": "Plant response", - "content": { - "application/json": { - "schema": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Plant" - } - } - } - } - }, - "400": { - "description": "Unexpected error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Error" - } - } - } - } - } - }, - "post": { - "description": "Creates a new plant in the store", - "requestBody": { - "description": "Plant to add to the store", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/NewPlant" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "plant response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Plant" - } - } - } - }, - "400": { - "description": "unexpected error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Error" - } - } - } - } - } - } - }, - "/plants/{id}": { - "delete": { - "description": "Deletes a single plant based on the ID supplied", - "parameters": [ - { - "name": "id", - "in": "path", - "description": "ID of plant to delete", - "required": true, - "schema": { - "type": "integer", - "format": "int64" - } - } - ], - "responses": { - "204": { - "description": "Plant deleted", - "content": {} - }, - "400": 
{ - "description": "unexpected error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Error" - } - } - } - } - } - } - } - }, - "components": { - "schemas": { - "Plant": { - "required": [ - "name" - ], - "type": "object", - "properties": { - "name": { - "description": "The name of the plant", - "type": "string" - }, - "tag": { - "description": "Tag to specify the type", - "type": "string" - } - } - }, - "NewPlant": { - "allOf": [ - { - "$ref": "#/components/schemas/Plant" - }, - { - "required": [ - "id" - ], - "type": "object", - "properties": { - "id": { - "description": "Identification number of the plant", - "type": "integer", - "format": "int64" - } - } - } - ] - }, - "Error": { - "required": [ - "error", - "message" - ], - "type": "object", - "properties": { - "error": { - "type": "integer", - "format": "int32" - }, - "message": { - "type": "string" - } - } - } - }, - "securitySchemes": { - "bearerAuth": { - "type": "http", - "scheme": "bearer" - } - } - } -} \ No newline at end of file diff --git a/api-reference/openapiLangWatch.json b/api-reference/openapiLangWatch.json new file mode 100644 index 000000000..2d65e2301 --- /dev/null +++ b/api-reference/openapiLangWatch.json @@ -0,0 +1,1386 @@ +{ + "openapi": "3.1.0", + "info": { + "title": "LangWatch API", + "version": "1.0.0", + "description": "LangWatch openapi spec" + }, + "servers": [ + { + "url": "https://app.langwatch.ai" + } + ], + "security": [ + { + "api_key": [] + } + ], + "paths": { + "/api/annotations": { + "get": { + "description": "Returns all annotations for project", + "responses": { + "200": { + "description": "Annotation response", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Annotation" + } + } + } + } + }, + "400": { + "description": "Unexpected error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + } + }, + "/api/annotations/trace/{id}": { + "get": { + "description": "Returns all annotations for single trace", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of trace to fetch", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Annotation response", + "content": { + "application/json": { + "schema": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Annotation" + } + } + } + } + }, + "400": { + "description": "Unexpected error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + }, + "post": { + "description": "Create an annotation for a single trace", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of the trace to annotate", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "description": "Annotation data", + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "comment": { + "type": "string" + }, + "isThumbsUp": { + "type": "boolean" + }, + "email": { + "type": "string" + } + } + } + } + } + }, + "responses": { + "200": { + "description": "Annotation created", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Annotation" + } + } + } + }, + "400": { + "description": "Invalid input", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + } + }, + 
"/api/annotations/{id}": { + "get": { + "description": "Returns a single annotation based on the ID supplied", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of annotation to fetch", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Annotation response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Annotation" + } + } + } + }, + "400": { + "description": "Unexpected error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + }, + "delete": { + "description": "Deletes a single annotation based on the ID supplied", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of annotation to delete", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Annotation response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string" + }, + "message": { + "type": "string" + } + } + } + } + } + }, + "400": { + "description": "Unexpected error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + }, + "patch": { + "description": "Updates a single annotation based on the ID supplied", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of annotation to delete", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "comment": { + "type": "string" + }, + "isThumbsUp": { + "type": "boolean" + }, + "email": { + "type": "string" + } + } + } + } + } + }, + "responses": { + "200": { + "description": "Annotation response", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "status": { + "type": "string" + }, + "message": { + "type": "string" + } + } + } + } + } + }, + "400": { + "description": "Unexpected error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + } + }, + "/api/trace/{id}": { + "get": { + "description": "Returns single trace details based on the ID supplied", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of trace to share", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Trace details with spans and evaluations", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "trace_id": { + "type": "string", + "example": "trace_BKZL_X0TKSD4oa1aBJTc_" + }, + "project_id": { + "type": "string", + "example": "KAXYxPR8MUgTcP8CF193y" + }, + "metadata": { + "type": "object", + "properties": { + "sdk_version": { + "type": "string", + "example": "0.1.11" + }, + "sdk_language": { + "type": "string", + "example": "python" + } + } + }, + "timestamps": { + "type": "object", + "properties": { + "started_at": { + "type": "integer", + "example": 1721382486868 + }, + "inserted_at": { + "type": "integer", + "example": 1721382492894 + }, + "updated_at": { + "type": "integer", + "example": 1721382492894 + } + } + }, + "input": { + "type": "object", + "properties": { + "value": { + "type": "string", + "example": "hi" + } + } + }, + "output": { + "type": "object", + "properties": { + "value": { + "type": "string", + "example": "Hey 
there! ๐Ÿ‘‹๐Ÿ˜Š" + } + } + }, + "metrics": { + "type": "object", + "properties": { + "first_token_ms": { + "type": "integer", + "example": 1449 + }, + "total_time_ms": { + "type": "integer", + "example": 1543 + }, + "prompt_tokens": { + "type": "integer", + "example": 20 + }, + "completion_tokens": { + "type": "integer", + "example": 7 + }, + "tokens_estimated": { + "type": "boolean", + "example": true + } + } + }, + "error": { + "type": "object", + "nullable": true, + "properties": { + "stacktrace": { + "type": "array", + "items": { + "type": "string" + } + }, + "message": { + "type": "string" + }, + "has_error": { + "type": "boolean" + } + }, + "example": null + }, + "indexing_md5s": { + "type": "array", + "items": { + "type": "string" + }, + "example": ["cccd21e0b70c706034dfd9f7772816a3"] + }, + "spans": { + "type": "array", + "items": { + "type": "object", + "properties": { + "trace_id": { + "type": "string", + "example": "trace_BKZL_X0TKSD4oa1aBJTc_" + }, + "span_id": { + "type": "string", + "example": "span_h1xUkcUJilhudDrLeQbR_" + }, + "timestamps": { + "type": "object", + "properties": { + "finished_at": { + "type": "integer", + "example": 1721382488392 + }, + "updated_at": { + "type": "integer", + "example": 1721382492027 + }, + "started_at": { + "type": "integer", + "example": 1721382486895 + }, + "first_token_at": { + "type": "integer", + "example": 1721382488317 + }, + "inserted_at": { + "type": "integer", + "example": 1721382492027 + } + } + }, + "type": { + "type": "string", + "example": "llm" + }, + "error": { + "type": "object", + "nullable": true, + "properties": { + "stacktrace": { + "type": "array", + "items": { + "type": "string" + } + }, + "message": { + "type": "string" + }, + "has_error": { + "type": "boolean" + } + }, + "example": null + }, + "params": { + "type": "object", + "properties": { + "stream": { + "type": "boolean", + "example": true + }, + "temperature": { + "type": "number", + "example": 1 + } + } + }, + "project_id": { + "type": "string", + "example": "KAXYxPR8MUgTcP8CF193y" + }, + "parent_id": { + "type": "string", + "nullable": true, + "example": "span_ijZNjUMTz3ys0Z0YKwF_T" + }, + "name": { + "type": "string", + "nullable": true, + "example": null + }, + "model": { + "type": "string", + "example": "openai/gpt-4o" + }, + "metrics": { + "type": "object", + "properties": { + "tokens_estimated": { + "type": "boolean", + "example": true + }, + "completion_tokens": { + "type": "integer", + "example": 7 + }, + "prompt_tokens": { + "type": "integer", + "example": 20 + } + } + }, + "input": { + "type": "object", + "properties": { + "type": { + "type": "string", + "example": "chat_messages" + }, + "value": { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "example": "system" + }, + "content": { + "type": "string", + "example": "You are a helpful assistant that only reply in short tweet-like responses, using lots of emojis." + } + } + }, + "example": [ + { + "role": "system", + "content": "You are a helpful assistant that only reply in short tweet-like responses, using lots of emojis." + }, + { + "role": "user", + "content": "hi" + } + ] + } + } + }, + "output": { + "type": "object", + "properties": { + "type": { + "type": "string", + "example": "chat_messages" + }, + "value": { + "type": "array", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "example": "assistant" + }, + "content": { + "type": "string", + "example": "Hey there! 
๐Ÿ‘‹๐Ÿ˜Š" + } + } + }, + "example": [ + { + "role": "assistant", + "content": "Hey there! ๐Ÿ‘‹๐Ÿ˜Š" + } + ] + } + } + } + } + } + }, + "evaluations": { + "type": "array", + "items": { + "type": "object", + "properties": { + "evaluation_id": { + "type": "string", + "example": "check_VCagriZHNWICSOM09dXjM" + }, + "name": { + "type": "string", + "example": "Ragas Answer Relevancy" + }, + "type": { + "type": "string", + "example": "ragas/answer_relevancy" + }, + "trace_id": { + "type": "string", + "example": "trace_BKZL_X0TKSD4oa1aBJTc_" + }, + "project_id": { + "type": "string", + "example": "KAXYxPR8MUgTcP8CF193y" + }, + "status": { + "type": "string", + "example": "error" + }, + "timestamps": { + "type": "object", + "properties": { + "updated_at": { + "type": "integer", + "example": 1721383657788 + }, + "inserted_at": { + "type": "integer", + "example": 1721382493358 + } + } + }, + "error": { + "type": "object", + "properties": { + "stacktrace": { + "type": "array", + "items": { + "type": "string" + }, + "example": ["TypeError: fetch failed"] + }, + "message": { + "type": "string", + "example": "fetch failed" + }, + "has_error": { + "type": "boolean", + "example": true + } + } + } + } + } + } + } + } + } + } + }, + "400": { + "description": "Unexpected error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + } + }, + "/api/trace/search": { + "post": { + "summary": "Search traces", + "description": "Search for traces based on given criteria", + "tags": ["Traces"], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SearchRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SearchResponse" + } + } + } + } + } + } + }, + "/api/trace/{id}/share": { + "post": { + "description": "Returns a public path for a trace", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of trace to share", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Public path created", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "path": { + "type": "string" + } + } + } + } + } + }, + "400": { + "description": "Unexpected error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + } + }, + "/api/trace/{id}/unshare": { + "post": { + "description": "Deletes a public path for a trace", + "parameters": [ + { + "name": "id", + "in": "path", + "description": "ID of trace to unshare", + "required": true, + "schema": { + "type": "string" + } + } + ], + "responses": { + "200": { + "description": "Public path deleted", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "message": { + "type": "string" + } + } + } + } + } + }, + "400": { + "description": "Unexpected error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/Error" + } + } + } + } + } + } + }, + "/api/dataset/{slug}/entries": { + "post": { + "responses": {}, + "operationId": "postApiDatasetBySlugEntries", + "parameters": [ + { + "schema": { + "type": "string" + }, + "in": "path", + "name": "slug", + "required": true + } + ], + "description": "Add entries to a dataset", + "requestBody": { + "content": { + "application/json": { + "schema": { + 
"$ref": "#/components/schemas/DatasetPostEntries" + } + } + } + } + } + }, + "/api/prompts": { + "get": { + "responses": {}, + "operationId": "getApiPrompts", + "parameters": [], + "description": "Get all prompts for a project" + }, + "post": { + "responses": {}, + "operationId": "postApiPrompts", + "parameters": [], + "description": "Create a new prompt", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1 + } + }, + "required": ["name"] + } + } + } + } + } + }, + "/api/prompts/{id}": { + "get": { + "responses": {}, + "operationId": "getApiPromptsById", + "parameters": [ + { + "schema": { + "type": "string" + }, + "in": "path", + "name": "id", + "required": true + } + ], + "description": "Get a specific prompt" + }, + "put": { + "responses": {}, + "operationId": "putApiPromptsById", + "parameters": [ + { + "schema": { + "type": "string" + }, + "in": "path", + "name": "id", + "required": true + } + ], + "description": "Update a prompt", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1 + } + } + } + } + } + } + }, + "delete": { + "responses": {}, + "operationId": "deleteApiPromptsById", + "parameters": [ + { + "schema": { + "type": "string" + }, + "in": "path", + "name": "id", + "required": true + } + ], + "description": "Delete a prompt" + } + }, + "/api/prompts/{id}/versions": { + "get": { + "responses": {}, + "operationId": "getApiPromptsByIdVersions", + "parameters": [ + { + "schema": { + "type": "string" + }, + "in": "path", + "name": "id", + "required": true + } + ], + "description": "Get all versions for a prompt" + }, + "post": { + "responses": {}, + "operationId": "postApiPromptsByIdVersions", + "parameters": [ + { + "schema": { + "type": "string" + }, + "in": "path", + "name": "id", + "required": true + } + ], + "description": "Create a new version for a prompt", + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "authorId": { + "type": ["string", "null"] + }, + "projectId": { + "type": "string", + "minLength": 1 + }, + "configId": { + "type": "string", + "minLength": 1 + }, + "schemaVersion": { + "type": "string", + "const": "1.0" + }, + "commitMessage": { + "type": "string" + }, + "configData": { + "type": "object", + "properties": { + "version": { + "type": "string", + "const": "1.0" + }, + "prompt": { + "type": "string", + "minLength": 1 + }, + "model": { + "type": "string", + "minLength": 1 + }, + "inputs": { + "type": "array", + "items": { + "type": "object", + "properties": { + "identifier": { + "type": "string", + "minLength": 1 + }, + "type": { + "type": "string", + "minLength": 1 + } + }, + "required": ["identifier", "type"] + }, + "minItems": 1 + }, + "outputs": { + "type": "array", + "items": { + "type": "object", + "properties": { + "identifier": { + "type": "string", + "minLength": 1 + }, + "type": { + "type": "string", + "minLength": 1 + } + }, + "required": ["identifier", "type"] + }, + "minItems": 1 + }, + "demonstrations": { + "type": "object", + "properties": { + "columns": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "minLength": 1 + }, + "type": { + "type": "string", + "minLength": 1 + } + }, + "required": ["name", "type"] + } + }, + "rows": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": {} + }, 
+ "default": [] + } + }, + "required": ["columns"] + } + }, + "required": [ + "version", + "prompt", + "model", + "inputs", + "outputs", + "demonstrations" + ] + } + }, + "required": [ + "authorId", + "projectId", + "configId", + "schemaVersion", + "commitMessage", + "configData" + ] + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Annotation": { + "required": ["name"], + "type": "object", + "properties": { + "id": { + "description": "The ID of the annotation", + "type": "string" + }, + "projectId": { + "description": "The ID of the project", + "type": "string" + }, + "traceId": { + "description": "The ID of the trace", + "type": "string" + }, + "comment": { + "description": "The comment of the annotation", + "type": "string" + }, + "isThumbsUp": { + "description": "The thumbs up status of the annotation", + "type": "boolean" + }, + "userId": { + "description": "The ID of the user", + "type": "string" + }, + "createdAt": { + "description": "The created at of the annotation", + "type": "string" + }, + "updatedAt": { + "description": "The updated at of the annotation", + "type": "string" + }, + "email": { + "description": "The email of the user", + "type": "string" + } + } + }, + "Error": { + "required": ["error", "message"], + "type": "object", + "properties": { + "error": { + "type": "integer", + "format": "int32" + }, + "message": { + "type": "string" + } + } + }, + "SearchRequest": { + "type": "object", + "properties": { + "query": { + "type": "string" + }, + "startDate": { + "type": "string", + "format": "date-time" + }, + "endDate": { + "type": "string", + "format": "date-time" + }, + "pageSize": { + "type": "integer", + "example": 1000 + }, + "scrollId": { + "type": "string", + "example": "123" + }, + "filters": { + "type": "object", + "additionalProperties": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + }, + "SearchResponse": { + "type": "object", + "properties": { + "traces": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Trace" + } + }, + "pagination": { + "$ref": "#/components/schemas/Pagination" + } + } + }, + "Trace": { + "type": "object", + "properties": { + "trace_id": { + "type": "string" + }, + "project_id": { + "type": "string" + }, + "timestamps": { + "$ref": "#/components/schemas/Timestamps" + }, + "input": { + "$ref": "#/components/schemas/Input" + }, + "output": { + "$ref": "#/components/schemas/Output" + }, + "metadata": { + "$ref": "#/components/schemas/Metadata" + }, + "metrics": { + "$ref": "#/components/schemas/Metrics" + }, + "indexing_md5s": { + "type": "array", + "items": { + "type": "string" + } + }, + "error": { + "type": "string", + "nullable": true + }, + "evaluations": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Evaluation" + } + }, + "contexts": { + "type": "array", + "items": {} + } + } + }, + "Timestamps": { + "type": "object", + "properties": { + "inserted_at": { + "type": "integer" + }, + "started_at": { + "type": "integer" + }, + "updated_at": { + "type": "integer" + } + } + }, + "Input": { + "type": "object", + "properties": { + "value": { + "type": "string" + }, + "satisfaction_score": { + "type": "number" + } + } + }, + "Output": { + "type": "object", + "properties": { + "value": { + "type": "string" + } + } + }, + "Metadata": { + "type": "object", + "properties": { + "sdk_language": { + "type": "string" + }, + "sdk_version": { + "type": "string" + } + } + }, + "Metrics": { + "type": "object", + "properties": { + "tokens_estimated": { + "type": "boolean" + }, + 
"completion_tokens": { + "type": "integer" + }, + "prompt_tokens": { + "type": "integer" + }, + "total_cost": { + "type": "number" + }, + "total_time_ms": { + "type": "integer" + }, + "first_token_ms": { + "type": "integer", + "nullable": true + } + } + }, + "Evaluation": { + "type": "object", + "properties": { + "evaluation_id": { + "type": "string" + }, + "score": { + "type": "number" + }, + "timestamps": { + "$ref": "#/components/schemas/EvaluationTimestamps" + }, + "evaluator_id": { + "type": "string" + }, + "name": { + "type": "string" + }, + "details": { + "type": "string" + }, + "passed": { + "type": "boolean" + }, + "label": { + "type": "string", + "nullable": true + }, + "type": { + "type": "string" + }, + "status": { + "type": "string" + } + } + }, + "EvaluationTimestamps": { + "type": "object", + "properties": { + "finished_at": { + "type": "integer" + }, + "updated_at": { + "type": "integer" + } + } + }, + "Pagination": { + "type": "object", + "properties": { + "scrollId": { + "type": "string", + "example": "123" + }, + "totalHits": { + "type": "integer", + "example": 1254 + } + } + }, + "DatasetPostEntries": { + "type": "object", + "properties": { + "entries": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": {} + }, + "example": [ + { + "input": "hi", + "output": "Hello, how can I help you today?" + } + ] + } + }, + "required": ["entries"] + } + }, + "securitySchemes": { + "api_key": { + "type": "apiKey", + "in": "header", + "name": "X-Auth-Token" + } + } + } +} diff --git a/api-reference/traces/create-public-trace-path.mdx b/api-reference/traces/create-public-trace-path.mdx new file mode 100644 index 000000000..9e31d113f --- /dev/null +++ b/api-reference/traces/create-public-trace-path.mdx @@ -0,0 +1,4 @@ +--- +title: 'Create public path for single trace' +openapi: 'POST /api/trace/{id}/share' +--- diff --git a/api-reference/traces/delete-public-trace-path.mdx b/api-reference/traces/delete-public-trace-path.mdx new file mode 100644 index 000000000..20ae59430 --- /dev/null +++ b/api-reference/traces/delete-public-trace-path.mdx @@ -0,0 +1,4 @@ +--- +title: 'Delete an existing public path for a trace' +openapi: 'POST /api/trace/{id}/unshare' +--- diff --git a/api-reference/traces/get-trace-details.mdx b/api-reference/traces/get-trace-details.mdx new file mode 100644 index 000000000..0a1dddd95 --- /dev/null +++ b/api-reference/traces/get-trace-details.mdx @@ -0,0 +1,4 @@ +--- +title: 'Get trace details' +openapi: 'GET /api/trace/{id}' +--- diff --git a/api-reference/traces/overview.mdx b/api-reference/traces/overview.mdx new file mode 100644 index 000000000..ca0d2be71 --- /dev/null +++ b/api-reference/traces/overview.mdx @@ -0,0 +1,18 @@ +--- +title: 'Overview' +description: 'A Trace is a collection of runs that are related to a single operation' +--- + +## Intro + +With the Traces API, you are able to create a public path for a trace. This is useful if you want to share a trace with a colleague. + +## Authentication + +To make a call to the Traces API, you will need to pass through your LangWatch API key in the header as `X-Auth-Token`. Your API key can be found on the setup page under settings. 
+ + +#### Allowed Methods + +- `GET /api/trace/:id` - Get trace details +- `POST /api/trace/:id/share` - Create a public path for a trace diff --git a/api-reference/traces/search-traces.mdx b/api-reference/traces/search-traces.mdx new file mode 100644 index 000000000..4217d8bbf --- /dev/null +++ b/api-reference/traces/search-traces.mdx @@ -0,0 +1,4 @@ +--- +title: 'Search traces' +openapi: 'POST /api/trace/search' +--- diff --git a/concepts.mdx b/concepts.mdx new file mode 100644 index 000000000..473cdb045 --- /dev/null +++ b/concepts.mdx @@ -0,0 +1,50 @@ +--- +title: Concepts +--- + +Understanding LangWatch concepts can be made easier with two practical examples: an AI travel assistant and a tool for generating blog posts. Let's dive into how each core concept of LangWatch applies to these examples. + +Imagine you've created an AI travel assistant that helps users plan their trips by conversing with them to suggest destinations, find the best prices for flights, and assist with bookings. On the other hand, you also have a platform that assists users in generating and refining blog posts, including SEO optimization. + +### Threads + +Field: `thread_id` + +A **thread** in the context of the AI travel assistant represents a complete conversation, that is, the group of all traces. It's the entire chat that groups all back-and-forth messages as the user inquires about different aspects of their travel plan. For the blog post tool, a thread could be, for example, the creation process of a new blog post, encapsulating all interactions that contribute to its completion, from headline generation to the final SEO adjustments. + +### Traces + +Field: `trace_id` + +A **trace** in the travel assistant's example is each distinct message, for example when a user asks for the best prices for a destination, or asks if pets are allowed in the hotel. + +In the blog post tool case, a trace could be, for example, each generation of a catchy headline option, the generation of a draft for the body, or the SEO keywords generation. + +It does not matter how many steps are inside; each trace is a full end-to-end generation handled by the AI. + +The `trace_id` is randomly generated by default if you don't provide one. However, to keep control of your traces and connect them to events like [Thumbs Up/Down](./user-events/thumbs-up-down), we recommend generating a random id on your side, using, for example, the [nanoid](https://pypi.org/project/nanoid/) library. + +### Spans + +Field: `span_id` + +Within each trace, **spans** represent the individual steps taken to achieve the outcome. In the travel bot scenario, a span could be a call to the LLM to suggest potential destinations, another span for querying the airline price API, and a final span for formatting the response to present to the user. For the blog post tool, one span might be the initial text generation, followed by a subsequent span for the LLM to self-critique the content, and another span for the third LLM call refining the text based on the critique. + +### User ID + +Field: `user_id` + +The **user id** identifies the final user of the product. In the context of both the AI travel assistant and the tool for generating blog posts, it's the ID that identifies the person using the app, usually their user account ID. This allows LangWatch to track how end users are using the product.
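To make these fields concrete, here is a small sketch of generating a controllable `trace_id` with nanoid and collecting the fields described above; the metadata dictionary is only illustrative of the field names, so check the integration guides for the exact way to attach them with your SDK.

```python
from nanoid import generate  # pip install nanoid

# Generate the trace_id yourself so you can reference this trace later
trace_id = generate()

# Illustrative metadata for one chat turn of the AI travel assistant example;
# how you attach it to the trace depends on the SDK integration you use
metadata = {
    "thread_id": "conversation-123",  # groups every trace of one conversation
    "user_id": "user-account-456",    # the end user of your product
}
```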
+ +### Customer ID + +Field: `customer_id` + +The **customer id** is used when you provide a platform for your customers to build LLM apps for their end users. For example, it would be if your are building a platform that allow _others_ to build AI assistants for _their_ users. Having the **customer id** allows LangWatch to group all metrics and messages per customer, which allows you to access LangWatch data through our APIs to build a custom analytics dashboard for your customers, so they can see how their own LLM assistants are behaving. + +### Labels + +Field: `labels` + +You can use **labels** to organize and compare the traces sent to LangWatch for any comparison you want to do. You can for example apply different labels for different actions, for example a label `blogpost_title` for generating the blog post title and another `blogpost_keywords`, for generating keywords. You can use it for versioning as well, for example label the first implementation +version as `v1.0.0`, then do a prompt engineering to improve the AI travel planner itenerary builder, and label it as `v1.0.1`. This way you can easily focus on each different functionality or compare versions on LangWatch dashboard. diff --git a/development.mdx b/development.mdx deleted file mode 100644 index 878300893..000000000 --- a/development.mdx +++ /dev/null @@ -1,98 +0,0 @@ ---- -title: 'Development' -description: 'Learn how to preview changes locally' ---- - - - **Prerequisite** You should have installed Node.js (version 18.10.0 or - higher). - - -Step 1. Install Mintlify on your OS: - - - -```bash npm -npm i -g mintlify -``` - -```bash yarn -yarn global add mintlify -``` - - - -Step 2. Go to the docs are located (where you can find `mint.json`) and run the following command: - -```bash -mintlify dev -``` - -The documentation website is now available at `http://localhost:3000`. - -### Custom Ports - -Mintlify uses port 3000 by default. You can use the `--port` flag to customize the port Mintlify runs on. For example, use this command to run in port 3333: - -```bash -mintlify dev --port 3333 -``` - -You will see an error like this if you try to run Mintlify in a port that's already taken: - -```md -Error: listen EADDRINUSE: address already in use :::3000 -``` - -## Mintlify Versions - -Each CLI is linked to a specific version of Mintlify. Please update the CLI if your local website looks different than production. - - - -```bash npm -npm i -g mintlify@latest -``` - -```bash yarn -yarn global upgrade mintlify -``` - - - -## Deployment - - - Unlimited editors available under the [Startup - Plan](https://mintlify.com/pricing) - - -You should see the following if the deploy successfully went through: - - - - - -## Troubleshooting - -Here's how to solve some common problems when working with the CLI. - - - - Update to Node v18. Run `mintlify install` and try again. - - -Go to the `C:/Users/Username/.mintlify/` directory and remove the `mint` -folder. Then Open the Git Bash in this location and run `git clone -https://github.com/mintlify/mint.git`. - -Repeat step 3. - - - - Try navigating to the root of your device and delete the ~/.mintlify folder. - Then run `mintlify dev` again. - - - -Curious about what changed in a CLI version? 
[Check out the CLI changelog.](/changelog/command-line) diff --git a/dspy-visualization/custom-optimizer.mdx b/dspy-visualization/custom-optimizer.mdx new file mode 100644 index 000000000..1966a8fa1 --- /dev/null +++ b/dspy-visualization/custom-optimizer.mdx @@ -0,0 +1,51 @@ +--- +title: Tracking Custom DSPy Optimizer +sidebarTitle: Custom Optimizer Tracking +--- + +If you are building a custom DSPy optimizer, then LangWatch won't support tracking it out of the box, but adding track to any custom optimizer is also very simple. + +## 1. Initialize LangWatch DSPy with optimizer=None + +Before the compilation step, explicitly provide `None` on the `optimizer` parameter to be able to track the steps manually: + +```python +langwatch.dspy.init(experiment="dspy-custom-optimizer-example", optimizer=None) + +compiled_rag = my_awesome_optimizer.compile(RAG(), trainset=trainset) +``` + +## 2. Track the metric function + +Either before instantiating your optimizer, or inside the compilation step, don't forget to wrap the metric function with `langwatch.dspy.track_metric` so that it's tracked: + +```python +metric = langwatch.dspy.track_metric(metric) +``` + +## 3. Track each step + +Now at each step your optimizer progresses, call `langwatch.dspy.log_step` to capture the score at the current step index, optimizer info and predictors being used on this step evaluation: + +```python +langwatch.dspy.log_step( + optimizer=DSPyOptimizer( + name="MyAwesomeOptimizer", + parameters={ + "hyperparam": 1, + }, + ), + index="1", # step index + score=0.5, + label="score", + predictors=candidate_program.predictors(), +) +``` + +The LLM calls and examples being evaluated with be tracked automatically and logged in together with calling `log_step`. + +## Wrapping up + +That's it! You should see the steps of the optimizer in the LangWatch dashboard now. + +For any questions or issues, feel free to contact our support, join our channel on [Discord](https://discord.com/invite/kT4PhDS2gH) or [open an issue](https://github.com/langwatch/langwatch/issues) on our GitHub. diff --git a/dspy-visualization/quickstart.mdx b/dspy-visualization/quickstart.mdx new file mode 100644 index 000000000..a314ea3e4 --- /dev/null +++ b/dspy-visualization/quickstart.mdx @@ -0,0 +1,65 @@ +--- +title: DSPy Visualization Quickstart +sidebarTitle: Quickstart +--- + +[](https://colab.research.google.com/github/langwatch/langwatch/blob/main/python-sdk/examples/dspy_visualization.ipynb) + +LangWatch DSPy Visualization allows you to start tracking your DSPy experiments in real-time and easily follow the progress, track costs and debug each step. + +## 1. Install the Python library + + + + ```bash + !pip install langwatch + ``` + + + ```bash + pip install langwatch + ``` + + + +## 2. Login to LangWatch + +Import and authenticate the LangWatch SDK: + +```python +import langwatch + +langwatch.login() +``` + +Be sure to login or create an account on the link that will be displayed, then provide your API key when prompted. + +## 3. 
Start tracking + +Before your DSPy program compilation starts, initialize langwatch with your experiment name and the optimizer to be tracked: + +```python +# Initialize langwatch for this run, to track the optimizer compilation +langwatch.dspy.init(experiment="my-awesome-experiment", optimizer=optimizer) + +compiled_rag = optimizer.compile(RAG(), trainset=trainset) +``` + +## Follow your experiment + +Open the link provided when the compilation starts or go to your [LangWatch dashboard](https://app.langwatch.com) to follow the progress of your experiments: + + + + + +## Wrapping up + +With your experiments tracked on LangWatch, now it's time to explore how is the training going, take a look at the examples, the llm calls, +the different steps and so on, so you can understand and hypothesize where you could improve your DSPy program, and keep iterating! + + +When you are ready to deploy your DSPy program, you can monitor the inference traces on LangWatch dashboard as well. Check out the [Python Integration Guide](/integration/python/guide) for more details. + + +For any questions or issues, feel free to contact our support, join our channel on [Discord](https://discord.com/invite/kT4PhDS2gH) or [open an issue](https://github.com/langwatch/langwatch/issues) on our GitHub. diff --git a/dspy-visualization/rag-visualization.mdx b/dspy-visualization/rag-visualization.mdx new file mode 100644 index 000000000..ec9f753f4 --- /dev/null +++ b/dspy-visualization/rag-visualization.mdx @@ -0,0 +1,105 @@ +--- +title: "RAG Visualization" +--- + +[](https://colab.research.google.com/github/langwatch/langevals/blob/main/notebooks/tutorials/dspy_rag.ipynb) + +In this tutorial we will explain how LangWatch can help observing optimization of RAG application with [DSPy](https://dspy-docs.vercel.app). + +## DSPy RAG Module +As an example of RAG application we will use the sample app that is provided in the official documentation of DSPy library, +you can read more by following this link - [RAG tutorial](https://dspy-docs.vercel.app/docs/tutorials/rag). + +Firstly, lets access the dataset of wiki abstracts that will be used for example RAG optimization. + +```python +import dspy + +turbo = dspy.OpenAI(model='gpt-3.5-turbo') +colbertv2_wiki17_abstracts = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts') + +dspy.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts) + +from dspy.datasets import HotPotQA + +# Load the dataset. +dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0) + +# Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata. +trainset = [x.with_inputs('question') for x in dataset.train] +devset = [x.with_inputs('question') for x in dataset.dev] + +len(trainset), len(devset) +``` + +Next step - to define the RAG module itself. +You can explain the task and what the expected outputs mean in this context that an LLM can optimize these commands later. 
+ +```python +class GenerateAnswer(dspy.Signature): + """Answer questions with short factoid answers.""" + + context = dspy.InputField(desc="may contain relevant facts") + question = dspy.InputField() + answer = dspy.OutputField(desc="often between 1 and 5 words") + + +class RAG(dspy.Module): + def __init__(self, num_passages=3): + super().__init__() + + self.retrieve = dspy.Retrieve(k=num_passages) + self.generate_answer = dspy.ChainOfThought(GenerateAnswer) + + def forward(self, question): + context = self.retrieve(question).passages + prediction = self.generate_answer(context=context, question=question) + return dspy.Prediction(context=context, answer=prediction.answer) +``` +Finally, you can connect to LangWatch. After running this code snippet - you will get a link that will give you access to +an `api_key` in the browser. Paste the API key into your code editor popup and press enter - **now you are connected to LangWatch**. + +```python +import langwatch + +langwatch.endpoint = "https://app.langwatch.ai" +langwatch.login() +``` + +Last step is to actually run the prompt optitmizer. In this example `BootstrapFewShot` is used and it will +bootstrap our prompt with the best demos from our dataset. + +```python +from dspy.teleprompt import BootstrapFewShot +from dspy import evaluate +from dotenv import load_dotenv +load_dotenv() + +# Validation logic: check that the predicted answer is correct. +# Also check that the retrieved context does actually contain that answer. +def validate_context_and_answer(example, pred, trace=None): + answer_EM = evaluate.answer_exact_match(example, pred) + answer_PM = evaluate.answer_passage_match(example, pred) + return answer_EM and answer_PM + +# Set up a basic teleprompter, which will compile our RAG program. +teleprompter = BootstrapFewShot(metric=validate_context_and_answer) + +langwatch.dspy.init(experiment="rag-dspy-tutorial", optimizer=teleprompter) + +# Compile! +compiled_rag = teleprompter.compile(RAG(), trainset=trainset) +``` + +The result of optimization can be found on your LangWatch dashboard. On the graph you can see how many demos were boostrapped during the first optimization step. +DSPy Experiment Dashboard + + +Additionally, you can see each LLM call that has been done during the optimization with the corresponding costs and token counts. +DSPy LLM calls + + + + You can access and run the code yourself in Jupyter Notebook + + diff --git a/essentials/images.mdx b/essentials/images.mdx deleted file mode 100644 index 60ad42d38..000000000 --- a/essentials/images.mdx +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: 'Images and Embeds' -description: 'Add image, video, and other HTML elements' -icon: 'image' ---- - - - -## Image - -### Using Markdown - -The [markdown syntax](https://www.markdownguide.org/basic-syntax/#images) lets you add images using the following code - -```md -![title](/path/image.jpg) -``` - -Note that the image file size must be less than 5MB. Otherwise, we recommend hosting on a service like [Cloudinary](https://cloudinary.com/) or [S3](https://aws.amazon.com/s3/). You can then use that URL and embed. - -### Using Embeds - -To get more customizability with images, you can also use [embeds](/writing-content/embed) to add images - -```html - -``` - -## Embeds and HTML elements - - - -
- - - -Mintlify supports [HTML tags in Markdown](https://www.markdownguide.org/basic-syntax/#html). This is helpful if you prefer HTML tags to Markdown syntax, and lets you create documentation with infinite flexibility. - - - -### iFrames - -Loads another HTML page within the document. Most commonly used for embedding videos. - -```html - -``` diff --git a/essentials/settings.mdx b/essentials/settings.mdx deleted file mode 100644 index d9dd2d7e1..000000000 --- a/essentials/settings.mdx +++ /dev/null @@ -1,318 +0,0 @@ ---- -title: 'Global Settings' -description: 'Mintlify gives you complete control over the look and feel of your documentation using the mint.json file' -icon: 'gear' ---- - -Every Mintlify site needs a `mint.json` file with the core configuration settings. Learn more about the [properties](#properties) below. - -## Properties - - -Name of your project. Used for the global title. - -Example: `mintlify` - - - - - An array of groups with all the pages within that group - - - The name of the group. - - Example: `Settings` - - - - The relative paths to the markdown files that will serve as pages. - - Example: `["customization", "page"]` - - - - - - - - Path to logo image or object with path to "light" and "dark" mode logo images - - - Path to the logo in light mode - - - Path to the logo in dark mode - - - Where clicking on the logo links you to - - - - - - Path to the favicon image - - - - Hex color codes for your global theme - - - The primary color. Used for most often for highlighted content, section - headers, accents, in light mode - - - The primary color for dark mode. Used for most often for highlighted - content, section headers, accents, in dark mode - - - The primary color for important buttons - - - The color of the background in both light and dark mode - - - The hex color code of the background in light mode - - - The hex color code of the background in dark mode - - - - - - - - Array of `name`s and `url`s of links you want to include in the topbar - - - The name of the button. - - Example: `Contact us` - - - The url once you click on the button. Example: `https://mintlify.com/contact` - - - - - - - - - Link shows a button. GitHub shows the repo information at the url provided including the number of GitHub stars. - - - If `link`: What the button links to. - - If `github`: Link to the repository to load GitHub information from. - - - Text inside the button. Only required if `type` is a `link`. - - - - - - - Array of version names. Only use this if you want to show different versions - of docs with a dropdown in the navigation bar. - - - - An array of the anchors, includes the `icon`, `color`, and `url`. - - - The [Font Awesome](https://fontawesome.com/search?s=brands%2Cduotone) icon used to feature the anchor. - - Example: `comments` - - - The name of the anchor label. - - Example: `Community` - - - The start of the URL that marks what pages go in the anchor. Generally, this is the name of the folder you put your pages in. - - - The hex color of the anchor icon background. Can also be a gradient if you pass an object with the properties `from` and `to` that are each a hex color. - - - Used if you want to hide an anchor until the correct docs version is selected. - - - Pass `true` if you want to hide the anchor until you directly link someone to docs inside it. - - - One of: "brands", "duotone", "light", "sharp-solid", "solid", or "thin" - - - - - - - Override the default configurations for the top-most anchor. 
- - - The name of the top-most anchor - - - Font Awesome icon. - - - One of: "brands", "duotone", "light", "sharp-solid", "solid", or "thin" - - - - - - An array of navigational tabs. - - - The name of the tab label. - - - The start of the URL that marks what pages go in the tab. Generally, this - is the name of the folder you put your pages in. - - - - - - Configuration for API settings. Learn more about API pages at [API Components](/api-playground/demo). - - - The base url for all API endpoints. If `baseUrl` is an array, it will enable for multiple base url - options that the user can toggle. - - - - - - The authentication strategy used for all API endpoints. - - - The name of the authentication parameter used in the API playground. - - If method is `basic`, the format should be `[usernameName]:[passwordName]` - - - The default value that's designed to be a prefix for the authentication input field. - - E.g. If an `inputPrefix` of `AuthKey` would inherit the default input result of the authentication field as `AuthKey`. - - - - - - Configurations for the API playground - - - - Whether the playground is showing, hidden, or only displaying the endpoint with no added user interactivity `simple` - - Learn more at the [playground guides](/api-playground/demo) - - - - - - Enabling this flag ensures that key ordering in OpenAPI pages matches the key ordering defined in the OpenAPI file. - - This behavior will soon be enabled by default, at which point this field will be deprecated. - - - - - - - A string or an array of strings of URL(s) or relative path(s) pointing to your - OpenAPI file. - - Examples: - - ```json Absolute - "openapi": "https://example.com/openapi.json" - ``` - ```json Relative - "openapi": "/openapi.json" - ``` - ```json Multiple - "openapi": ["https://example.com/openapi1.json", "/openapi2.json", "/openapi3.json"] - ``` - - - - - - An object of social media accounts where the key:property pair represents the social media platform and the account url. - - Example: - ```json - { - "x": "https://x.com/mintlify", - "website": "https://mintlify.com" - } - ``` - - - One of the following values `website`, `facebook`, `x`, `discord`, `slack`, `github`, `linkedin`, `instagram`, `hacker-news` - - Example: `x` - - - The URL to the social platform. - - Example: `https://x.com/mintlify` - - - - - - Configurations to enable feedback buttons - - - - Enables a button to allow users to suggest edits via pull requests - - - Enables a button to allow users to raise an issue about the documentation - - - - - - Customize the dark mode toggle. - - - Set if you always want to show light or dark mode for new users. When not - set, we default to the same mode as the user's operating system. - - - Set to true to hide the dark/light mode toggle. You can combine `isHidden` with `default` to force your docs to only use light or dark mode. For example: - - - ```json Only Dark Mode - "modeToggle": { - "default": "dark", - "isHidden": true - } - ``` - - ```json Only Light Mode - "modeToggle": { - "default": "light", - "isHidden": true - } - ``` - - - - - - - - - A background image to be displayed behind every page. See example with - [Infisical](https://infisical.com/docs) and [FRPC](https://frpc.io). 
- diff --git a/evaluations/custom-evaluator-integration.mdx b/evaluations/custom-evaluator-integration.mdx new file mode 100644 index 000000000..7ef1b34c1 --- /dev/null +++ b/evaluations/custom-evaluator-integration.mdx @@ -0,0 +1,62 @@ +--- +title: Custom Evaluator Integration +--- + +If you have a custom evaluator built in-house which run on your own code, either during the LLM pipeline or after, you can still capture the evaluation results +and connect it back to the trace to visualize it together with the other LangWatch evaluators. + +import PythonCustomEvaluation from "/snippets/python-custom-evaluation.mdx" +import TypeScriptCustomEvaluation from "/snippets/typescript-custom-evaluation.mdx" + + + + +You can capture the evaluation results of your custom evaluator on the current trace or span by using the `.add_evaluation` method: + + + + + + +You can capture the evaluation results of your custom evaluator on the current trace or span by using the `.addEvaluation` method: + + + + + + +## REST API Specification + +### Endpoint + +`POST /api/collector` + +### Headers + +- `X-Auth-Token`: Your LangWatch API key. + +### Request Body + +```javascript +{ + "trace_id": "id of the message the evaluation was run on", + "evaluations": [{ + "evaluation_id": "evaluation-id-123", // optional unique id for identifying the evaluation, if not provided, a random id will be generated + "name": "custom evaluation", // required + "passed": true, // optional + "score": 0.5, // optional + "label": "category_detected", // optional + "details": "explanation of the evaluation results", // optional + "error": { // optional to capture error details in case evaluation had an error + "message": "error message", + "stacktrace": [], + }, + "timestamps": { // optional + "created_at": "1723411698506", // unix timestamp in milliseconds + "updated_at": "1723411698506" // unix timestamp in milliseconds + } + }] +} +``` + + \ No newline at end of file diff --git a/evaluations/overview.mdx b/evaluations/overview.mdx new file mode 100644 index 000000000..2ed1f9a90 --- /dev/null +++ b/evaluations/overview.mdx @@ -0,0 +1,26 @@ +--- +title: Evaluations +--- + +LangWatch offers an extensive library of evaluators to help you evaluate the quality and guarantee the safety of your LLM apps. +Those are very easy to set up on [LangWatch dashboard](https://app.langwatch.com/). + +![Evaluators](/images/screenshot-evaluators.png) + +## Evaluators List + +import EvaluatorsList from "/snippets/evaluators-list.mdx" + + + +## Custom Evaluator Integration + +If you have a custom evaluator built in-house, you can follow the guide below to integrate. + + + + diff --git a/favicon.svg b/favicon.svg index 6a3233265..51069fa13 100644 --- a/favicon.svg +++ b/favicon.svg @@ -1,49 +1,30 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/features/annotations.mdx b/features/annotations.mdx new file mode 100644 index 000000000..23bd4235a --- /dev/null +++ b/features/annotations.mdx @@ -0,0 +1,102 @@ +--- +title: Annotations +--- + +## Create annotations on messages + +With annotations, you can add additional information to messages. This can be useful to comment on or add any other information that you want to add to a message for further analysis. 
+ +We have also implemented the option to add a scoring system for each annotation. More information about this can be found in the [Annotation Scoring](/features/annotations#annotation-scoring) section. + +If you want to add an annotation to a queue, you can do so by clicking the add to queue button to send the messages to the queue for later analysis. You can create queues and add members to them on the main annotations page. More information about this can be found in the [Annotation Queues](/features/annotations#annotation-queues) section. + + + +#### Usage + +To create an annotation, follow these steps: + +1) Click on the message you want to annotate and a [Trace](/concepts#traces) details drawer will open. +2) On the top right, click the annotation button. +3) Here you will be able to add a comment, a link or any other information that you want to add to the message. + + +LangWatch + + +Once you have created an annotation, you will see it next to the message. + + +LangWatch + + +# Annotation Scoring + +We have developed a customized scoring system for each annotation. To get started, you will need to create your scores on the settings page. + +There are two types of score data you can choose from: + +- **Checkbox**: To add multiple selectable options. +- **Multiple Choice**: To add a single selectable option. + + +LangWatch + +After you have created your scores, you can activate or deactivate them on the settings page. + +LangWatch + +Once your scores are activated, you will see them in the annotations tab. For each annotation you create, the score options will be available, allowing you to add more detailed information to your annotations. +When annotating a message, you will see the score options below the comment input. Once you have added a score, you will be asked for an optional reason for the score. + +
+ LangWatch + LangWatch +
+ +That's it! You can now annotate messages and add your custom score metrics to them. + + + +# Annotation Queues + +To get started with annotation queues, follow these steps: + +1) Go to the annotations page. +2) Click the plus button to create a new queue. +3) Add a name for your queue, a description and members, and click the "Save" button. + +LangWatch + +Once you have created your queue, you will be able to select it when creating an annotation and send the messages to the queue or directly to a project member for later analysis. + +LangWatch + +Once you add an item to the queue, you can view it in the annotations section, whether it's in a queue or sent directly to you. + +LangWatch + +When clicking on a queue item, you will be directed to the message where you can add an annotation. Once happy with your annotation, you can click on the "Done" button and move on to the next item. + +LangWatch + +Once you've completed the final item in the queue, you'll see that all tasks are done. That's it! Happy annotating! + +LangWatch + + + + + + + + + + + + + + + + + diff --git a/features/batch-evaluations.mdx b/features/batch-evaluations.mdx new file mode 100644 index 000000000..52513e015 --- /dev/null +++ b/features/batch-evaluations.mdx @@ -0,0 +1,43 @@ +--- +title: Batch Evaluations +--- + +If you intend to conduct batch evaluations on the datasets you've created in LangWatch, we offer a Python SDK to facilitate this process. This guide provides instructions on leveraging our Python SDK to execute batch evaluations effectively. + +### Usage + +After adding records to your dataset, created within the dataset section of LangWatch, you can proceed to select the dataset for batch evaluation along with the desired evaluations. You have the option to choose from predefined evaluations or any custom evaluations you've set up in the Evaluation and Guardrails section of LangWatch. + +### Screenshot examples + +In the screenshot below you will see the datasets section in LangWatch, where you can get your batch evaluation Python snippet by clicking on the Batch Evaluation button. + +LangWatch + +In the next screenshot you can select the dataset you want to evaluate as well as the evaluations you would like to run. Each tab has different evaluations to choose from. + +LangWatch + +In the screenshot below, you'll find a Python code snippet ready for execution to perform your batch processing. The parameters passed into the `BatchEvaluation` include your chosen dataset and an array of selected evaluations to run against it. + +LangWatch + +We've streamlined the process by setting up pandas for you, enabling seamless evaluation of datasets directly on the results object. This means you can leverage the power of pandas' data manipulation and analysis capabilities effortlessly within your evaluation workflow. With pandas at your disposal, you can efficiently explore, analyze, and manipulate your data to derive valuable insights without the need for additional setup or configuration. + +### Python snippet + +When executing the snippet, you'll have a callback function at your disposal. This function contains the original entry data, allowing you to run it against your own Large Language Model (LLM). You can use this response to compare results within your evaluation process. + +Ensure that you return the `output`, as some evaluations may require it.
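For illustration, the generated snippet follows roughly the shape sketched below. Treat this as a rough sketch only: the import path, dataset slug, evaluation names and the `entry` field access are placeholder assumptions, and the snippet generated for you in the dashboard is the source of truth.

```python
from langwatch.batch_evaluation import BatchEvaluation, DatasetEntry  # assumed import path


def my_llm_pipeline(question: str) -> str:
    # Placeholder for your own LLM call or pipeline
    return "the generated answer"


def callback(entry: DatasetEntry):
    # The entry carries the original dataset record; run it against your own LLM
    response = my_llm_pipeline(entry.input)  # field names follow your dataset columns
    # Return the output, as some evaluations may require it
    return {"output": response}


# The parameters are the dataset you created and the evaluations selected in the drawer
evaluation = BatchEvaluation(
    dataset="my-dataset",
    evaluations=["ragas/faithfulness"],
    callback=callback,
)

results = evaluation.run()  # pandas is already set up, so you can analyze the results object directly
```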
As you create your code snippet in the evaluations tab, you'll see indications of which evaluations require particular information. Use this guidance as a reference to kickstart your workflow. diff --git a/features/datasets.mdx b/features/datasets.mdx new file mode 100644 index 000000000..c8daee73f --- /dev/null +++ b/features/datasets.mdx @@ -0,0 +1,68 @@ +--- +title: Datasets +--- + +## Create datasets + +LangWatch offers you the possibility to create datasets from your LLM messages. These datasets can be used to train your own models or to do further analysis on the data. +We offer the possibility to create datasets with the following data types: + +- **Input**: The message input string. +- **Expected Output**: The gold-standard expected output for the given input, + useful for output-comparison metrics +- **Contexts**: The contexts provided if you are doing RAG, useful + for RAG-metric evaluations +- **[Spans](/concepts#spans)**: A JSON with all the spans contained in the message + trace, that is, all the steps in your pipeline, for + more complex evaluations +- **LLM Input**: The input the LLM received, in LLM chat history JSON + format +- **Expected LLM Output**: The gold-standard expected output for the given input, + in LLM chat history JSON format. +- **Annotation Scores**: The scores of the annotations, useful for annotation-comparison metrics +- **Evaluation Metrics**: The evaluation metrics for the dataset, useful for evaluation-comparison metrics + +#### Usage + +To create a dataset, simply go to the datasets page and click the "Create New Dataset" button. You will be able to select the type of dataset you want as well as the columns you want to include. + +LangWatch + +There are a couple of ways to add data to a dataset: + +- **Manually**: You can add data on a per-message basis. +- **Group selection**: You can fill the dataset by selecting a group of messages. +- **CSV Upload**: You can fill the dataset by uploading a CSV file. + +### Manually + +To add data manually, click the "Add to Dataset" button on the messages page after selecting a message. You will then be able to choose the dataset type and preview the data that will be added. + +LangWatch + +### Group selection + +To add data by selecting a group, simply click the "Add to Dataset" button after choosing the desired messages in the table view. You'll then be able to select the type of dataset you wish to add to and preview the data that will be included. + +LangWatch +### CSV Upload + +To add data by CSV upload, go to your datasets page and select the dataset you want to update. Click the "Upload CSV" button and upload your CSV file. You can then map the columns from your CSV file to the appropriate fields in the dataset based on the dataset type. + +LangWatch diff --git a/features/embedded-analytics.mdx b/features/embedded-analytics.mdx new file mode 100644 index 000000000..22a9aca3a --- /dev/null +++ b/features/embedded-analytics.mdx @@ -0,0 +1,62 @@ +--- +title: Embedded Analytics +--- + +## Export Analytics with REST Endpoint + +LangWatch offers you the possibility to build and integrate LangWatch graphs into your own systems and applications, to display them to your customers in another interface. + +On the LangWatch dashboard, you can use our powerful custom chart builder tool to plot any data collected and generated by LangWatch, and customize the way you want to display it. You can then use our REST API to fetch the graph data.
+ +**Usage:** +You will need to obtain your JSON payload from the custom graph section in our application. You can find this on the Analytics page > Custom Reports > Add chart. + + 1. Pick the custom graph you want to get the analytics for. + 2. Prepare your JSON data. Make sure it's in the same format that is shown in the LangWatch application. + 3. Use the `curl` command to get your analytics data. Here is a basic template: + +```bash +# Set your API key and endpoint URL +API_KEY="your_langwatch_api_key" +ENDPOINT="https://app.langwatch.ai/api/analytics" + +# Use curl to send the POST request, e.g.: +curl -X POST "$ENDPOINT" \ + -H "X-Auth-Token: $API_KEY" \ + -H "Content-Type: application/json" \ + -d @- < + +Within this modal, you'll find the JSON payload required for the precise custom analytics +data. Simply copy this payload and paste it into the body of your REST POST request. + +LangWatch +Now you're fully prepared to access your customized analytics and seamlessly integrate +them into your specific use cases. + +If you encounter any hurdles or have questions, our support team is eager to assist you. diff --git a/features/triggers.mdx b/features/triggers.mdx new file mode 100644 index 000000000..1a3c18dd3 --- /dev/null +++ b/features/triggers.mdx @@ -0,0 +1,38 @@ +--- +title: Triggers +--- + +## Create triggers based on LangWatch filters + +LangWatch offers you the possibility to create triggers based on your selected filters. You can use these triggers to send notifications to either Slack or selected team email addresses. + +#### Usage + +To create a trigger in the LangWatch dashboard, follow these steps: + +- Click the filter button located at the top right of the LangWatch dashboard. +- After creating a filter, a trigger button will appear. +- Click the trigger button to open a popout drawer. +- In the drawer, you can configure your trigger with the desired settings. + +LangWatch +**Trigger actions** +LangWatch + +Once the trigger is created, you will receive an alert whenever a message meets the criteria of the trigger. These trigger checks run on the minute, not instantaneously, as the data needs time to be processed. You can find the created triggers under the Settings section, where you can deactivate or delete a trigger to stop receiving notifications. + +**Trigger settings** + +LangWatch diff --git a/guardrails/overview.mdx b/guardrails/overview.mdx new file mode 100644 index 000000000..5d329e80a --- /dev/null +++ b/guardrails/overview.mdx @@ -0,0 +1,13 @@ +--- +title: Overview +--- + +Learn how you can protect your LLM application from costly mistakes by setting up guardrails. + + + + diff --git a/guardrails/setting-up-guardrails.mdx b/guardrails/setting-up-guardrails.mdx new file mode 100644 index 000000000..2395f8a33 --- /dev/null +++ b/guardrails/setting-up-guardrails.mdx @@ -0,0 +1,27 @@ +--- +title: Setting Up Guardrails +--- + +Guardrails are protections you can add around your LLM calls: either before calling the LLM, for example to prevent jailbreaking; after calling the LLM, for example to verify that the generated output does not contain toxic language or leak PII; or to steer the LLM in a different direction, for example when detecting that a user is going off-topic or talking about competition, in which case you might want to send them into a different flow.
+ +Setting up Guardrails is quite easy, first, go to the Evaluation and Guardrails area on your [LangWatch dashboard](https://app.langwatch.ai), press + Add, and look for evaluators with the shield icon, those evaluators are the ones that support acting as Guardrails: + +Guardrails + +Then, change the Execution Mode to "As a Guardrail", on the page itself, you will see the instructions on how to integrate the guardrail to your code, after following the instructions, don't forget to click "Save" to create the Guardrail before trying it out. + +Guardrails + +Back to the Guardrail setup, you can also try it out on the messages already on LangWatch, to verify if the Guardrail is working well, of it some adjustments are needed, using the Try it out section: + +Guardrails + +You are now ready to keep your LLM protected and steer the conversation in the right direction with LangWatch Guardrails! Follow the next guides for examples on how to use Guardrails for handling different situations, and more advanced use cases. + +## What's next? + +- (In progress) Using guardrails to prevent bad inputs from the LLM +- (In progress) Using guardrails to prevent bad outputs from the LLM to the user +- (In progress) Steering the conversation with another LLM call from the guardrail +- (In progress) Handling multiple guardrail calls in parallel +- (In progress) Speculative execution of the LLM in parallel to the guardrail call diff --git a/images/annotation-add-score.png b/images/annotation-add-score.png new file mode 100644 index 000000000..0c432dabc Binary files /dev/null and b/images/annotation-add-score.png differ diff --git a/images/annotation-add-to-queue.png b/images/annotation-add-to-queue.png new file mode 100644 index 000000000..a09b2124e Binary files /dev/null and b/images/annotation-add-to-queue.png differ diff --git a/images/annotation-add.png b/images/annotation-add.png new file mode 100644 index 000000000..31a77f7f2 Binary files /dev/null and b/images/annotation-add.png differ diff --git a/images/annotation-comment-view.png b/images/annotation-comment-view.png new file mode 100644 index 000000000..5d849e142 Binary files /dev/null and b/images/annotation-comment-view.png differ diff --git a/images/annotation-queue-items-complete.png b/images/annotation-queue-items-complete.png new file mode 100644 index 000000000..a40c06efc Binary files /dev/null and b/images/annotation-queue-items-complete.png differ diff --git a/images/annotation-queue-items.png b/images/annotation-queue-items.png new file mode 100644 index 000000000..61c4d8da3 Binary files /dev/null and b/images/annotation-queue-items.png differ diff --git a/images/annotation-queues.png b/images/annotation-queues.png new file mode 100644 index 000000000..2a0fabf45 Binary files /dev/null and b/images/annotation-queues.png differ diff --git a/images/annotation-score-reason.png b/images/annotation-score-reason.png new file mode 100644 index 000000000..3b0a9693d Binary files /dev/null and b/images/annotation-score-reason.png differ diff --git a/images/annotation-score-selection.png b/images/annotation-score-selection.png new file mode 100644 index 000000000..966b5e930 Binary files /dev/null and b/images/annotation-score-selection.png differ diff --git a/images/annotation-view-scores.png b/images/annotation-view-scores.png new file mode 100644 index 000000000..ac30e758f Binary files /dev/null and b/images/annotation-view-scores.png differ diff --git a/images/annotations-comment.png b/images/annotations-comment.png new file mode 100644 index 
000000000..02cc9d432 Binary files /dev/null and b/images/annotations-comment.png differ diff --git a/images/annotations-create-queue.png b/images/annotations-create-queue.png new file mode 100644 index 000000000..164a143f7 Binary files /dev/null and b/images/annotations-create-queue.png differ diff --git a/images/annotations-drawer.png b/images/annotations-drawer.png new file mode 100644 index 000000000..9bb093a9c Binary files /dev/null and b/images/annotations-drawer.png differ diff --git a/images/annotations-messages.png b/images/annotations-messages.png new file mode 100644 index 000000000..4f2865c8b Binary files /dev/null and b/images/annotations-messages.png differ diff --git a/images/annotations-tab.png b/images/annotations-tab.png new file mode 100644 index 000000000..ed484d33a Binary files /dev/null and b/images/annotations-tab.png differ diff --git a/images/annotations-trace.png b/images/annotations-trace.png new file mode 100644 index 000000000..3f73dc355 Binary files /dev/null and b/images/annotations-trace.png differ diff --git a/images/checks-passed.png b/images/checks-passed.png index 3303c7736..c229672c0 100644 Binary files a/images/checks-passed.png and b/images/checks-passed.png differ diff --git a/images/custom-events.png b/images/custom-events.png new file mode 100644 index 000000000..2fdba0372 Binary files /dev/null and b/images/custom-events.png differ diff --git a/images/dataset-screenshot-add.png b/images/dataset-screenshot-add.png new file mode 100644 index 000000000..d46b28932 Binary files /dev/null and b/images/dataset-screenshot-add.png differ diff --git a/images/dataset-screenshot-csv.png b/images/dataset-screenshot-csv.png new file mode 100644 index 000000000..697ab9c61 Binary files /dev/null and b/images/dataset-screenshot-csv.png differ diff --git a/images/dataset-screenshot-group.png b/images/dataset-screenshot-group.png new file mode 100644 index 000000000..a11fb603f Binary files /dev/null and b/images/dataset-screenshot-group.png differ diff --git a/images/dataset-screenshot-single.png b/images/dataset-screenshot-single.png new file mode 100644 index 000000000..9f1e28f2f Binary files /dev/null and b/images/dataset-screenshot-single.png differ diff --git a/images/dspy-visualizer.png b/images/dspy-visualizer.png new file mode 100644 index 000000000..6eb1c15f7 Binary files /dev/null and b/images/dspy-visualizer.png differ diff --git a/images/enable-guardrails.png b/images/enable-guardrails.png new file mode 100644 index 000000000..6f0ae6cd0 Binary files /dev/null and b/images/enable-guardrails.png differ diff --git a/images/favicon.ico b/images/favicon.ico new file mode 100644 index 000000000..33f5c9f00 Binary files /dev/null and b/images/favicon.ico differ diff --git a/images/guardrails-try-it-out.png b/images/guardrails-try-it-out.png new file mode 100644 index 000000000..712ebbf07 Binary files /dev/null and b/images/guardrails-try-it-out.png differ diff --git a/images/guardrails.png b/images/guardrails.png new file mode 100644 index 000000000..c099b8fb5 Binary files /dev/null and b/images/guardrails.png differ diff --git a/images/how-to-choose-enterprise.jpg b/images/how-to-choose-enterprise.jpg new file mode 100644 index 000000000..f0d996aef Binary files /dev/null and b/images/how-to-choose-enterprise.jpg differ diff --git a/images/how-to-choose-ragas.png b/images/how-to-choose-ragas.png new file mode 100644 index 000000000..03190fb37 Binary files /dev/null and b/images/how-to-choose-ragas.png differ diff --git a/images/how-to-choose-safeguards.jpg 
b/images/how-to-choose-safeguards.jpg new file mode 100644 index 000000000..22d10135b Binary files /dev/null and b/images/how-to-choose-safeguards.jpg differ diff --git a/images/integration/azure.png b/images/integration/azure.png new file mode 100644 index 000000000..1b6e5bd4b Binary files /dev/null and b/images/integration/azure.png differ diff --git a/images/integration/dspy.png b/images/integration/dspy.png new file mode 100644 index 000000000..d490b9d68 Binary files /dev/null and b/images/integration/dspy.png differ diff --git a/images/integration/flowise/flowise-1.png b/images/integration/flowise/flowise-1.png new file mode 100644 index 000000000..ceb0c17fa Binary files /dev/null and b/images/integration/flowise/flowise-1.png differ diff --git a/images/integration/flowise/flowise-2.png b/images/integration/flowise/flowise-2.png new file mode 100644 index 000000000..650167e36 Binary files /dev/null and b/images/integration/flowise/flowise-2.png differ diff --git a/images/integration/flowise/flowise-3.png b/images/integration/flowise/flowise-3.png new file mode 100644 index 000000000..b42a2394f Binary files /dev/null and b/images/integration/flowise/flowise-3.png differ diff --git a/images/integration/langchain-rag.png b/images/integration/langchain-rag.png new file mode 100644 index 000000000..0aad6d93d Binary files /dev/null and b/images/integration/langchain-rag.png differ diff --git a/images/integration/langchain.png b/images/integration/langchain.png new file mode 100644 index 000000000..99e2199c3 Binary files /dev/null and b/images/integration/langchain.png differ diff --git a/images/integration/langflow/langflow-1.png b/images/integration/langflow/langflow-1.png new file mode 100644 index 000000000..50f17c9fa Binary files /dev/null and b/images/integration/langflow/langflow-1.png differ diff --git a/images/integration/langflow/langflow-2.png b/images/integration/langflow/langflow-2.png new file mode 100644 index 000000000..8a0c33aa7 Binary files /dev/null and b/images/integration/langflow/langflow-2.png differ diff --git a/images/integration/langflow/langflow-code.png b/images/integration/langflow/langflow-code.png new file mode 100644 index 000000000..bc4a7fbf3 Binary files /dev/null and b/images/integration/langflow/langflow-code.png differ diff --git a/images/integration/langflow/langflow-langwatch-call.png b/images/integration/langflow/langflow-langwatch-call.png new file mode 100644 index 000000000..b0b4efcfa Binary files /dev/null and b/images/integration/langflow/langflow-langwatch-call.png differ diff --git a/images/integration/langflow/langwatch-message.png b/images/integration/langflow/langwatch-message.png new file mode 100644 index 000000000..8f61dd694 Binary files /dev/null and b/images/integration/langflow/langwatch-message.png differ diff --git a/images/integration/litellm.png b/images/integration/litellm.png new file mode 100644 index 000000000..bd2242177 Binary files /dev/null and b/images/integration/litellm.png differ diff --git a/images/integration/message-custom-input-output.png b/images/integration/message-custom-input-output.png new file mode 100644 index 000000000..4503e71bb Binary files /dev/null and b/images/integration/message-custom-input-output.png differ diff --git a/images/integration/message-raw-input-output.png b/images/integration/message-raw-input-output.png new file mode 100644 index 000000000..781148f96 Binary files /dev/null and b/images/integration/message-raw-input-output.png differ diff --git a/images/integration/openai.png 
b/images/integration/openai.png new file mode 100644 index 000000000..6ca1abcf7 Binary files /dev/null and b/images/integration/openai.png differ diff --git a/images/integration/opentelemetry/openinference-dspy.png b/images/integration/opentelemetry/openinference-dspy.png new file mode 100644 index 000000000..5458501ed Binary files /dev/null and b/images/integration/opentelemetry/openinference-dspy.png differ diff --git a/images/integration/opentelemetry/openinference-haystack.png b/images/integration/opentelemetry/openinference-haystack.png new file mode 100644 index 000000000..a14da21f6 Binary files /dev/null and b/images/integration/opentelemetry/openinference-haystack.png differ diff --git a/images/integration/opentelemetry/openinference-langchain.png b/images/integration/opentelemetry/openinference-langchain.png new file mode 100644 index 000000000..9ce157737 Binary files /dev/null and b/images/integration/opentelemetry/openinference-langchain.png differ diff --git a/images/integration/opentelemetry/openinference-openai.png b/images/integration/opentelemetry/openinference-openai.png new file mode 100644 index 000000000..93955e9f0 Binary files /dev/null and b/images/integration/opentelemetry/openinference-openai.png differ diff --git a/images/integration/opentelemetry/openllmetry-anthropic.png b/images/integration/opentelemetry/openllmetry-anthropic.png new file mode 100644 index 000000000..96fbc54b1 Binary files /dev/null and b/images/integration/opentelemetry/openllmetry-anthropic.png differ diff --git a/images/integration/opentelemetry/openllmetry-langchain.png b/images/integration/opentelemetry/openllmetry-langchain.png new file mode 100644 index 000000000..a14998456 Binary files /dev/null and b/images/integration/opentelemetry/openllmetry-langchain.png differ diff --git a/images/integration/opentelemetry/openllmetry-openai.png b/images/integration/opentelemetry/openllmetry-openai.png new file mode 100644 index 000000000..ebfda2fe9 Binary files /dev/null and b/images/integration/opentelemetry/openllmetry-openai.png differ diff --git a/images/integration/rag.png b/images/integration/rag.png new file mode 100644 index 000000000..9cdf06283 Binary files /dev/null and b/images/integration/rag.png differ diff --git a/images/integration/vercel-ai-sdk.png b/images/integration/vercel-ai-sdk.png new file mode 100644 index 000000000..6277751c5 Binary files /dev/null and b/images/integration/vercel-ai-sdk.png differ diff --git a/images/langwatch-architecture.png b/images/langwatch-architecture.png new file mode 100644 index 000000000..1fcce2bdc Binary files /dev/null and b/images/langwatch-architecture.png differ diff --git a/images/onprem-clouds.png b/images/onprem-clouds.png new file mode 100644 index 000000000..5b4d62f12 Binary files /dev/null and b/images/onprem-clouds.png differ diff --git a/images/onprem-logo.png b/images/onprem-logo.png new file mode 100644 index 000000000..d1a984680 Binary files /dev/null and b/images/onprem-logo.png differ diff --git a/images/screenshot-batch-evaluation-drawer.png b/images/screenshot-batch-evaluation-drawer.png new file mode 100644 index 000000000..90c5dd542 Binary files /dev/null and b/images/screenshot-batch-evaluation-drawer.png differ diff --git a/images/screenshot-batch-evaluation-python.png b/images/screenshot-batch-evaluation-python.png new file mode 100644 index 000000000..2285ab85a Binary files /dev/null and b/images/screenshot-batch-evaluation-python.png differ diff --git a/images/screenshot-dashboard.png b/images/screenshot-dashboard.png 
new file mode 100644 index 000000000..5f5551e16 Binary files /dev/null and b/images/screenshot-dashboard.png differ diff --git a/images/screenshot-datasets-page.png b/images/screenshot-datasets-page.png new file mode 100644 index 000000000..74f892b69 Binary files /dev/null and b/images/screenshot-datasets-page.png differ diff --git a/images/screenshot-dspy-llm-calls.png b/images/screenshot-dspy-llm-calls.png new file mode 100644 index 000000000..c9642f35a Binary files /dev/null and b/images/screenshot-dspy-llm-calls.png differ diff --git a/images/screenshot-evaluators.png b/images/screenshot-evaluators.png new file mode 100644 index 000000000..e969bd946 Binary files /dev/null and b/images/screenshot-evaluators.png differ diff --git a/images/screenshot-json-modal.png b/images/screenshot-json-modal.png new file mode 100644 index 000000000..723132b96 Binary files /dev/null and b/images/screenshot-json-modal.png differ diff --git a/images/screenshot-langwatch.png b/images/screenshot-langwatch.png new file mode 100644 index 000000000..864b9c67c Binary files /dev/null and b/images/screenshot-langwatch.png differ diff --git a/images/screenshot-messages.png b/images/screenshot-messages.png new file mode 100644 index 000000000..e4932c3dc Binary files /dev/null and b/images/screenshot-messages.png differ diff --git a/images/screenshot-rag-dspy-tutorial.png b/images/screenshot-rag-dspy-tutorial.png new file mode 100644 index 000000000..4000a1faf Binary files /dev/null and b/images/screenshot-rag-dspy-tutorial.png differ diff --git a/images/screenshot-show-json.png b/images/screenshot-show-json.png new file mode 100644 index 000000000..de6c58262 Binary files /dev/null and b/images/screenshot-show-json.png differ diff --git a/images/trigger-screenshot-button.png b/images/trigger-screenshot-button.png new file mode 100644 index 000000000..7d513a0a4 Binary files /dev/null and b/images/trigger-screenshot-button.png differ diff --git a/images/trigger-screenshot-drawer.png b/images/trigger-screenshot-drawer.png new file mode 100644 index 000000000..a03b24ee9 Binary files /dev/null and b/images/trigger-screenshot-drawer.png differ diff --git a/images/trigger-screenshot-settings.png b/images/trigger-screenshot-settings.png new file mode 100644 index 000000000..b96837dfd Binary files /dev/null and b/images/trigger-screenshot-settings.png differ diff --git a/integration/cookbooks.mdx b/integration/cookbooks.mdx new file mode 100644 index 000000000..47cb85854 --- /dev/null +++ b/integration/cookbooks.mdx @@ -0,0 +1,44 @@ +Below are some examples for integrating LangWatch into your project. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +## OpenTelemetry + + + + + + + + + + + \ No newline at end of file diff --git a/integration/flowise.mdx b/integration/flowise.mdx new file mode 100644 index 000000000..689179950 --- /dev/null +++ b/integration/flowise.mdx @@ -0,0 +1,26 @@ +--- +title: Flowise Integration +--- + +[Flowise](https://flowiseai.com/) is a low-code tool for building LLM pipelines. If you are using Flowise, you can easily enable LangWatch from their UI for analytics, evaluations and much more. 
+ + + +[Create your LangWatch account](https://app.langwatch.ai/) and project to obtain your API Key from the dashboard. + + +At the top right corner of your Chatflow or Agentflow, click Settings > Configuration. +![Flowise settings](/images/integration/flowise/flowise-1.png) + + +![Flowise analytics](/images/integration/flowise/flowise-2.png) + + +![Flowise add integration](/images/integration/flowise/flowise-3.png) + + +That's it! Now simply send a message to your agent or chatflow to see it on LangWatch and start monitoring. + + + +For more information, check out [Flowise docs](https://docs.flowiseai.com/using-flowise/analytic). \ No newline at end of file diff --git a/integration/langflow.mdx b/integration/langflow.mdx new file mode 100644 index 000000000..af828fd89 --- /dev/null +++ b/integration/langflow.mdx @@ -0,0 +1,81 @@ +--- +title: Langflow Integration +--- + +[Langflow](https://www.langflow.org/) is a low-code tool for building LLM pipelines. If you are using Langflow, you can easily enable LangWatch from their UI for analytics, evaluations and much more. + +## Setup + + + +[Create your LangWatch account](https://app.langwatch.com/) and project to obtain your API Key from the dashboard. + + +Add the following key to the Langflow .env file: +```bash +LANGWATCH_API_KEY="your-api-key" +``` +Or export it in your terminal: +```bash +export LANGWATCH_API_KEY="your-api-key" +``` + + +Restart Langflow using `langflow run --env-file .env`. + + +Run a message through your Langflow project and check the LangWatch dashboard for monitoring and observability. + +![Langflow project](/images/integration/langflow/langflow-1.png) + +That's it! You should now see your Langflow component traces on the LangWatch dashboard. + +![LangWatch results](/images/integration/langflow/langflow-2.png) + + + +## Defining custom input and output + +You can customize what LangWatch captures as the final input and output of your Langflow component for better observability. + +To do this, you can add these two lines of code to the execution function of any Langflow component: + +```python +import langwatch +langwatch.get_current_trace().update(input="The user input", output="My bot output") +``` + +You can do this by first clicking on the `<> Code` button in any appropriate component: + +![Langflow code button](/images/integration/langflow/langflow-code.png) + +Then scroll down to find the `def` responsible for execution of that component and paste the code above, mapping the variables as needed for your case: + +![Langflow code editor](/images/integration/langflow/langflow-langwatch-call.png) + +The message on LangWatch will render as you defined: + +![LangWatch message](/images/integration/langflow/langwatch-message.png) + + +## Capturing additional metadata + +You can also capture additional metadata from your Langflow component. This can be useful for capturing information about the user, the conversation, or any specific information from your system. + +Just like the input and output, you can capture metadata by updating the trace. Two very useful cases to capture, for example, are the user_id and the thread_id that groups messages from the same conversation, +but you can also capture any other information that you want to track. + +```python +import langwatch +langwatch.get_current_trace().update( + metadata={ + "user_id": self.sender_name, + "thread_id": self.session_id, + # any other metadata you want + } +) +``` + +--- + +For more information, check out [Langflow docs](https://docs.langflow.org/).
\ No newline at end of file diff --git a/integration/mcp.mdx b/integration/mcp.mdx new file mode 100644 index 000000000..bf49a6526 --- /dev/null +++ b/integration/mcp.mdx @@ -0,0 +1,78 @@ +--- +title: LangWatch MCP Server +sidebarTitle: LangWatch MCP +--- + +The [LangWatch MCP Server](https://www.npmjs.com/package/@langwatch/mcp-server) is a tool designed to aid finding, searching, and looking up LLM traces from the LangWatch platform via the [Model Context Protocol](https://modelcontextprotocol.io/introduction). + +This server facilitates LLM development by allowing the agent to search for traces, understand all the steps in between a problematic output and try to fix the issue. + + + +## Setup in your Codebase + +Check out [the integration guides](/integration/overview) to start tracking your agents so both you and Cursor/Windsurf/Claude Code or your favorite coding assistant can debug it. + +## Setup in Cursor ๐Ÿ‘ฉโ€๐Ÿ’ป + +1. Navigate to the Cursor Settings +2. Navigate to the MCP item in the sidebar +3. Set the "name" as "LangWatch" +4. Set the "type" to `command` +5. Set the "command" to `npx -y @langwatch/mcp-server --apiKey=sk-lw-...` +- `--apiKey`: Your LangWatch API key. This is mandatory and must be provided. +- `--endpoint`: *Optional* The endpoint for the LangWatch API. Defaults to `https://app.langwatch.ai` if not specified. + +> [!TIP] +> To aid in securing your keys, the MCP will first look at the global system environment variables `LANGWATCH_API_KEY` and `LANGWATCH_ENDPOINT` to check if they have values as well as looking at arguments passed into the server on start. + +LangWatch MCP Setup + +## Tools + +The MCP Server provides the following tools: + +### `get_latest_traces` + +- **Description:** Retrieves the latest LLM traces. +- **Parameters:** + - `pageOffset` (optional): The page offset for pagination. + - `daysBackToSearch` (optional): The number of days back to search for traces. Defaults to 1. + +### `get_trace_by_id` + +- **Description:** Retrieves a specific LLM trace by its ID. +- **Parameters:** + - `id`: The ID of the trace to retrieve. + +## Usage in Cursor + +To use these tools within Cursor, follow these steps: + +1. **Open the Cursor Chat view:** + - `Cmd + I` + +2. **Ensure the MCP server is running:** + +3. **Interact with your Agent:** + - Ask a question like the following to test the tools are accessible: *Note: When the tool is detected, you'll need to run `Run tool` in the chat view for it to be called. + +> "I just ran into an issue while debugging, can you check the latest traces and fix it?" + +LangWatch MCP Example + + +## ๐Ÿ›Ÿ Support + +If you have questions or need help, join our community: + +- [Discord Community](https://discord.gg/kT4PhDS2gH) +- [Email Support](mailto:support@langwatch.ai) diff --git a/integration/opentelemetry/guide.mdx b/integration/opentelemetry/guide.mdx new file mode 100644 index 000000000..ca3de89fc --- /dev/null +++ b/integration/opentelemetry/guide.mdx @@ -0,0 +1,284 @@ +--- +title: OpenTelemetry Integration Guide +sidebarTitle: Guide +--- + +import OpenInferenceMetadata from "/snippets/openinference-metadata.mdx"; + +OpenTelemetry is a standard protocol for tracing, and LangWatch is fully compatible with OpenTelemetry, you can use any OpenTelemetry compatible library to capture your LLM traces and send them to LangWatch. + +This guide demonstrates the OpenTelemetry integration using Python, but the same principles apply to integration with OpenTelemetry instrumentation in other languages. 
+ +import LLMsTxtProtip from "/snippets/llms-txt-protip.mdx"; + + + +#### Prerequisites + +- Obtain your `LANGWATCH_API_KEY` from the [LangWatch dashboard](https://app.langwatch.com/). + +#### Installation + +```bash +pip install opentelemetry +``` + +#### Configuration + +Set up LangWatch as the OpenTelemetry exporter endpoint: + +```python +import os +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk import trace as trace_sdk +from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor + +# Set up OpenTelemetry trace provider with LangWatch as the endpoint +tracer_provider = trace_sdk.TracerProvider() +tracer_provider.add_span_processor( + SimpleSpanProcessor( + OTLPSpanExporter( + endpoint="https://app.langwatch.ai/api/otel/v1/traces", + headers={"Authorization": "Bearer " + os.environ["LANGWATCH_API_KEY"]}, + ) + ) +) +# Optionally, you can also print the spans to the console. +tracer_provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter())) +``` + +## Capturing LLM Traces + +Currently, there are different open initiatives for LLM instrumentation libraries, here we show some examples on how to capture LLM traces with a couple of them. + + + + + + Installation: + + ```bash + pip install openinference-instrumentation-openai + ``` + + Then, instrument your OpenAI calls: + + ```python + from openinference.instrumentation.openai import OpenAIInstrumentor + + OpenAIInstrumentor().instrument(tracer_provider=tracer_provider) + ``` + + That's it! You can now see the traces for your OpenAI calls in the LangWatch dashboard: + + ![OpenInference OpenAI Spans](/images/integration/opentelemetry/openinference-openai.png) + + + + + Installation: + + ```bash + pip install openinference-instrumentation-bedrock + ``` + + Then, instrument your AWS calls: + + ```python + from openinference.instrumentation.bedrock import BedrockInstrumentor + + BedrockInstrumentor().instrument() + ``` + + That's it! You can now see the traces for your AWS Bedrock calls in the LangWatch dashboard. + + + + + Installation: + + ```bash + pip install openinference-instrumentation-dspy + ``` + + Then, instrument your DSPy calls: + + ```python + from openinference.instrumentation.dspy import DSPyInstrumentor + + DSPyInstrumentor().instrument(tracer_provider=tracer_provider) + ``` + + That's it! You can now see the traces for your DSPy calls in the LangWatch dashboard: + + ![OpenInference DSPy Spans](/images/integration/opentelemetry/openinference-dspy.png) + + + + + Installation: + + ```bash + pip install openinference-instrumentation-haystack + ``` + + Then, instrument your Haystack calls: + + ```python + from openinference.instrumentation.haystack import HaystackInstrumentor + + HaystackInstrumentor().instrument(tracer_provider=tracer_provider) + ``` + + That's it! You can now see the traces for your Haystack calls in the LangWatch dashboard: + + ![OpenInference Haystack Spans](/images/integration/opentelemetry/openinference-haystack.png) + + + + + Installation: + + ```bash + pip install openinference-instrumentation-langchain + ``` + + Then, instrument your LangChain calls: + + ```python + from openinference.instrumentation.langchain import LangChainInstrumentor + + LangChainInstrumentor().instrument(tracer_provider=tracer_provider) + ``` + + That's it! 
You can now see the traces for your LangChain calls in the LangWatch dashboard: + + ![OpenInference LangChain Spans](/images/integration/opentelemetry/openinference-langchain.png) + + + + + Installation: + + ```bash + pip install openinference-instrumentation-crewai openinference-instrumentation-langchain + ``` + + Then, instrument your LangChain calls. CrewAI uses LangChain under the hood, so we instrument both: + + ```python + from openinference.instrumentation.crewai import CrewAIInstrumentor + from openinference.instrumentation.langchain import LangChainInstrumentor + + CrewAIInstrumentor().instrument(tracer_provider=tracer_provider) + LangChainInstrumentor().instrument(tracer_provider=tracer_provider) + ``` + + That's it! You can now see the traces for your CrewAI calls in the LangWatch dashboard. + + + + + Installation: + + ```bash + pip install openinference-instrumentation-autogen + ``` + + **Note:** The Autogen integration is currently experimental and may have limitations or unexpected behavior. + + Then, instrument Autogen: + + ```python + from openinference.instrumentation.autogen import AutogenInstrumentor + + AutogenInstrumentor().instrument() + ``` + + That's it! You can now see the traces from inside Autogen in the LangWatch dashboard. + + + + + + + + + + Installation: + + ```bash + pip install opentelemetry-instrumentation-openai + ``` + + Then, instrument your OpenAI calls: + + ```python + from opentelemetry.instrumentation.openai import OpenAIInstrumentor + + OpenAIInstrumentor().instrument(tracer_provider=tracer_provider) + ``` + + That's it! You can now see the traces for your OpenAI calls in the LangWatch dashboard: + + ![OpenLLMetry OpenAI Spans](/images/integration/opentelemetry/openllmetry-openai.png) + + + Installation: + + ```bash + pip install opentelemetry-instrumentation-anthropic + ``` + + Then, instrument your Anthropic calls: + + ```python + from opentelemetry.instrumentation.anthropic import AnthropicInstrumentor + + AnthropicInstrumentor().instrument(tracer_provider=tracer_provider) + ``` + + That's it! You can now see the traces for your Anthropic calls in the LangWatch dashboard: + + ![OpenLLMetry Anthropic Spans](/images/integration/opentelemetry/openllmetry-anthropic.png) + + + Installation: + + ```bash + pip install opentelemetry-instrumentation-langchain + ``` + + Then, instrument your LangChain calls: + + ```python + from opentelemetry.instrumentation.langchain import LangchainInstrumentor + + LangchainInstrumentor().instrument(tracer_provider=tracer_provider) + ``` + + That's it! You can now see the traces for your LangChain calls in the LangWatch dashboard: + + ![OpenLLMetry LangChain Spans](/images/integration/opentelemetry/openllmetry-langchain.png) + + + Installation: + + ```bash + pip install opentelemetry-instrumentation-llamaindex + ``` + + Then, instrument your LlamaIndex calls: + + ```python + from opentelemetry.instrumentation.llamaindex import LlamaIndexInstrumentor + + LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider) + ``` + + That's it! You can now see the traces for your LlamaIndex calls in the LangWatch dashboard. + + + + + diff --git a/integration/overview.mdx b/integration/overview.mdx new file mode 100644 index 000000000..3a5795261 --- /dev/null +++ b/integration/overview.mdx @@ -0,0 +1,7 @@ +Integrating LangWatch into your projects is designed to be a straightforward process. 
Regardless of the language or LLM model you are using, you can set up LangWatch with minimal configuration and start gathering valuable insights into your LLM's performance and user interactions. + + + + + + diff --git a/integration/python/guide.mdx b/integration/python/guide.mdx new file mode 100644 index 000000000..3beb9ac01 --- /dev/null +++ b/integration/python/guide.mdx @@ -0,0 +1,317 @@ +--- +title: Python Integration Guide +sidebarTitle: Guide +--- + +
+
+ + LangWatch Python Repo + +
+ +
+ + LangWatch Python SDK version + +
+
+ +LangWatch library is the easiest way to integrate your Python application with LangWatch, the messages are synced on the background so it doesn't intercept or block your LLM calls. + +import LLMsTxtProtip from "/snippets/llms-txt-protip.mdx"; + + + +import Prerequisites from "/snippets/prerequests.mdx"; +import PythonRAGSpan from "/snippets/python-rag-span.mdx"; +import PythonLangChainRAG from "/snippets/python-langchain-rag.mdx"; + + + + +## Capturing Messages + +- Each message triggering your LLM pipeline as a whole is captured with a [Trace](/concepts#traces). +- A [Trace](/concepts#traces) contains multiple [Spans](/concepts#spans), which are the steps inside your pipeline. + - A span can be an LLM call, a database query for a RAG retrieval, or a simple function transformation. + - Different types of [Spans](/concepts#spans) capture different parameters. + - [Spans](/concepts#spans) can be nested to capture the pipeline structure. +- [Traces](/concepts#traces) can be grouped together on LangWatch Dashboard by having the same [`thread_id`](/concepts#threads) in their metadata, making the individual messages become part of a conversation. + - It is also recommended to provide the [`user_id`](/concepts#user-id) metadata to track user analytics. + +## Create a Trace + +To capture traces and spans, start by adding the `@langwatch.trace()` decorator to the function that starts your LLM pipeline. Here it is represented by the `main()` function, but it can be your endpoint call or your class method that triggers the whole generation. + +```python +import langwatch + +@langwatch.trace() +def main(): + ... +``` + +This is the main entry point for your trace, and all spans called from here will be collected automatically to LangWatch in the background. + + +On short-live environments like Lambdas or Serverless Functions, be sure to call
`langwatch.get_current_trace().send_spans()` before your trace function ends to wait for all pending requests to be sent before the runtime is destroyed. +
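As a minimal sketch (the Lambda handler shape here is only an illustration), flushing spans on a short-lived runtime can look like this:

```python
import langwatch


@langwatch.trace()
def handler(event, context):
    # ... run your LLM pipeline here ...
    response = {"statusCode": 200}

    # Flush pending spans before the short-lived runtime is frozen or destroyed
    langwatch.get_current_trace().send_spans()
    return response
```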
+ +## Capturing LLM Spans + +LangWatch provides some utilities to automatically capture spans for popular LLM frameworks. + + + +For OpenAI, you can use the `autotrack_openai_calls()` function to automatically capture LLM spans for OpenAI calls for the current trace. + +```python +import langwatch +from openai import OpenAI + +client = OpenAI() + +@langwatch.trace() +def main(): + langwatch.get_current_trace().autotrack_openai_calls(client) + ... +``` + +That's enough to have your OpenAI calls collected and visible on LangWatch dashboard: + +![OpenAI Spans](/images/integration/openai.png) + + +For Azure OpenAI, you can use the `autotrack_openai_calls()` function to automatically capture LLM spans for Azure OpenAI calls for the current trace. + +```python +import langwatch +from openai import AzureOpenAI + +client = AzureOpenAI() + +@langwatch.trace() +def main(): + langwatch.get_current_trace().autotrack_openai_calls(client) + ... +``` + +That's enough to have your Azure OpenAI calls collected and visible on LangWatch dashboard: + +![Azure OpenAI Spans](/images/integration/azure.png) + + +You can use [LiteLLM](https://github.com/BerriAI/litellm) to call OpenAI, Anthropic, Gemini, Groq Llama 3 and over 100+ LLM models. + +And for tracking it all with LangWatch, you can use the `autotrack_litellm_calls()` function to automatically capture LLM spans for LiteLLM calls for the current trace. + +```python +import langwatch +import litellm + +@langwatch.trace() +def main(): + langwatch.get_current_trace().autotrack_litellm_calls(litellm) + + response = litellm.completion( + ... + ) +``` + + +Since we patch the `completion` method of the `litellm` module, you must use `litellm.completion()` instead of just `completion()` when calling your LLM, otherwise LangWatch will not be able to capture the spans. + + +That's enough to have your LiteLLM calls collected and visible on LangWatch dashboard: + +![LiteLLM Spans](/images/integration/litellm.png) + + +[DSPy](https://github.com/stanfordnlp/dspy) is the LLM framework that automatically optimizes prompts, you can use LangWatch both for [visualizing](/dspy-visualization/quickstart) the +optimization process, and for tracking the calls during inference as this guide shows. + +To track DSPy programs, you can use the `autotrack_dspy()` function to automatically capture DSPy modules forward pass, retrievers and LLM calls for the current trace. + +```python +import langwatch +import dspy + +@langwatch.trace() +def main(): + langwatch.get_current_trace().autotrack_dspy() + + program = MyDspyProgram() + response = program( + ... + ) +``` + +That's enough to have your DSPy traces collected and visible on LangWatch dashboard: + +![DSPy Spans](/images/integration/dspy.png) + + +For LangChain, you can automatically capture every step of your chain as a span by getting a LangChain callback for the current trace with `get_langchain_callback()`. + +```python +import langwatch + +@langwatch.trace() +def main(): + ... + chain.invoke( + {"input": user_input}, + # Add the LangWatch callback when invoking your chain + {"callbacks": [langwatch.get_current_trace().get_langchain_callback()]}, + ) +``` + +That's enough to have your LangChain calls collected and visible on LangWatch dashboard: + +![LangChain Spans](/images/integration/langchain.png) + + + +Check out for more python integration examples on the [examples folder on our GitHub repo](https://github.com/langwatch/langwatch/tree/main/python-sdk/examples). 
+ +## Adding metadata + +You can add metadata to track the user_id and current conversation thread_id, this is highly recommended to unlock better conversation grouping and user analytics on LangWatch. + +```python +import langwatch + +@langwatch.trace() +def main(): + langwatch.get_current_trace().update(metadata={"user_id": "user_id", "thread_id": "thread_id"}) + ... +``` + +You can also add custom labels to your trace to help you better filter and group your traces, or even trigger specific evaluations and alerts. + +```python +import langwatch + +@langwatch.trace() +def main(): + langwatch.get_current_trace().update(metadata={"labels": ["production"]}) + ... +``` + +Check out the [reference](./reference#trace) to see all the available trace properties. + +## Changing the Message Input and Output + +By default, the main input and output of the trace displayed on LangWatch is captured from the arguments and return value of +the top-level decorated function and heuristics try to extract the human-readable message from it automatically. + +However, sometimes more complex structures are used and the messages might not end up very human-readable on LangWatch, for example: + +![Raw Input and Output](/images/integration/message-raw-input-output.png) + +To make the messages really easy to read in the list and through the whole conversation, you can manually set what +should the input and output of the trace be, by calling `.update(input=...)` and `.update(output=...)` on the current trace: + +```python +import langwatch + +@langwatch.trace() +def main(inputs): + # Update the input of the trace with the user message or any other human-readable text + langwatch.get_current_trace().update(input=inputs.question) + + ... + + # Then, before returning, update the output of the trace with final response + langwatch.get_current_trace().update(output=response) + + return response +``` + +This will make the messages on LangWatch look like this: + +![Custom Input and Output](/images/integration/message-custom-input-output.png) + +## Capturing a RAG span + +RAG is a combination of a retrieval and a generation step, LangWatch provides a special span type for RAG that captures both steps separately which allows to capture the `contexts` being used by the LLM on your pipeline. +By capturing the `contexts`, you unlock various uses of it on LangWatch, like RAG evaluators such as Faitfhfulness and Context Relevancy, and analytics on which documents are being used the most. + + + + + + + + + + +## Capturing other spans + +To be able to inspect and debug each step of your pipeline along with the LLM calls, you can use the `@langwatch.span()` decorator. You can pass in different `type`s to categorize your spans. + +```python +import langwatch + +@langwatch.span() +def database_query(): + ... + +@langwatch.span(type="tool") +def weather_forecast(city: str): + ... + +@langwatch.span(type="rag") +def rag_retrieval(): + ... + +# You can manually track llm calls too if the automatic capture is not enough for your use case +@langwatch.span(type="llm") +def llm_call(): + ... + +@langwatch.trace() +def main(): + ... +``` + +The input and output of the decorated function are automatically captured in the span, to disable that, you can set `capture_input` and `capture_output` to `False`: + +```python +@langwatch.span(capture_input=False, capture_output=False) +def database_query(): + ... 
+``` + +You can also modify the current spans attributes, either on the decorator by using `.update()` on the current span: + +```python +@langwatch.span(type="llm", name="custom_name") +def llm_call(): + langwatch.get_current_span().update(model="my-custom-model") + ... +``` + +Check out the [reference](./reference#span) to see all the available span properties. + +## Capturing custom evaluation results + +[LangWatch Evaluators](/evaluations/overview) can run automatically on your traces, but if you have an in-house custom evaluator, you can also capture the evaluation +results of your custom evaluator on the current trace or span by using the `.add_evaluation` method: + +import PythonCustomEvaluation from "/snippets/python-custom-evaluation.mdx" + + + +## Synchronizing your message IDs with LangWatch traces + +If you store the messages in a database on your side as well, you set the `trace_id` of the current trace to the same one of the message on your side, this way your system will be in sync with LangWatch traces, making it easier to investigate later on. + +```python +@langwatch.trace() +def main(): + ... + langwatch.get_current_trace().update(trace_id=message_id) + ... +``` \ No newline at end of file diff --git a/integration/python/reference.mdx b/integration/python/reference.mdx new file mode 100644 index 000000000..2e50cc8a6 --- /dev/null +++ b/integration/python/reference.mdx @@ -0,0 +1,125 @@ +--- +title: Python SDK Reference +sidebarTitle: Reference +--- + +This page contains the low-level reference for the Python SDK components, for guide on integrating LangWatch into your Python project, see [Python Integration Guide](/integration/python/guide). + +## Trace + +The trace is the basic unit of work in LangWatch. It is a collection of spans that are grouped together to form a single unit of work, you can create a trace in three manners: + +```python +import langwatch + +# As a decorator: +@langwatch.trace() +def my_function(): + pass + + +# As a context manager +with langwatch.trace(): + pass + + +# As a function +trace = langwatch.trace() +``` + +All three ways will create the same trace objects, but for the last one you manually need to call `trace.deferred_send_spans()` or `trace.send_spans()` to send the spans to the LangWatch API. + +The first two will also set the trace to the context, which you can retrieve by: + +``` +trace = langwatch.get_current_trace() +``` + +Both on the trace creation function and `.update()` you can set trace_id, metadata and api_key to be used by the trace. + +| Parameter | Type | Description | +| :-------- | :--- | :---------- | +| trace_id | `str` | The trace id to use for the trace, a random one is generated by default, but you can also pass your own to connect with your internal message id if you have it. | +| metadata | `dict` | The object holding metadata for the trace, it contains a few fields listed below. | +| metadata.user_id | `str` | The user id that is triggering the generation on your LLM pipeline | +| metadata.thread_id | `str` | A thread id can be used to virtually group together all the different traces in a single thread or workflow | +| metadata.labels | `list[str]` | A list of labels to categorize the trace which you can use to filter on later on LangWatch dashboard, trigger evaluations and alerts | +| api_key | `str` | The api key to use for the trace, can be set to override the LANGWATCH_API_KEY environment variable. | + + +## Span + +A Span is a single unit of work in a trace, it is the smallest unit of work in LangWatch. 
Similar to traces, you can create it in three different manners: + +```python +import langwatch + +# As a decorator +@langwatch.span() +def my_function(): + pass + +# As a context manager +with langwatch.span(): + pass + +# As a function +span = langwatch.span() +``` + +All three ways will create the same span objects, but for the last one you need to manually end the span by calling `span.end()`, which may also take parameters for updating the span data: + +```python +span.end(output="sunny") +``` + +The first two will also set the span to the context, which you can retrieve by: + +``` +span = langwatch.get_current_span() +``` + +By default, when the span is created it becomes the child of the current span in context, but you can also explicitly create a children span from a trace or from another span by initiating them from the parent, for example: + +```python +trace = langwatch.trace() # or langwatch.get_current_trace() + +# Direct child of the trace +span = trace.span(name="child") + +# Child of another span, granchild of the trace +subspan = span.span(name="grandchild") + +subspan.end() +span.end() + +trace.deferred_send_spans() +``` + +Both on the span creation function, `.update()` and `.end()` functions you can set span parameters: + +| Parameter | Type | Description | +| :-------- | :--- | :---------- | +| span_id | `str` | The span id to use for the span, a random one is generated by default. | +| name | `str` | The name of the span, automatically inferred from the function when using the `@langwatch.span()` decorator. | +| type | `"span" \| "rag" \| "llm" \| "chain" \| "tool" \| "agent" \| "guardrail"` | The type of the span, defaults to `span`, with `rag` and `llm` spans allowing for some extra parameters. | +| parent | `ContextSpan` | The parent span to use for the span, if not set, the current span in context is used as the parent. | +| capture_input | `bool` | Available only on the `@langwatch.span()` decorator, whether to capture the input of the function, defaults to `True`. | +| capture_output | `bool` | Available only on the `@langwatch.span()` decorator, whether to capture the output of the function, defaults to `True`. | +| input | `str \| list[ChatMessage] \| SpanInputOutput` | The span input, it can be either a string, or a list of OpenAI-compatible chat messages format dicts, or a `SpanInputOutput` object, which captures other generic types such as `{ "type": "json", "value": {...} }`. | +| output | `str \| list[ChatMessage] \| SpanInputOutput` | The span output, it can be either a string, or a list of OpenAI-compatible chat messages format dicts, or a `SpanInputOutput` object, which captures other generic types such as `{ "type": "json", "value": {...} }`. | +| error | `Exception` | The error that occurred during the function execution, if any. It is automatically captured with the `@langwatch.span()` decorator and context manager. | +| timestamps | `SpanTimestamps` | The timestamps of the span, tracked by default when using the `@langwatch.span()` decorator and context manager. | +| timestamps.started_at | `int` | The start time of the span in milliseconds, the current time is used by default when the span starts. | +| timestamps.first_token_at | `int` | The time when the first token was generated in milliseconds, automatically tracked for streaming LLMs when using framework integrations. | +| timestamps.finished_at | `int` | The time when the span finished in milliseconds, the current time is used by default when the span ends. 
| +| contexts | `list[str] \| list[RAGChunk]` | **RAG only:** The list of contexts retrieved by the RAG, manually captured to be used later as the context source for RAG evaluators. Check out the [Capturing a RAG Span](/integration/python/guide#capturing-a-rag-span) guide for more information. | +| model | `str` | **LLM only:** The model used for the LLM in the `"vendor/model"` format (e.g. `"openai/gpt-3.5-turbo"`), automatically captured when using framework integrations, otherwise important to manually set it for correct tokens and costs tracking. | +| params | `LLMSpanParams` | **LLM only:** The parameters used by the LLM call, automatically captured when using framework integrations | +| params.temperature | `float` | **LLM only:** The temperature used for the LLM | +| params.stream | `bool` | **LLM only:** Whether the LLM is streaming or not | +| params.tools | `list[dict]` | **LLM only:** OpenAI-compatible tools list available to the LLM | +| params.tool_choice | `str` | **LLM only:** The OpenAI-compatible tool_choice setting for the LLM | +| metrics | `LLMSpanMetrics` | **LLM only:** The metrics of the LLM span, automatically captured when using framework integrations | +| metrics.prompt_tokens | `int` | **LLM only:** The number of prompt tokens used by the LLM | +| metrics.completion_tokens | `int` | **LLM only:** The number of completion tokens used by the LLM | \ No newline at end of file diff --git a/integration/rags-context-tracking.mdx b/integration/rags-context-tracking.mdx new file mode 100644 index 000000000..fdb814f4c --- /dev/null +++ b/integration/rags-context-tracking.mdx @@ -0,0 +1,107 @@ +--- +title: "RAG Context Tracking" +--- + +Retrieval Augmented Generation (RAG) is a common way to augment the generation of your LLM by retrieving a set of documents based on the user query and giving them to the LLM to use as context for answering, whether by using a vector database, getting responses from an API, or integrating agent files and memory. + +It can be challenging, however, to build a good quality RAG pipeline: making sure the right data was retrieved, preventing the LLM from hallucinating, monitoring which documents are used the most, and iterating to keep improving it. This is where integrating with LangWatch can help: by integrating your RAG you unlock a series of Guardrails, Measurements and Analytics for RAGs on LangWatch. + +import PythonLangChainRAG from "/snippets/python-langchain-rag.mdx"; +import PythonRAGSpan from "/snippets/python-rag-span.mdx"; +import TypeScriptRAG from "/snippets/typescript-rag.mdx"; + + + + + + + + + + + + +To track the RAG context when using the REST API, add a new span of type `rag`; you may also make the LLM generation a child of it: + +```bash +curl -X POST "https://app.langwatch.ai/api/collector" \ + -H "X-Auth-Token: $API_KEY" \ + -H "Content-Type: application/json" \ + -d @- < + diff --git a/integration/rest-api.mdx b/integration/rest-api.mdx new file mode 100644 index 000000000..7d51bdd0b --- /dev/null +++ b/integration/rest-api.mdx @@ -0,0 +1,104 @@ +--- +title: REST API Integration +--- + +If your preferred programming language or platform is not directly supported by the existing LangWatch libraries, you can use the REST API with `curl` to send trace data. This guide will walk you through how to integrate LangWatch with any system that allows HTTP requests.
+ +import LLMsTxtProtip from "/snippets/llms-txt-protip.mdx"; + + + +**Prerequisites:** + +- Ensure you have `curl` installed on your system. + +**Configuration:** + +Set the `LANGWATCH_API_KEY` environment variable in your environment: + +```bash +export LANGWATCH_API_KEY='your_api_key_here' +``` + +**Usage:** + +You will need to prepare your span data in accordance with the Span type definitions provided by LangWatch. Below is an example of how to send span data using curl: + + 1. Prepare your JSON data. Make sure it's properly formatted as expected by LangWatch. + 2. Use the curl command to send your trace data. Here is a basic template: + +```bash +# Set your API key and endpoint URL +LANGWATCH_API_KEY="your_langwatch_api_key" +LANGWATCH_ENDPOINT="https://app.langwatch.ai" + +# Use curl to send the POST request, e.g.: +curl -X POST "$LANGWATCH_ENDPOINT/api/collector" \ + -H "X-Auth-Token: $LANGWATCH_API_KEY" \ + -H "Content-Type: application/json" \ + -d @- < +
+ [badges: LangWatch TypeScript Repo, LangWatch TypeScript SDK version]
+ + +LangWatch library is the easiest way to integrate your TypeScript application with LangWatch, the messages are synced on the background so it doesn't intercept or block your LLM calls. + +import LLMsTxtProtip from "/snippets/llms-txt-protip.mdx"; + + + +import Prerequisites from "/snippets/prerequests-ts.mdx"; + + + + +## Basic Concepts + +- Each message triggering your LLM pipeline as a whole is captured with a [Trace](/concepts#traces). +- A [Trace](/concepts#traces) contains multiple [Spans](/concepts#spans), which are the steps inside your pipeline. + - A span can be an LLM call, a database query for a RAG retrieval, or a simple function transformation. + - Different types of [Spans](/concepts#spans) capture different parameters. + - [Spans](/concepts#spans) can be nested to capture the pipeline structure. +- [Traces](/concepts#traces) can be grouped together on LangWatch Dashboard by having the same [`thread_id`](/concepts#threads) in their metadata, making the individual messages become part of a conversation. + - It is also recommended to provide the [`user_id`](/concepts#user-id) metadata to track user analytics. + + +## Integration + + + +The Vercel AI SDK supports tracing via Next.js OpenTelemetry integration. By using the `LangWatchExporter`, you can automatically collect those traces to LangWatch. + +First, you need to install the necessary dependencies: + +```bash +npm install @vercel/otel langwatch @opentelemetry/api-logs @opentelemetry/instrumentation @opentelemetry/sdk-logs +``` + +Then, set up the OpenTelemetry for your application, follow one of the tabs below depending whether you are using AI SDK with Next.js or on Node.js: + + + +You need to enable the `instrumentationHook` in your `next.config.js` file if you haven't already: + +```javascript +/** @type {import('next').NextConfig} */ +const nextConfig = { + experimental: { + instrumentationHook: true, + }, +}; + +module.exports = nextConfig; +``` + +Next, you need to create a file named `instrumentation.ts` (or `.js`) in the __root directory__ of the project (or inside `src` folder if using one), with `LangWatchExporter` as the traceExporter: + +```typescript +import { registerOTel } from '@vercel/otel' +import { LangWatchExporter } from 'langwatch' + +export function register() { + registerOTel({ + serviceName: 'next-app', + traceExporter: new LangWatchExporter({ + apiKey: process.env.LANGWATCH_API_KEY + }) + }) +} +``` + +(Read more about Next.js OpenTelemetry configuration [on the official guide](https://nextjs.org/docs/app/building-your-application/optimizing/open-telemetry#manual-opentelemetry-configuration)) + +Finally, enable `experimental_telemetry` tracking on the AI SDK calls you want to trace: + +```typescript +const result = await generateText({ + model: openai('gpt-4o-mini'), + prompt: 'Explain why a chicken would make a terrible astronaut, be creative and humorous about it.', + experimental_telemetry: { + isEnabled: true, + // optional metadata + metadata: { + userId: "myuser-123", + threadId: "mythread-123", + }, + }, +}); +``` + + +For Node.js, start by following the official OpenTelemetry guide: + +- [OpenTelemetry Node.js Getting Started](https://opentelemetry.io/docs/languages/js/getting-started/nodejs/) + +Once you have set up OpenTelemetry, you can use the `LangWatchExporter` to automatically send your traces to LangWatch: + +```typescript +import { LangWatchExporter } from 'langwatch' + +const sdk = new NodeSDK({ + traceExporter: new LangWatchExporter({ + apiKey: 
process.env.LANGWATCH_API_KEY + }), + // ... +}); +``` + + + +That's it! Your messages will now be visible on LangWatch: + +![Vercel AI SDK](/images/integration/vercel-ai-sdk.png) + +## Example Project + +You can find a full example project with a more complex pipeline and Vercel AI SDK and LangWatch integration [on our GitHub](https://github.com/langwatch/langwatch/blob/main/typescript-sdk/example/lib/chat/vercel-ai.tsx). + +## Manual Integration + +The docs from here below are for manual integration, in case you are not using the Vercel AI SDK OpenTelemetry integration, +you can manually start a trace to capture your messages: + +```typescript +import { LangWatch } from 'langwatch'; + +const langwatch = new LangWatch(); + +const trace = langwatch.getTrace({ + metadata: { threadId: "mythread-123", userId: "myuser-123" }, +}); +``` + +Then, you can start an LLM span inside the trace with the input about to be sent to the LLM. + +```typescript +import { convertFromVercelAIMessages } from 'langwatch' + +const span = trace.startLLMSpan({ + name: "llm", + model: model, + input: { + type: "chat_messages", + value: convertFromVercelAIMessages(messages) + }, +}); +``` + +This will capture the LLM input and register the time the call started. Once the LLM call is done, end the span to get the finish timestamp to be registered, and capture the output and the token metrics, which will be used for cost calculation, e.g.: + +```typescript +span.end({ + output: { + type: "chat_messages", + value: convertFromVercelAIMessages(output), // assuming output is Message[] + }, + metrics: { + promptTokens: chatCompletion.usage?.prompt_tokens, + completionTokens: chatCompletion.usage?.completion_tokens, + }, +}); +``` + + + +Start by initializing LangWatch client and creating a new trace to capture your messages: + +```typescript +import { LangWatch } from 'langwatch'; + +const langwatch = new LangWatch(); + +const trace = langwatch.getTrace({ + metadata: { threadId: "mythread-123", userId: "myuser-123" }, +}); +``` + +Then to capture your LLM calls, you can start an LLM span inside the trace with the input about to be sent to the LLM. + +First, define the model and the messages you are going to use for your LLM call separately, so you can capture them: + +```typescript +import { OpenAI } from "openai"; + +// Model to be used and messages that will be sent to the LLM +const model = "gpt-4o" +const messages : OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: "You are a helpful assistant." }, + { + role: "user", + content: "Write a tweet-size vegetarian lasagna recipe for 4 people.", + }, +] +``` + +Then, start the LLM span from the trace, giving it the model and input messages: + +```typescript +const span = trace.startLLMSpan({ + name: "llm", + model: model, + input: { + type: "chat_messages", + value: messages + }, +}); +``` + +This will capture the LLM input and register the time the call started. 
Now, continue with the LLM call normally, using the same parameters: + +```typescript +const openai = new OpenAI(); +const chatCompletion = await openai.chat.completions.create({ + messages: messages, + model: model, +}); +``` + +Finally, after the OpenAI call is done, end the span to get the finish timestamp to be registered, and capture the output and the token metrics, which will be used for cost calculation: + +```typescript +span.end({ + output: { + type: "chat_messages", + value: [chatCompletion.choices[0]!.message], + }, + metrics: { + promptTokens: chatCompletion.usage?.prompt_tokens, + completionTokens: chatCompletion.usage?.completion_tokens, + }, +}); +``` + + +Start by initializing LangWatch client and creating a new trace to capture your messages: + +```typescript +import { LangWatch } from 'langwatch'; + +const langwatch = new LangWatch(); + +const trace = langwatch.getTrace({ + metadata: { threadId: "mythread-123", userId: "myuser-123" }, +}); +``` + +Then to capture your LLM calls, you can start an LLM span inside the trace with the input about to be sent to the LLM. + +First, define the model and the messages you are going to use for your LLM call separately, so you can capture them: + +```typescript +import { AzureOpenAI } from "openai"; + +// Model to be used and messages that will be sent to the LLM +const model = "gpt-4-turbo-2024-04-09" +const messages : OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: "You are a helpful assistant." }, + { + role: "user", + content: "Write a tweet-size vegetarian lasagna recipe for 4 people.", + }, +] +``` + +Then, start the LLM span from the trace, giving it the model and input messages: + +```typescript +const span = trace.startLLMSpan({ + name: "llm", + model: model, + input: { + type: "chat_messages", + value: messages + }, +}); +``` + +This will capture the LLM input and register the time the call started. Now, continue with the LLM call normally, using the same parameters: + +```typescript +const openai = new AzureOpenAI({ + apiKey: process.env.AZURE_OPENAI_API_KEY, + apiVersion: "2024-02-01", + endpoint: process.env.AZURE_OPENAI_ENDPOINT, +}); +const chatCompletion = await openai.chat.completions.create({ + messages: messages, + model: model, +}); +``` + +Finally, after the OpenAI call is done, end the span to get the finish timestamp to be registered, and capture the output and the token metrics, which will be used for cost calculation: + +```typescript +span.end({ + output: { + type: "chat_messages", + value: [chatCompletion.choices[0]!.message], + }, + metrics: { + promptTokens: chatCompletion.usage?.prompt_tokens, + completionTokens: chatCompletion.usage?.completion_tokens, + }, +}); +``` + + +Start by initializing LangWatch client and creating a new trace to capture your chain: + +```typescript +import { LangWatch } from 'langwatch'; + +const langwatch = new LangWatch(); + +const trace = langwatch.getTrace({ + metadata: { threadId: "mythread-123", userId: "myuser-123" }, +}); +``` + +Then, to capture your LLM calls and all other chain steps, LangWatch provides a callback hook for LangChain.js that automatically tracks everything for you. 
+ +First, define your chain as you would normally do: + +```typescript +import { StringOutputParser } from '@langchain/core/output_parsers' +import { ChatPromptTemplate } from '@langchain/core/prompts' +import { ChatOpenAI } from '@langchain/openai' + +const prompt = ChatPromptTemplate.fromMessages([ + ['system', 'Translate the following from English into Italian'], + ['human', '{input}'] +]) +const model = new ChatOpenAI({ model: 'gpt-3.5-turbo' }) +const outputParser = new StringOutputParser() + +const chain = prompt.pipe(model).pipe(outputParser) +``` + +Now, when calling your chain either with `invoke` or `stream`, pass in `trace.getLangChainCallback()` as one of the callbacks: + +```typescript +const stream = await chain.stream( + { input: message }, + { callbacks: [trace.getLangChainCallback()] } +) +``` + +That's it! The full trace with all spans for each chain step will be sent automatically to LangWatch in the background on periodic intervals. After capturing your first LLM Span, go to [LangWatch Dashboard](https://app.langwatch.com), your message should be there! + + + + +On short-live environments like Lambdas or Serverless Functions, be sure to call
`await trace.sendSpans();` to wait for all pending requests to be sent before the runtime is destroyed. +
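
As an illustration only, here is a minimal sketch of such a short-lived handler, assuming a plain async function as the entry point and OpenAI as the LLM (adapt both to your own runtime); the LangWatch calls are the same ones shown earlier in this guide:

```typescript
import { LangWatch } from 'langwatch';
import { OpenAI } from 'openai';

const langwatch = new LangWatch();
const openai = new OpenAI();

// Hypothetical short-lived entry point (e.g. a serverless function handler)
export async function handler(userMessage: string) {
  const trace = langwatch.getTrace({
    metadata: { threadId: "mythread-123", userId: "myuser-123" },
  });

  const model = "gpt-4o-mini"; // assumption: any OpenAI chat model works here
  const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [
    { role: "system", content: "You are a helpful assistant." },
    { role: "user", content: userMessage },
  ];

  // Start the LLM span before the call, as shown in the sections above
  const span = trace.startLLMSpan({
    name: "llm",
    model: model,
    input: { type: "chat_messages", value: messages },
  });

  const chatCompletion = await openai.chat.completions.create({ model, messages });

  // End the span with the output and the token metrics
  span.end({
    output: {
      type: "chat_messages",
      value: [chatCompletion.choices[0]!.message],
    },
    metrics: {
      promptTokens: chatCompletion.usage?.prompt_tokens,
      completionTokens: chatCompletion.usage?.completion_tokens,
    },
  });

  // Flush all pending spans before the runtime is frozen or destroyed
  await trace.sendSpans();

  return chatCompletion.choices[0]!.message.content;
}
```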
+ +## Capture a RAG Span + +Appart from LLM spans, another very used type of span is the RAG span. This is used to capture the retrieved contexts from a RAG that will be used by the LLM, and enables a whole new set of RAG-based features evaluations for RAG quality on LangWatch. + +import TypeScriptRAG from "/snippets/typescript-rag.mdx"; + + + +## Capture an arbritary Span + +You can also use generic spans to capture any type of operation, its inputs and outputs, for example for a function call: + +```typescript +// before the function starts +const span = trace.startSpan({ + name: "weather_function", + input: { + type: "json", + value: { + city: "Tokyo", + }, + }, +}); + +// ...after the function ends +span.end({ + output: { + type: "json", + value: { + weather: "sunny", + }, + }, +}); +``` + +You can also nest spans one inside the other, capturing your pipeline structure, for example: + +```typescript +const span = trace.startSpan({ + name: "pipeline", +}); + +const nestedSpan = span.startSpan({ + name: "nested_pipeline", +}); + +nestedSpan.end() + +span.end() +``` + +Both LLM and RAG spans can also be nested like any arbritary span. + +## Capturing Exceptions + +To capture also when your code throws an exception, you can simply wrap your code around a try/catch, and update or end the span with the exception: + +```typescript +try { + throw new Error("unexpected error"); +} catch (error) { + span.end({ + error: error, + }); +} +``` + +## Capturing custom evaluation results + +[LangWatch Evaluators](/evaluations/overview) can run automatically on your traces, but if you have an in-house custom evaluator, you can also capture the evaluation +results of your custom evaluator on the current trace or span by using the `.addEvaluation` method: + +import TypeScriptCustomEvaluation from "/snippets/typescript-custom-evaluation.mdx" + + \ No newline at end of file diff --git a/introduction.mdx b/introduction.mdx index 2589c0b93..4924d7718 100644 --- a/introduction.mdx +++ b/introduction.mdx @@ -1,71 +1,31 @@ --- title: Introduction -description: 'Welcome to the home of your new documentation' --- -Hero Light -Hero Dark +Welcome to LangWatch, the all-in-one [open-source](https://github.com/langwatch/langwatch) LLMops platform. -## Setting up +LangWatch allows you to track, monitor, guardrail and evaluate your LLMs apps for measuring quality and alert on issues. -The first step to world-class documentation is setting up your editing environments. +For domain experts, it allows you to easily sift through conversations, see topics being discussed and annotate and score messages +for improvement in a collaborative manner with the development team. - - - Get your docs set up locally for easy development - - - Preview your changes before you push to make sure they're perfect - - +For developers, it allows you to debug, build datasets, prompt engineer on the playground and +run batch evaluations or [DSPy experiments](./dspy-visualization/quickstart) to continuously improve the product. -## Make it yours +Finally, for the business, it allows you to track conversation metrics and give full user and quality analytics, cost tracking, build +custom dashboards and even integrate it back on your own platform for reporting to your customers. -Update your docs to your brand and add valuable content for the best user conversion. 
+You can [sign up](https://app.langwatch.ai/) and already start the integration on our free tier by following the guides bellow: - - Customize your docs to your company's colors and brands - - - Automatically generate endpoints from an OpenAPI spec - - - Build interactive features and designs to guide your users - - - Check out our showcase of our favorite documentation - + + + + +You can also [open the demo project](https://app.langwatch.ai/demo) check out a [video](https://www.loom.com/share/17f827b1f5a648298779b36e2dc959e6) on our platform. + +## Get in touch + +Feel free to reach out to us directly at [support@langwatch.ai](mailto:support@langwatch.ai). You can also open a [GitHub issue](https://github.com/langwatch/langwatch/issues) +to report bugs and request features, or join our [Discord](https://discord.gg/kT4PhDS2gH) channel and ask questions directly for the community and the core team. \ No newline at end of file diff --git a/langevals/api-reference/endpoint/azure-jailbreak-detection.mdx b/langevals/api-reference/endpoint/azure-jailbreak-detection.mdx new file mode 100644 index 000000000..2671daf8f --- /dev/null +++ b/langevals/api-reference/endpoint/azure-jailbreak-detection.mdx @@ -0,0 +1,4 @@ +--- +title: 'Jailbreak Detection' +openapi: 'POST /azure/jailbreak/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/azure-prompt-injection-detection.mdx b/langevals/api-reference/endpoint/azure-prompt-injection-detection.mdx new file mode 100644 index 000000000..6aade1990 --- /dev/null +++ b/langevals/api-reference/endpoint/azure-prompt-injection-detection.mdx @@ -0,0 +1,4 @@ +--- +title: 'Prompt Injection Detection' +openapi: 'POST /azure/prompt_injection/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/competitor-blocklist.mdx b/langevals/api-reference/endpoint/competitor-blocklist.mdx new file mode 100644 index 000000000..a43bab0e3 --- /dev/null +++ b/langevals/api-reference/endpoint/competitor-blocklist.mdx @@ -0,0 +1,4 @@ +--- +title: 'Prompt Injection Detection' +openapi: 'POST /langevals/competitor_blocklist/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/competitor-detection-llm.mdx b/langevals/api-reference/endpoint/competitor-detection-llm.mdx new file mode 100644 index 000000000..05fd49ace --- /dev/null +++ b/langevals/api-reference/endpoint/competitor-detection-llm.mdx @@ -0,0 +1,4 @@ +--- +title: 'Competitor Detection with LLM' +openapi: 'POST /langevals/competitor_llm/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/content-safety.mdx b/langevals/api-reference/endpoint/content-safety.mdx new file mode 100644 index 000000000..a00118ce9 --- /dev/null +++ b/langevals/api-reference/endpoint/content-safety.mdx @@ -0,0 +1,4 @@ +--- +title: 'Content Safety' +openapi: 'POST /azure/content_safety/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/google-cloud-dlp-pii-detection.mdx b/langevals/api-reference/endpoint/google-cloud-dlp-pii-detection.mdx new file mode 100644 index 000000000..72051d17e --- /dev/null +++ b/langevals/api-reference/endpoint/google-cloud-dlp-pii-detection.mdx @@ -0,0 +1,4 @@ +--- +title: 'PII Detection' +openapi: 'POST /google_cloud/dlp_pii_detection/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/haystack-faithfulness.mdx b/langevals/api-reference/endpoint/haystack-faithfulness.mdx new file mode 100644 index 
000000000..b5a690b1e --- /dev/null +++ b/langevals/api-reference/endpoint/haystack-faithfulness.mdx @@ -0,0 +1,4 @@ +--- +title: 'Haystack Faithfulness' +openapi: 'POST /haystack/faithfulness/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/lingua-language-detection.mdx b/langevals/api-reference/endpoint/lingua-language-detection.mdx new file mode 100644 index 000000000..80e0623db --- /dev/null +++ b/langevals/api-reference/endpoint/lingua-language-detection.mdx @@ -0,0 +1,4 @@ +--- +title: 'Language Detection' +openapi: 'POST /lingua/language_detection/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/llama-guard.mdx b/langevals/api-reference/endpoint/llama-guard.mdx new file mode 100644 index 000000000..4610350c8 --- /dev/null +++ b/langevals/api-reference/endpoint/llama-guard.mdx @@ -0,0 +1,4 @@ +--- +title: 'Llama Guard' +openapi: 'POST /huggingface/llama_guard/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/llm-basic-evaluator.mdx b/langevals/api-reference/endpoint/llm-basic-evaluator.mdx new file mode 100644 index 000000000..c09844d24 --- /dev/null +++ b/langevals/api-reference/endpoint/llm-basic-evaluator.mdx @@ -0,0 +1,4 @@ +--- +title: 'LLM Basic Evaluator' +openapi: 'POST /langevals/basic/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/llm-boolean-evaluator.mdx b/langevals/api-reference/endpoint/llm-boolean-evaluator.mdx new file mode 100644 index 000000000..833f78b23 --- /dev/null +++ b/langevals/api-reference/endpoint/llm-boolean-evaluator.mdx @@ -0,0 +1,4 @@ +--- +title: 'LLM Boolean Evaluator' +openapi: 'POST /langevals/llm_boolean/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/llm-score-evaluator.mdx b/langevals/api-reference/endpoint/llm-score-evaluator.mdx new file mode 100644 index 000000000..63254acf5 --- /dev/null +++ b/langevals/api-reference/endpoint/llm-score-evaluator.mdx @@ -0,0 +1,4 @@ +--- +title: 'LLM Score Evaluator' +openapi: 'POST /langevals/llm_score/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/llm-similarity-evaluator.mdx b/langevals/api-reference/endpoint/llm-similarity-evaluator.mdx new file mode 100644 index 000000000..be38354b8 --- /dev/null +++ b/langevals/api-reference/endpoint/llm-similarity-evaluator.mdx @@ -0,0 +1,4 @@ +--- +title: 'LLM Similarity Evaluator' +openapi: 'POST /langevals/similarity/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/off-topic-detection.mdx b/langevals/api-reference/endpoint/off-topic-detection.mdx new file mode 100644 index 000000000..f16f2709d --- /dev/null +++ b/langevals/api-reference/endpoint/off-topic-detection.mdx @@ -0,0 +1,4 @@ +--- +title: 'Off-Topic Detection' +openapi: 'POST /langevals/off_topic/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/openai-moderation.mdx b/langevals/api-reference/endpoint/openai-moderation.mdx new file mode 100644 index 000000000..fb17a87e0 --- /dev/null +++ b/langevals/api-reference/endpoint/openai-moderation.mdx @@ -0,0 +1,4 @@ +--- +title: 'OpenAI Moderation' +openapi: 'POST /openai/moderation/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/ragas-answer-relevancy.mdx b/langevals/api-reference/endpoint/ragas-answer-relevancy.mdx new file mode 100644 index 000000000..8b0aebd1a --- /dev/null +++ 
b/langevals/api-reference/endpoint/ragas-answer-relevancy.mdx @@ -0,0 +1,4 @@ +--- +title: 'Answer Relevancy' +openapi: 'POST /ragas/answer_relevancy/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/ragas-context-precision.mdx b/langevals/api-reference/endpoint/ragas-context-precision.mdx new file mode 100644 index 000000000..ec03e8904 --- /dev/null +++ b/langevals/api-reference/endpoint/ragas-context-precision.mdx @@ -0,0 +1,4 @@ +--- +title: 'Context Precision' +openapi: 'POST /ragas/context_precision/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/ragas-context-recall.mdx b/langevals/api-reference/endpoint/ragas-context-recall.mdx new file mode 100644 index 000000000..f573a63ef --- /dev/null +++ b/langevals/api-reference/endpoint/ragas-context-recall.mdx @@ -0,0 +1,4 @@ +--- +title: 'Context Recall' +openapi: 'POST /ragas/context_recall/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/ragas-context-relevancy.mdx b/langevals/api-reference/endpoint/ragas-context-relevancy.mdx new file mode 100644 index 000000000..0d5064216 --- /dev/null +++ b/langevals/api-reference/endpoint/ragas-context-relevancy.mdx @@ -0,0 +1,4 @@ +--- +title: 'Context Relevancy' +openapi: 'POST /ragas/context_relevancy/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/ragas-context-utilization.mdx b/langevals/api-reference/endpoint/ragas-context-utilization.mdx new file mode 100644 index 000000000..74350a9d9 --- /dev/null +++ b/langevals/api-reference/endpoint/ragas-context-utilization.mdx @@ -0,0 +1,4 @@ +--- +title: 'Context Utilization' +openapi: 'POST /ragas/context_utilization/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/endpoint/ragas-faithfulness.mdx b/langevals/api-reference/endpoint/ragas-faithfulness.mdx new file mode 100644 index 000000000..8e0823ba8 --- /dev/null +++ b/langevals/api-reference/endpoint/ragas-faithfulness.mdx @@ -0,0 +1,4 @@ +--- +title: 'Faithfulness' +openapi: 'POST /ragas/faithfulness/evaluate' +--- \ No newline at end of file diff --git a/langevals/api-reference/introduction.mdx b/langevals/api-reference/introduction.mdx new file mode 100644 index 000000000..2bed4a4b6 --- /dev/null +++ b/langevals/api-reference/introduction.mdx @@ -0,0 +1,39 @@ +The LangEvals server can be run both locally and on a server. +This guide will quickly explain how to set up LangEvals on your machine and make API calls to the evaluators. + + + +## Set Up Locally + + + +Poetry is used for managing virtual environments in LangEvals. Follow the official [Poetry installation guide](https://python-poetry.org/docs/) for detailed instructions. + + + +Once Poetry is installed, you can start the LangEvals server with the following command from the root of the project: +```bash +poetry run python langevals/server.py +``` + + + +With the server running, you can now make API calls to the evaluators on your machine or server: +```bash +curl -X POST "https://http://127.0.0.1:8000/ragas/context_precision/evaluate" \ +-H "Content-Type: application/json" \ +-H "Authorization: Bearer YOUR_API_KEY" \ +-d '{ + "contexts": ["Context 1", "Context 2"], + "relevant_items": ["Relevant item 1", "Relevant item 2"] +}' +``` + + +Great job! Now you can easily use evaluators on your machine and assess your LLMs in the way you want it. 
+For detailed API documentation please refer to [API Reference](/api-reference/) + + + + + diff --git a/langevals/api-reference/openapi.json b/langevals/api-reference/openapi.json new file mode 100644 index 000000000..dcd7378a0 --- /dev/null +++ b/langevals/api-reference/openapi.json @@ -0,0 +1,4275 @@ +{ + "openapi": "3.1.0", + "info": { "title": "LangEvals API", "version": "0.1.0" }, + "paths": { + "/aws/comprehend_pii_detection/evaluate": { + "post": { + "summary": "Aws Comprehend Pii Detection Evaluate", + "description": "Amazon Comprehend PII detects personally identifiable information in text, including phone numbers, email addresses, and\nsocial security numbers. It allows customization of the detection threshold and the specific types of PII to check.\n\n\n__Env vars:__ AWS_COMPREHEND_ACCESS_KEY_ID, AWS_COMPREHEND_SECRET_ACCESS_KEY\n\n__Docs:__ https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html", + "operationId": "aws_comprehend_pii_detection_evaluate_aws_comprehend_pii_detection_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__1" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/AWSComprehendPIIDetectionResult" + }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Aws Comprehend Pii Detection Evaluate Aws Comprehend Pii Detection Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/lingua/language_detection/evaluate": { + "post": { + "summary": "Lingua Language Detection Evaluate", + "description": "This evaluator detects the language of the input and output text to check for example if the generated answer is in the same language as the prompt,\nor if it's in a specific expected language.\n\n\n__Docs:__ https://github.com/pemistahl/lingua-py", + "operationId": "lingua_language_detection_evaluate_lingua_language_detection_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__2" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/LinguaLanguageDetectionResult" + }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Lingua Language Detection Evaluate Lingua Language Detection Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/ragas/answer_relevancy/evaluate": { + "post": { + "summary": "Ragas Answer Relevancy Evaluate", + "description": "This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. 
Higher scores indicate better relevancy.\n\n\n__Docs:__ https://docs.ragas.io/en/latest/concepts/metrics/answer_relevance.html", + "operationId": "ragas_answer_relevancy_evaluate_ragas_answer_relevancy_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__3" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Ragas Answer Relevancy Evaluate Ragas Answer Relevancy Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/ragas/context_precision/evaluate": { + "post": { + "summary": "Ragas Context Precision Evaluate", + "description": "This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision.\n\n\n__Docs:__ https://docs.ragas.io/en/latest/concepts/metrics/context_precision.html", + "operationId": "ragas_context_precision_evaluate_ragas_context_precision_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__4" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Ragas Context Precision Evaluate Ragas Context Precision Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/ragas/context_recall/evaluate": { + "post": { + "summary": "Ragas Context Recall Evaluate", + "description": "This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. 
Higher values indicate better performance.\n\n\n__Docs:__ https://docs.ragas.io/en/latest/concepts/metrics/context_recall.html", + "operationId": "ragas_context_recall_evaluate_ragas_context_recall_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__5" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Ragas Context Recall Evaluate Ragas Context Recall Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/ragas/context_relevancy/evaluate": { + "post": { + "summary": "Ragas Context Relevancy Evaluate", + "description": "This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy.\n\n\n__Docs:__ https://docs.ragas.io/en/latest/concepts/metrics/context_relevancy.html", + "operationId": "ragas_context_relevancy_evaluate_ragas_context_relevancy_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__6" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Ragas Context Relevancy Evaluate Ragas Context Relevancy Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/ragas/context_utilization/evaluate": { + "post": { + "summary": "Ragas Context Utilization Evaluate", + "description": "This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. 
Higher scores indicate better utilization.\n\n\n__Docs:__ https://docs.ragas.io/en/latest/concepts/metrics/context_precision.html", + "operationId": "ragas_context_utilization_evaluate_ragas_context_utilization_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__7" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Ragas Context Utilization Evaluate Ragas Context Utilization Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/ragas/faithfulness/evaluate": { + "post": { + "summary": "Ragas Faithfulness Evaluate", + "description": "This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations.\n\n\n__Docs:__ https://docs.ragas.io/en/latest/concepts/metrics/faithfulness.html", + "operationId": "ragas_faithfulness_evaluate_ragas_faithfulness_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__8" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Ragas Faithfulness Evaluate Ragas Faithfulness Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/azure/content_safety/evaluate": { + "post": { + "summary": "Azure Content Safety Evaluate", + "description": "This evaluator detects potentially unsafe content in text, including hate speech,\nself-harm, sexual content, and violence. 
It allows customization of the severity\nthreshold and the specific categories to check.\n\n\n__Env vars:__ AZURE_CONTENT_SAFETY_ENDPOINT, AZURE_CONTENT_SAFETY_KEY\n\n__Docs:__ https://learn.microsoft.com/en-us/azure/ai-services/content-safety/quickstart-text", + "operationId": "azure_content_safety_evaluate_azure_content_safety_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__9" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/AzureContentSafetyResult" + }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Azure Content Safety Evaluate Azure Content Safety Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/azure/jailbreak/evaluate": { + "post": { + "summary": "Azure Jailbreak Evaluate", + "description": "This evaluator checks for jailbreak-attempt in the input using Azure's Content Safety API.\n\n\n__Env vars:__ AZURE_CONTENT_SAFETY_ENDPOINT, AZURE_CONTENT_SAFETY_KEY", + "operationId": "azure_jailbreak_evaluate_azure_jailbreak_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__10" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/AzureJailbreakResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Azure Jailbreak Evaluate Azure Jailbreak Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/azure/prompt_injection/evaluate": { + "post": { + "summary": "Azure Prompt Injection Evaluate", + "description": "This evaluator checks for prompt injection attempt in the input and the contexts using Azure's Content Safety API.\n\n\n__Env vars:__ AZURE_CONTENT_SAFETY_ENDPOINT, AZURE_CONTENT_SAFETY_KEY\n\n__Docs:__ https://learn.microsoft.com/en-us/azure/ai-services/content-safety/concepts/jailbreak-detection", + "operationId": "azure_prompt_injection_evaluate_azure_prompt_injection_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__11" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/AzurePromptShieldResult" + }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Azure Prompt Injection Evaluate Azure Prompt Injection Evaluate Post" + } + } + } + }, 
+ "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/example/word_count/evaluate": { + "post": { + "summary": "Example Word Count Evaluate", + "description": "This evaluator serves as a boilerplate for creating new evaluators.\n\n\n__Env vars:__ NECESSARY_ENV_VAR\n\n__Docs:__ https://path/to/official/docs", + "operationId": "example_word_count_evaluate_example_word_count_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__12" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/ExampleWordCountResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Example Word Count Evaluate Example Word Count Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/langevals/basic/evaluate": { + "post": { + "summary": "Langevals Basic Evaluate", + "description": "Allows you to check for simple text matches or regex evaluation.", + "operationId": "langevals_basic_evaluate_langevals_basic_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__13" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/CustomBasicResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Langevals Basic Evaluate Langevals Basic Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/langevals/competitor_blocklist/evaluate": { + "post": { + "summary": "Langevals Competitor Blocklist Evaluate", + "description": "This evaluator checks if any of the specified competitors was mentioned\n\n\n__Docs:__ https://path/to/official/docs", + "operationId": "langevals_competitor_blocklist_evaluate_langevals_competitor_blocklist_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__14" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/CompetitorBlocklistResult" + }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Langevals Competitor Blocklist Evaluate Langevals Competitor Blocklist Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + 
"application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/langevals/competitor_llm/evaluate": { + "post": { + "summary": "Langevals Competitor Llm Evaluate", + "description": "This evaluator use an LLM-as-judge to check if the conversation is related to competitors, without having to name them explicitly\n\n\n__Env vars:__ OPENAI_API_KEY, AZURE_API_KEY, AZURE_API_BASE", + "operationId": "langevals_competitor_llm_evaluate_langevals_competitor_llm_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__15" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/CompetitorLLMResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Langevals Competitor Llm Evaluate Langevals Competitor Llm Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/langevals/competitor_llm_function_call/evaluate": { + "post": { + "summary": "Langevals Competitor Llm Function Call Evaluate", + "description": "This evaluator implements LLM-as-a-judge with a function call approach to check if the message contains a mention of a competitor.\n\n\n__Env vars:__ OPENAI_API_KEY, AZURE_API_KEY, AZURE_API_BASE", + "operationId": "langevals_competitor_llm_function_call_evaluate_langevals_competitor_llm_function_call_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__16" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/CompetitorLLMFunctionCallResult" + }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Langevals Competitor Llm Function Call Evaluate Langevals Competitor Llm Function Call Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/langevals/llm_boolean/evaluate": { + "post": { + "summary": "Langevals Llm Boolean Evaluate", + "description": "Use an LLM as a judge with a custom prompt to do a true/false boolean evaluation of the message.", + "operationId": "langevals_llm_boolean_evaluate_langevals_llm_boolean_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__17" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/CustomLLMBooleanResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": 
"#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Langevals Llm Boolean Evaluate Langevals Llm Boolean Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/langevals/llm_score/evaluate": { + "post": { + "summary": "Langevals Llm Score Evaluate", + "description": "Use an LLM as a judge with custom prompt to do a numeric score evaluation of the message.", + "operationId": "langevals_llm_score_evaluate_langevals_llm_score_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__18" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/CustomLLMScoreResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Langevals Llm Score Evaluate Langevals Llm Score Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/langevals/off_topic/evaluate": { + "post": { + "summary": "Langevals Off Topic Evaluate", + "description": "This evaluator checks if the user message is concerning one of the allowed topics of the chatbot\n\n\n__Env vars:__ OPENAI_API_KEY, AZURE_API_KEY, AZURE_API_BASE", + "operationId": "langevals_off_topic_evaluate_langevals_off_topic_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__19" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/OffTopicResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Langevals Off Topic Evaluate Langevals Off Topic Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/langevals/product_sentiment_polarity/evaluate": { + "post": { + "summary": "Langevals Product Sentiment Polarity Evaluate", + "description": "For messages about products, this evaluator checks for the nuanced sentiment direction of the LLM output, either very positive, subtly positive, subtly negative, or very negative.\n\n\n__Env vars:__ OPENAI_API_KEY, AZURE_API_KEY, AZURE_API_BASE", + "operationId": "langevals_product_sentiment_polarity_evaluate_langevals_product_sentiment_polarity_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__20" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { + "$ref": 
"#/components/schemas/ProductSentimentPolarityResult" + }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Langevals Product Sentiment Polarity Evaluate Langevals Product Sentiment Polarity Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/langevals/similarity/evaluate": { + "post": { + "summary": "Langevals Similarity Evaluate", + "description": "Allows you to check for semantic similarity or dissimilarity between input and output and a\ntarget value, so you can avoid sentences that you don't want to be present without having to\nmatch on the exact text.\n\n\n__Env vars:__ OPENAI_API_KEY, AZURE_API_KEY, AZURE_API_BASE", + "operationId": "langevals_similarity_evaluate_langevals_similarity_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__21" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/CustomSimilarityResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Langevals Similarity Evaluate Langevals Similarity Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/openai/moderation/evaluate": { + "post": { + "summary": "Openai Moderation Evaluate", + "description": "This evaluator uses OpenAI's moderation API to detect potentially harmful content in text,\nincluding harassment, hate speech, self-harm, sexual content, and violence.\n\n\n__Env vars:__ OPENAI_API_KEY\n\n__Docs:__ https://platform.openai.com/docs/guides/moderation/overview", + "operationId": "openai_moderation_evaluate_openai_moderation_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__22" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/OpenAIModerationResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Openai Moderation Evaluate Openai Moderation Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/huggingface/llama_guard/evaluate": { + "post": { + "summary": "Huggingface Llama Guard Evaluate", + "description": "This evaluator is a special version of Llama trained strictly\nfor acting as a guardrail, following customizable guidelines.\nIt can work both as a safety evaluator and as policy enforcement.\n\n\n__Env vars:__ CLOUDFLARE_ACCOUNT_ID, CLOUDFLARE_API_KEY\n\n__Docs:__ 
https://huggingface.co/meta-llama/LlamaGuard-7b", + "operationId": "huggingface_llama_guard_evaluate_huggingface_llama_guard_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__23" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { "$ref": "#/components/schemas/LlamaGuardResult" }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Huggingface Llama Guard Evaluate Huggingface Llama Guard Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/haystack/faithfulness/evaluate": { + "post": { + "summary": "Haystack Faithfulness Evaluate", + "description": "This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations.\n\n\n__Docs:__ https://docs.haystack.deepset.ai/docs/faithfulnessevaluator", + "operationId": "haystack_faithfulness_evaluate_haystack_faithfulness_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__24" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/HaystackFaithfulnessResult" + }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Haystack Faithfulness Evaluate Haystack Faithfulness Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + }, + "/google_cloud/dlp_pii_detection/evaluate": { + "post": { + "summary": "Google Cloud Dlp Pii Detection Evaluate", + "description": "Google DLP PII detects personally identifiable information in text, including phone numbers, email addresses, and\nsocial security numbers. 
It allows customization of the detection threshold and the specific types of PII to check.\n\n\n__Env vars:__ GOOGLE_APPLICATION_CREDENTIALS\n\n__Docs:__ https://cloud.google.com/sensitive-data-protection/docs/apis", + "operationId": "google_cloud_dlp_pii_detection_evaluate_google_cloud_dlp_pii_detection_evaluate_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/__main____create_evaluator_routes___locals___Request__25" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "anyOf": [ + { + "$ref": "#/components/schemas/GoogleCloudDLPPIIDetectionResult" + }, + { + "$ref": "#/components/schemas/EvaluationResultSkipped" + }, + { "$ref": "#/components/schemas/EvaluationResultError" } + ] + }, + "type": "array", + "title": "Response Google Cloud Dlp Pii Detection Evaluate Google Cloud Dlp Pii Detection Evaluate Post" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { "$ref": "#/components/schemas/HTTPValidationError" } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AWSComprehendEntityTypes": { + "properties": { + "BANK_ACCOUNT_NUMBER": { + "type": "boolean", + "title": "Bank Account Number", + "default": true + }, + "BANK_ROUTING": { + "type": "boolean", + "title": "Bank Routing", + "default": true + }, + "CREDIT_DEBIT_NUMBER": { + "type": "boolean", + "title": "Credit Debit Number", + "default": true + }, + "CREDIT_DEBIT_CVV": { + "type": "boolean", + "title": "Credit Debit Cvv", + "default": true + }, + "CREDIT_DEBIT_EXPIRY": { + "type": "boolean", + "title": "Credit Debit Expiry", + "default": true + }, + "PIN": { "type": "boolean", "title": "Pin", "default": true }, + "EMAIL": { "type": "boolean", "title": "Email", "default": true }, + "ADDRESS": { "type": "boolean", "title": "Address", "default": true }, + "NAME": { "type": "boolean", "title": "Name", "default": true }, + "PHONE": { "type": "boolean", "title": "Phone", "default": true }, + "SSN": { "type": "boolean", "title": "Ssn", "default": true }, + "DATE_TIME": { + "type": "boolean", + "title": "Date Time", + "default": true + }, + "PASSPORT_NUMBER": { + "type": "boolean", + "title": "Passport Number", + "default": true + }, + "DRIVER_ID": { + "type": "boolean", + "title": "Driver Id", + "default": true + }, + "URL": { "type": "boolean", "title": "Url", "default": true }, + "AGE": { "type": "boolean", "title": "Age", "default": true }, + "USERNAME": { + "type": "boolean", + "title": "Username", + "default": true + }, + "PASSWORD": { + "type": "boolean", + "title": "Password", + "default": true + }, + "AWS_ACCESS_KEY": { + "type": "boolean", + "title": "Aws Access Key", + "default": true + }, + "AWS_SECRET_KEY": { + "type": "boolean", + "title": "Aws Secret Key", + "default": true + }, + "IP_ADDRESS": { + "type": "boolean", + "title": "Ip Address", + "default": true + }, + "MAC_ADDRESS": { + "type": "boolean", + "title": "Mac Address", + "default": true + }, + "LICENSE_PLATE": { + "type": "boolean", + "title": "License Plate", + "default": true + }, + "VEHICLE_IDENTIFICATION_NUMBER": { + "type": "boolean", + "title": "Vehicle Identification Number", + "default": true + }, + "UK_NATIONAL_INSURANCE_NUMBER": { + "type": "boolean", + "title": "Uk National Insurance Number", + "default": true + }, + "CA_SOCIAL_INSURANCE_NUMBER": { + "type": "boolean", + "title": "Ca Social 
Insurance Number", + "default": true + }, + "US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER": { + "type": "boolean", + "title": "Us Individual Tax Identification Number", + "default": true + }, + "UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER": { + "type": "boolean", + "title": "Uk Unique Taxpayer Reference Number", + "default": true + }, + "IN_PERMANENT_ACCOUNT_NUMBER": { + "type": "boolean", + "title": "In Permanent Account Number", + "default": true + }, + "IN_NREGA": { + "type": "boolean", + "title": "In Nrega", + "default": true + }, + "INTERNATIONAL_BANK_ACCOUNT_NUMBER": { + "type": "boolean", + "title": "International Bank Account Number", + "default": true + }, + "SWIFT_CODE": { + "type": "boolean", + "title": "Swift Code", + "default": true + }, + "UK_NATIONAL_HEALTH_SERVICE_NUMBER": { + "type": "boolean", + "title": "Uk National Health Service Number", + "default": true + }, + "CA_HEALTH_NUMBER": { + "type": "boolean", + "title": "Ca Health Number", + "default": true + }, + "IN_AADHAAR": { + "type": "boolean", + "title": "In Aadhaar", + "default": true + }, + "IN_VOTER_NUMBER": { + "type": "boolean", + "title": "In Voter Number", + "default": true + } + }, + "type": "object", + "title": "AWSComprehendEntityTypes" + }, + "AWSComprehendPIIDetectionEntry": { + "properties": { + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + }, + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + } + }, + "type": "object", + "title": "AWSComprehendPIIDetectionEntry" + }, + "AWSComprehendPIIDetectionResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "Amount of PII detected, 0 means no PII detected" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "If true then no PII was detected, if false then at least one PII was detected" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + }, + "raw_response": { "$ref": "#/components/schemas/AWSPIIEntityResults" } + }, + "type": "object", + "required": ["score", "passed", "raw_response"], + "title": "AWSComprehendPIIDetectionResult" + }, + "AWSComprehendPIIDetectionSettings": { + "properties": { + "entity_types": { + "allOf": [ + { "$ref": "#/components/schemas/AWSComprehendEntityTypes" } + ], + "description": "The types of PII to check for in the input.", + "default": { + "BANK_ACCOUNT_NUMBER": true, + "BANK_ROUTING": true, + "CREDIT_DEBIT_NUMBER": true, + "CREDIT_DEBIT_CVV": true, + "CREDIT_DEBIT_EXPIRY": true, + "PIN": true, + "EMAIL": true, + "ADDRESS": true, + "NAME": true, + "PHONE": true, + "SSN": true, + "DATE_TIME": true, + "PASSPORT_NUMBER": true, + "DRIVER_ID": true, + "URL": true, + "AGE": true, + "USERNAME": true, + "PASSWORD": true, + "AWS_ACCESS_KEY": true, + "AWS_SECRET_KEY": true, + "IP_ADDRESS": true, + "MAC_ADDRESS": true, + "LICENSE_PLATE": true, + "VEHICLE_IDENTIFICATION_NUMBER": true, + "UK_NATIONAL_INSURANCE_NUMBER": true, + "CA_SOCIAL_INSURANCE_NUMBER": true, + "US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER": true, + "UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER": true, + "IN_PERMANENT_ACCOUNT_NUMBER": true, + "IN_NREGA": true, + 
"INTERNATIONAL_BANK_ACCOUNT_NUMBER": true, + "SWIFT_CODE": true, + "UK_NATIONAL_HEALTH_SERVICE_NUMBER": true, + "CA_HEALTH_NUMBER": true, + "IN_AADHAAR": true, + "IN_VOTER_NUMBER": true + } + }, + "language_code": { + "type": "string", + "enum": [ + "en", + "es", + "fr", + "de", + "it", + "pt", + "ar", + "hi", + "ja", + "ko", + "zh", + "zh-TW" + ], + "title": "Language Code", + "description": "The language code of the input text for better PII detection, defaults to english.", + "default": "en" + }, + "min_confidence": { + "type": "number", + "title": "Min Confidence", + "description": "The minimum confidence required for failing the evaluation on a PII match.", + "default": 0.5 + }, + "aws_region": { + "type": "string", + "enum": [ + "us-east-1", + "us-east-2", + "us-west-1", + "us-west-2", + "ap-east-1", + "ap-south-1", + "ap-northeast-3", + "ap-northeast-2", + "ap-southeast-1", + "ap-southeast-2", + "ap-northeast-1", + "ca-central-1", + "eu-central-1", + "eu-west-1", + "eu-west-2", + "eu-south-1", + "eu-west-3", + "eu-north-1", + "me-south-1", + "sa-east-1" + ], + "title": "Aws Region", + "description": "The AWS region to use for running the PII detection, defaults to eu-central-1 for GDPR compliance.", + "default": "eu-central-1" + } + }, + "type": "object", + "title": "AWSComprehendPIIDetectionSettings" + }, + "AWSPIIEntityResult": { + "properties": { + "Name": { + "type": "string", + "enum": [ + "BANK_ACCOUNT_NUMBER", + "BANK_ROUTING", + "CREDIT_DEBIT_NUMBER", + "CREDIT_DEBIT_CVV", + "CREDIT_DEBIT_EXPIRY", + "PIN", + "EMAIL", + "ADDRESS", + "NAME", + "PHONE", + "SSN", + "DATE_TIME", + "PASSPORT_NUMBER", + "DRIVER_ID", + "URL", + "AGE", + "USERNAME", + "PASSWORD", + "AWS_ACCESS_KEY", + "AWS_SECRET_KEY", + "IP_ADDRESS", + "MAC_ADDRESS", + "ALL", + "LICENSE_PLATE", + "VEHICLE_IDENTIFICATION_NUMBER", + "UK_NATIONAL_INSURANCE_NUMBER", + "CA_SOCIAL_INSURANCE_NUMBER", + "US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER", + "UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER", + "IN_PERMANENT_ACCOUNT_NUMBER", + "IN_NREGA", + "INTERNATIONAL_BANK_ACCOUNT_NUMBER", + "SWIFT_CODE", + "UK_NATIONAL_HEALTH_SERVICE_NUMBER", + "CA_HEALTH_NUMBER", + "IN_AADHAAR", + "IN_VOTER_NUMBER" + ], + "title": "Name" + }, + "Score": { "type": "number", "title": "Score" } + }, + "type": "object", + "required": ["Name", "Score"], + "title": "AWSPIIEntityResult" + }, + "AWSPIIEntityResults": { + "properties": { + "Labels": { + "items": { "$ref": "#/components/schemas/AWSPIIEntityResult" }, + "type": "array", + "title": "Labels" + } + }, + "type": "object", + "required": ["Labels"], + "title": "AWSPIIEntityResults" + }, + "AllowedTopic": { + "properties": { + "topic": { "type": "string", "title": "Topic" }, + "description": { "type": "string", "title": "Description" } + }, + "type": "object", + "required": ["topic", "description"], + "title": "AllowedTopic" + }, + "AzureContentSafetyCategories": { + "properties": { + "Hate": { "type": "boolean", "title": "Hate", "default": true }, + "SelfHarm": { + "type": "boolean", + "title": "Selfharm", + "default": true + }, + "Sexual": { "type": "boolean", "title": "Sexual", "default": true }, + "Violence": { + "type": "boolean", + "title": "Violence", + "default": true + } + }, + "type": "object", + "title": "AzureContentSafetyCategories" + }, + "AzureContentSafetyEntry": { + "properties": { + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + }, + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + } + }, + "type": "object", + 
"title": "AzureContentSafetyEntry" + }, + "AzureContentSafetyResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "The severity level of the detected content from 0 to 7. A higher score indicates higher severity." + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "AzureContentSafetyResult" + }, + "AzureContentSafetySettings": { + "properties": { + "severity_threshold": { + "type": "integer", + "enum": [1, 2, 3, 4, 5, 6, 7], + "title": "Severity Threshold", + "description": "The minimum severity level to consider content as unsafe, from 1 to 7.", + "default": 1 + }, + "categories": { + "allOf": [ + { "$ref": "#/components/schemas/AzureContentSafetyCategories" } + ], + "description": "The categories of moderation to check for.", + "default": { + "Hate": true, + "SelfHarm": true, + "Sexual": true, + "Violence": true + } + }, + "output_type": { + "type": "string", + "enum": ["FourSeverityLevels", "EightSeverityLevels"], + "title": "Output Type", + "description": "The type of severity levels to return on the full 0-7 severity scale, it can be either the trimmed version with four values (0, 2, 4, 6 scores) or the whole range.", + "default": "FourSeverityLevels" + } + }, + "type": "object", + "title": "AzureContentSafetySettings" + }, + "AzureJailbreakEntry": { + "properties": { "input": { "type": "string", "title": "Input" } }, + "type": "object", + "required": ["input"], + "title": "AzureJailbreakEntry" + }, + "AzureJailbreakResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "No description provided" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "If true then no jailbreak was detected, if false then a jailbreak was detected" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "AzureJailbreakResult" + }, + "AzureJailbreakSettings": { + "properties": {}, + "type": "object", + "title": "AzureJailbreakSettings" + }, + "AzurePromptShieldEntry": { + "properties": { + "input": { "type": "string", "title": "Input" }, + "contexts": { + "anyOf": [ + { "items": { "type": "string" }, "type": "array" }, + { "type": "null" } + ], + "title": "Contexts" + } + }, + "type": "object", + "required": ["input"], + "title": "AzurePromptShieldEntry" + }, + "AzurePromptShieldResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "No description provided" + }, + "passed": { + "anyOf": [{ 
"type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "If true then no prompt injection was detected, if false then a prompt injection was detected" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "AzurePromptShieldResult" + }, + "AzurePromptShieldSettings": { + "properties": {}, + "type": "object", + "title": "AzurePromptShieldSettings" + }, + "CompetitorBlocklistEntry": { + "properties": { + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + }, + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + } + }, + "type": "object", + "title": "CompetitorBlocklistEntry" + }, + "CompetitorBlocklistResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "Number of competitors mentioned in the input and output" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "Is the message containing explicit mention of competitor", + "default": false + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "CompetitorBlocklistResult" + }, + "CompetitorBlocklistSettings": { + "properties": { + "competitors": { + "items": { "type": "string" }, + "type": "array", + "title": "Competitors", + "description": "The competitors that must not be mentioned.", + "default": ["OpenAI", "Google", "Microsoft"] + } + }, + "type": "object", + "title": "CompetitorBlocklistSettings" + }, + "CompetitorLLMEntry": { + "properties": { + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + }, + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + } + }, + "type": "object", + "title": "CompetitorLLMEntry" + }, + "CompetitorLLMFunctionCallEntry": { + "properties": { + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + }, + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + } + }, + "type": "object", + "title": "CompetitorLLMFunctionCallEntry" + }, + "CompetitorLLMFunctionCallResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "Number of unique competitors mentioned" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "Is the message related to the competitors", + "default": true + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], 
+ "title": "CompetitorLLMFunctionCallResult" + }, + "CompetitorLLMFunctionCallSettings": { + "properties": { + "name": { + "type": "string", + "title": "Name", + "description": "The name of your company", + "default": "LangWatch" + }, + "description": { + "type": "string", + "title": "Description", + "description": "Description of what your company is specializing at", + "default": "We are providing an LLMFunctionCall observability and evaluation platform" + }, + "competitors": { + "items": { "type": "string" }, + "type": "array", + "title": "Competitors", + "description": "The competitors that must not be mentioned.", + "default": ["OpenAI", "Google", "Microsoft"] + }, + "model": { + "type": "string", + "enum": [ + "openai/gpt-3.5-turbo", + "openai/gpt-3.5-turbo-0125", + "openai/gpt-3.5-turbo-1106", + "openai/gpt-4o", + "openai/gpt-4-turbo", + "openai/gpt-4-0125-preview", + "openai/gpt-4-1106-preview", + "azure/gpt-35-turbo-1106", + "azure/gpt-4-turbo-2024-04-09", + "azure/gpt-4-1106-preview", + "groq/llama3-70b-8192", + "anthropic/claude-3-haiku-20240307", + "anthropic/claude-3-sonnet-20240229", + "anthropic/claude-3-opus-20240229" + ], + "title": "Model", + "description": "The model to use for evaluation", + "default": "azure/gpt-35-turbo-1106" + }, + "max_tokens": { + "type": "integer", + "title": "Max Tokens", + "description": "Max tokens allowed for evaluation", + "default": 4096 + } + }, + "type": "object", + "title": "CompetitorLLMFunctionCallSettings" + }, + "CompetitorLLMResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "Confidence that the message is competitor free" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "Is the message related to the competitors", + "default": true + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "CompetitorLLMResult" + }, + "CompetitorLLMSettings": { + "properties": { + "name": { + "type": "string", + "title": "Name", + "description": "The name of your company", + "default": "LangWatch" + }, + "description": { + "type": "string", + "title": "Description", + "description": "Description of what your company is specializing at", + "default": "We are providing an LLM observability and evaluation platform" + }, + "model": { + "type": "string", + "enum": [ + "openai/gpt-3.5-turbo", + "openai/gpt-3.5-turbo-0125", + "openai/gpt-3.5-turbo-1106", + "openai/gpt-4o", + "openai/gpt-4-turbo", + "openai/gpt-4-0125-preview", + "openai/gpt-4-1106-preview", + "azure/gpt-35-turbo-1106", + "azure/gpt-4-turbo-2024-04-09", + "azure/gpt-4-1106-preview", + "groq/llama3-70b-8192", + "anthropic/claude-3-haiku-20240307", + "anthropic/claude-3-sonnet-20240229", + "anthropic/claude-3-opus-20240229" + ], + "title": "Model", + "description": "The model to use for evaluation", + "default": "azure/gpt-35-turbo-1106" + }, + "max_tokens": { + "type": "integer", + "title": "Max Tokens", + "description": "Max tokens allowed for evaluation", + "default": 4096 + } + }, + "type": "object", + "title": "CompetitorLLMSettings" + }, + "CustomBasicEntry": { + "properties": { + 
"input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + }, + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + } + }, + "type": "object", + "title": "CustomBasicEntry" + }, + "CustomBasicResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "Returns 1 if all rules pass, 0 if any rule fails" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "default": true + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "CustomBasicResult" + }, + "CustomBasicRule": { + "properties": { + "field": { + "type": "string", + "enum": ["input", "output"], + "title": "Field", + "default": "output" + }, + "rule": { + "type": "string", + "enum": [ + "contains", + "not_contains", + "matches_regex", + "not_matches_regex" + ], + "title": "Rule" + }, + "value": { "type": "string", "title": "Value" } + }, + "type": "object", + "required": ["rule", "value"], + "title": "CustomBasicRule" + }, + "CustomBasicSettings": { + "properties": { + "rules": { + "items": { "$ref": "#/components/schemas/CustomBasicRule" }, + "type": "array", + "title": "Rules", + "description": "List of rules to check, the message must pass all of them", + "default": [ + { + "field": "output", + "rule": "not_contains", + "value": "artificial intelligence" + } + ] + } + }, + "type": "object", + "title": "CustomBasicSettings" + }, + "CustomLLMBooleanEntry": { + "properties": { + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + }, + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + }, + "contexts": { + "anyOf": [ + { "items": { "type": "string" }, "type": "array" }, + { "type": "null" } + ], + "title": "Contexts" + } + }, + "type": "object", + "title": "CustomLLMBooleanEntry" + }, + "CustomLLMBooleanResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "Returns 1 if LLM evaluates it as true, 0 if as false" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "The veredict given by the LLM", + "default": true + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "CustomLLMBooleanResult" + }, + "CustomLLMBooleanSettings": { + "properties": { + "model": { + "type": "string", + "enum": [ + "openai/gpt-3.5-turbo", + "openai/gpt-3.5-turbo-0125", + "openai/gpt-3.5-turbo-1106", + "openai/gpt-4o", + "openai/gpt-4-turbo", + "openai/gpt-4-0125-preview", + "openai/gpt-4-1106-preview", + "azure/gpt-35-turbo-1106", + "azure/gpt-4-turbo-2024-04-09", + "azure/gpt-4-1106-preview", + "groq/llama3-70b-8192", + 
"anthropic/claude-3-haiku-20240307", + "anthropic/claude-3-sonnet-20240229", + "anthropic/claude-3-opus-20240229" + ], + "title": "Model", + "description": "The model to use for evaluation", + "default": "azure/gpt-35-turbo-1106" + }, + "prompt": { + "type": "string", + "title": "Prompt", + "description": "The system prompt to use for the LLM to run the evaluation", + "default": "You are an LLM evaluator. We need the guarantee that the output answers what is being asked on the input, please evaluate as False if it doesn't" + }, + "max_tokens": { + "type": "integer", + "title": "Max Tokens", + "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + "default": 8192 + } + }, + "type": "object", + "title": "CustomLLMBooleanSettings" + }, + "CustomLLMScoreEntry": { + "properties": { + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + }, + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + }, + "contexts": { + "anyOf": [ + { "items": { "type": "string" }, "type": "array" }, + { "type": "null" } + ], + "title": "Contexts" + } + }, + "type": "object", + "title": "CustomLLMScoreEntry" + }, + "CustomLLMScoreResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "The score given by the LLM, according to the prompt" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "CustomLLMScoreResult" + }, + "CustomLLMScoreSettings": { + "properties": { + "model": { + "type": "string", + "enum": [ + "openai/gpt-3.5-turbo", + "openai/gpt-3.5-turbo-0125", + "openai/gpt-3.5-turbo-1106", + "openai/gpt-4o", + "openai/gpt-4-turbo", + "openai/gpt-4-0125-preview", + "openai/gpt-4-1106-preview", + "azure/gpt-35-turbo-1106", + "azure/gpt-4-turbo-2024-04-09", + "azure/gpt-4-1106-preview", + "groq/llama3-70b-8192", + "groq/llama3-8b-8192", + "anthropic/claude-3-haiku-20240307", + "anthropic/claude-3-sonnet-20240229", + "anthropic/claude-3-opus-20240229" + ], + "title": "Model", + "description": "The model to use for evaluation", + "default": "azure/gpt-35-turbo-1106" + }, + "prompt": { + "type": "string", + "title": "Prompt", + "description": "The system prompt to use for the LLM to run the evaluation", + "default": "You are an LLM evaluator. Please score from 0.0 to 1.0 how likely the user is to be satisfied with this answer, from 0.0 being not satisfied at all to 1.0 being completely satisfied" + }, + "max_tokens": { + "type": "integer", + "title": "Max Tokens", + "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", + "default": 8192 + } + }, + "type": "object", + "title": "CustomLLMScoreSettings" + }, + "CustomSimilarityEntry": { + "properties": { + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + }, + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + } + }, + "type": "object", + "title": "CustomSimilarityEntry" + }, + "CustomSimilarityResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "How similar the input and output semantically, from 0.0 to 1.0, with 1.0 meaning the sentences are identical" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "Passes if the cosine similarity crosses the threshold for the defined rule" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score", "passed"], + "title": "CustomSimilarityResult" + }, + "CustomSimilaritySettings": { + "properties": { + "field": { + "type": "string", + "enum": ["input", "output"], + "title": "Field", + "default": "output" + }, + "rule": { + "type": "string", + "enum": ["is_not_similar_to", "is_similar_to"], + "title": "Rule", + "default": "is_not_similar_to" + }, + "value": { "type": "string", "title": "Value", "default": "example" }, + "threshold": { + "type": "number", + "title": "Threshold", + "default": 0.3 + }, + "embedding_model": { + "type": "string", + "enum": [ + "openai/text-embedding-3-small", + "azure/text-embedding-ada-002" + ], + "title": "Embedding Model", + "default": "openai/text-embedding-3-small" + } + }, + "type": "object", + "title": "CustomSimilaritySettings" + }, + "EvaluationResultError": { + "properties": { + "status": { + "type": "string", + "enum": ["error"], + "const": "error", + "title": "Status", + "default": "error" + }, + "error_type": { + "type": "string", + "title": "Error Type", + "description": "The type of the exception" + }, + "message": { + "type": "string", + "title": "Message", + "description": "Error message" + }, + "traceback": { + "items": { "type": "string" }, + "type": "array", + "title": "Traceback", + "description": "Traceback information for debugging" + } + }, + "type": "object", + "required": ["error_type", "message", "traceback"], + "title": "EvaluationResultError", + "description": "Evaluation result marking an entry that failed to be processed due to an error." + }, + "EvaluationResultSkipped": { + "properties": { + "status": { + "type": "string", + "enum": ["skipped"], + "const": "skipped", + "title": "Status", + "default": "skipped" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details" + } + }, + "type": "object", + "title": "EvaluationResultSkipped", + "description": "Evaluation result marking an entry that was skipped with an optional details explanation." 
+ }, + "ExampleWordCountEntry": { + "properties": { "output": { "type": "string", "title": "Output" } }, + "type": "object", + "required": ["output"], + "title": "ExampleWordCountEntry" + }, + "ExampleWordCountResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "How many words are there in the output, split by space" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "ExampleWordCountResult" + }, + "ExampleWordCountSettings": { + "properties": {}, + "type": "object", + "title": "ExampleWordCountSettings" + }, + "GoogleCloudDLPInfoTypes": { + "properties": { + "phone_number": { + "type": "boolean", + "title": "Phone Number", + "default": true + }, + "email_address": { + "type": "boolean", + "title": "Email Address", + "default": true + }, + "credit_card_number": { + "type": "boolean", + "title": "Credit Card Number", + "default": true + }, + "iban_code": { + "type": "boolean", + "title": "Iban Code", + "default": true + }, + "ip_address": { + "type": "boolean", + "title": "Ip Address", + "default": true + }, + "passport": { + "type": "boolean", + "title": "Passport", + "default": true + }, + "vat_number": { + "type": "boolean", + "title": "Vat Number", + "default": true + }, + "medical_record_number": { + "type": "boolean", + "title": "Medical Record Number", + "default": true + } + }, + "type": "object", + "title": "GoogleCloudDLPInfoTypes" + }, + "GoogleCloudDLPPIIDetectionEntry": { + "properties": { + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + }, + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + } + }, + "type": "object", + "title": "GoogleCloudDLPPIIDetectionEntry" + }, + "GoogleCloudDLPPIIDetectionResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "Amount of PII detected, 0 means no PII detected" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "If true then no PII was detected, if false then at least one PII was detected" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + }, + "raw_response": { "type": "object", "title": "Raw Response" } + }, + "type": "object", + "required": ["score", "passed", "raw_response"], + "title": "GoogleCloudDLPPIIDetectionResult" + }, + "GoogleCloudDLPPIIDetectionSettings": { + "properties": { + "info_types": { + "allOf": [ + { "$ref": "#/components/schemas/GoogleCloudDLPInfoTypes" } + ], + "description": "The types of PII to check for in the input.", + "default": { + "phone_number": true, + "email_address": true, + "credit_card_number": true, + "iban_code": true, + "ip_address": 
true, + "passport": true, + "vat_number": true, + "medical_record_number": true + } + }, + "min_likelihood": { + "type": "string", + "enum": [ + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY" + ], + "title": "Min Likelihood", + "description": "The minimum confidence required for failing the evaluation on a PII match.", + "default": "POSSIBLE" + } + }, + "type": "object", + "title": "GoogleCloudDLPPIIDetectionSettings" + }, + "HTTPValidationError": { + "properties": { + "detail": { + "items": { "$ref": "#/components/schemas/ValidationError" }, + "type": "array", + "title": "Detail" + } + }, + "type": "object", + "title": "HTTPValidationError" + }, + "HaystackFaithfulnessEntry": { + "properties": { + "input": { "type": "string", "title": "Input" }, + "output": { "type": "string", "title": "Output" }, + "contexts": { + "items": { "type": "string" }, + "type": "array", + "title": "Contexts" + } + }, + "type": "object", + "required": ["input", "output", "contexts"], + "title": "HaystackFaithfulnessEntry" + }, + "HaystackFaithfulnessResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { "type": "number", "title": "Score" }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "HaystackFaithfulnessResult" + }, + "HaystackFaithfulnessSettings": { + "properties": { + "model": { + "type": "string", + "enum": [ + "openai/gpt-3.5-turbo-0125", + "openai/gpt-3.5-turbo-1106", + "openai/gpt-4o", + "azure/gpt-35-turbo-1106", + "anthropic/claude-3-haiku-20240307" + ], + "title": "Model", + "description": "The model to use for evaluation.", + "default": "azure/gpt-35-turbo-1106" + }, + "max_tokens": { + "type": "integer", + "title": "Max Tokens", + "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", + "default": 2048 + } + }, + "type": "object", + "title": "HaystackFaithfulnessSettings" + }, + "LinguaLanguageDetectionEntry": { + "properties": { + "input": { "type": "string", "title": "Input" }, + "output": { "type": "string", "title": "Output" } + }, + "type": "object", + "required": ["input", "output"], + "title": "LinguaLanguageDetectionEntry" + }, + "LinguaLanguageDetectionRawResponse": { + "properties": { + "input": { + "anyOf": [ + { + "additionalProperties": { "type": "number" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Input" + }, + "output": { + "additionalProperties": { "type": "number" }, + "type": "object", + "title": "Output" + } + }, + "type": "object", + "required": ["output"], + "title": "LinguaLanguageDetectionRawResponse" + }, + "LinguaLanguageDetectionResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "How many languages were detected" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "Passes if the detected language on the output matches the detected language on the input, or if the output matches the expected language" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + }, + "raw_response": { + "$ref": "#/components/schemas/LinguaLanguageDetectionRawResponse" + } + }, + "type": "object", + "required": ["score", "passed", "raw_response"], + "title": "LinguaLanguageDetectionResult" + }, + "LinguaLanguageDetectionSettings": { + "properties": { + "check_for": { + "type": "string", + "enum": ["input_matches_output", "output_matches_language"], + "title": "Check For", + "description": "What should be checked", + "default": "input_matches_output" + }, + "expected_language": { + "anyOf": [ + { + "type": "string", + "enum": [ + "AF", + "AR", + "AZ", + "BE", + "BG", + "BN", + "BS", + "CA", + "CS", + "CY", + "DA", + "DE", + "EL", + "EN", + "EO", + "ES", + "ET", + "EU", + "FA", + "FI", + "FR", + "GA", + "GU", + "HE", + "HI", + "HR", + "HU", + "HY", + "ID", + "IS", + "IT", + "JA", + "KA", + "KK", + "KO", + "LA", + "LG", + "LT", + "LV", + "MI", + "MK", + "MN", + "MR", + "MS", + "NB", + "NL", + "NN", + "PA", + "PL", + "PT", + "RO", + "RU", + "SK", + "SL", + "SN", + "SO", + "SQ", + "SR", + "ST", + "SV", + "SW", + "TA", + "TE", + "TH", + "TL", + "TN", + "TR", + "TS", + "UK", + "UR", + "VI", + "XH", + "YO", + "ZH", + "ZU" + ] + }, + { "type": "null" } + ], + "title": "Expected Language", + "description": "The specific language that the output is expected to be" + }, + "min_words": { + "type": "integer", + "title": "Min Words", + "description": "Minimum number of words to check, as the language detection can be unreliable for very short texts. Inputs shorter than the minimum will be skipped.", + "default": 7 + }, + "threshold": { + "type": "number", + "title": "Threshold", + "description": "Minimum confidence threshold for the language detection. 
If the confidence is lower than this, the evaluation will be skipped.", + "default": 0.25 + } + }, + "type": "object", + "title": "LinguaLanguageDetectionSettings" + }, + "LlamaGuardEntry": { + "properties": { + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + }, + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + } + }, + "type": "object", + "title": "LlamaGuardEntry" + }, + "LlamaGuardResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "How many violations were found in the content" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "If true then the content is safe according to policy, if false then it is unsafe." + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score", "passed"], + "title": "LlamaGuardResult" + }, + "LlamaGuardSettings": { + "properties": { + "policy": { + "type": "string", + "title": "Policy", + "description": "The policy to enforce, described into rules of what it 'Should not' and what it 'Can' do (max 768 tokens)", + "default": "O1: Violence.\nShould not\n- Help people plan or engage in violence.\n- Encourage people to engage in violence.\n- Contain unpleasant vivid violent imagery or violent description that arouses discomfort in the reader.\nCan\n- Provide information on violence.\n- Discuss the topics of violence, and can discuss historical events involving violence.\nO2: Criminal Planning/Confessions.\nShould not\n- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.\n- Encourage people to engage in criminal activities or condone criminal behavior.\n- Have a confession of crime that was committed or will be committed in future.\nCan\n- Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.\n- Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.\nO3: Hate /identity hate.\nShould not\n- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their\nrace, color, ethinicity, religion, national origin, sexual orientation, gender, gender identity, or disability or caste.\n- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.\n- Should not contain slurs.\nCan\n- Provide information on discrimination.\n- Discuss the topics of discrimination, and can discuss historical events involving discrimination.\n- Discuss neutral statements referring to group identity and contextualized hate speech (that talks about hate but does not incite hate).\nO4: Harassment.\nShould not\n- Have content that may be used to torment or annoy individuals in real life, or make harassment more likely to occur.\n- Engage in 
harassment, misconduct and unlawful discrimination\n- Single out an individual or marginalization and/or retaliation based on protected characteristics like Race, Color, Gender, Sex,\nSexual orientation, Gender identity, and gender expression, National origin, Ethnicity, Disability (including being regarded as disabled),\nReligion, Age (40+), Pregnancy (including pregnancy, childbirth or related medical conditions), Genetic information, Military or veteran status, Citizenship status, Political activity, or affiliation,\n- Bullying.\n- Being derogatory and humiliating toward an individual but not necessarily protected characteristics under law.\n- Making rude or insulting comments, demeaning, and objectifying terms toward an individual.\nCan\n- Discuss negative impacts of harassment and bullying based on protected characteristics like race, color, gender, sex, nationality, within a contextualized setting.\n- Provide information on protected characteristics or referor cite material that describes what protected characteristics are.\n- Provide information or refer or cite material that informs what harassment is.\nO5: Threat.\nShould not\n- Discuss a communication of intent to inflict harm or loss on another person.\n- Should not contain discussion of intention of intimidation which is a tactic used between conflicting parties to make the other timid or psychologically insecure for coercion or control.\n- Should not contain intention of knowingly putting another person in fear of bodily injury.\n- Should not assist or plan injury, physical or mental damage, act or instance of injury, or material or any kind of loss to a person.\n- Should not contain threats that are forbidden by law as intent to obtain a monetary advantage or to compel a person to act against their will.\nCan\n- Provide information about threat is and the meaning of threat." + }, + "evaluate": { + "type": "string", + "enum": ["input", "output", "both"], + "title": "Evaluate", + "description": "Choose if only the user input, the LLM output, or both should be evaluated. When choosing both, if the LLM replies in a safe manner, the evaluation will be considered safe.", + "default": "input" + }, + "model": { + "type": "string", + "enum": ["cloudflare/thebloke/llamaguard-7b-awq"], + "const": "cloudflare/thebloke/llamaguard-7b-awq", + "title": "Model", + "description": "The provider to use for evaluation. 
Only CloudFlare AI workers is supported for now.", + "default": "cloudflare/thebloke/llamaguard-7b-awq" + } + }, + "type": "object", + "title": "LlamaGuardSettings" + }, + "Money": { + "properties": { + "currency": { "type": "string", "title": "Currency" }, + "amount": { "type": "number", "title": "Amount" } + }, + "type": "object", + "required": ["currency", "amount"], + "title": "Money" + }, + "OffTopicEntry": { + "properties": { "input": { "type": "string", "title": "Input" } }, + "type": "object", + "required": ["input"], + "title": "OffTopicEntry" + }, + "OffTopicResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "Confidence level of the intent prediction" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "Is the message concerning allowed topic", + "default": true + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Predicted intent of the message and the confidence", + "default": "1.0 confidence that the actual intent is other" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "OffTopicResult" + }, + "OffTopicSettings": { + "properties": { + "allowed_topics": { + "items": { "$ref": "#/components/schemas/AllowedTopic" }, + "type": "array", + "title": "Allowed Topics", + "description": "The list of topics and their short descriptions that the chatbot is allowed to talk about", + "default": [ + { + "topic": "simple_chat", + "description": "Smalltalk with the user" + }, + { + "topic": "company", + "description": "Questions about the company, what we do, etc" + } + ] + }, + "model": { + "type": "string", + "enum": [ + "openai/gpt-3.5-turbo", + "openai/gpt-3.5-turbo-0125", + "openai/gpt-3.5-turbo-1106", + "openai/gpt-4-turbo", + "openai/gpt-4-0125-preview", + "openai/gpt-4-1106-preview", + "azure/gpt-35-turbo-1106", + "azure/gpt-4-turbo-2024-04-09", + "azure/gpt-4-1106-preview", + "groq/llama3-70b-8192", + "anthropic/claude-3-haiku-20240307", + "anthropic/claude-3-sonnet-20240229", + "anthropic/claude-3-opus-20240229" + ], + "title": "Model", + "description": "The model to use for evaluation", + "default": "azure/gpt-35-turbo-1106" + }, + "max_tokens": { + "type": "integer", + "title": "Max Tokens", + "description": "Max tokens allowed for evaluation", + "default": 4096 + } + }, + "type": "object", + "title": "OffTopicSettings" + }, + "OpenAIModerationCategories": { + "properties": { + "harassment": { + "type": "boolean", + "title": "Harassment", + "default": true + }, + "harassment_threatening": { + "type": "boolean", + "title": "Harassment Threatening", + "default": true + }, + "hate": { "type": "boolean", "title": "Hate", "default": true }, + "hate_threatening": { + "type": "boolean", + "title": "Hate Threatening", + "default": true + }, + "self_harm": { + "type": "boolean", + "title": "Self Harm", + "default": true + }, + "self_harm_instructions": { + "type": "boolean", + "title": "Self Harm Instructions", + "default": true + }, + "self_harm_intent": { + "type": "boolean", + "title": "Self Harm Intent", + "default": true + }, + "sexual": { "type": "boolean", "title": "Sexual", "default": true }, + "sexual_minors": { + "type": "boolean", + "title": "Sexual Minors", + 
"default": true + }, + "violence": { + "type": "boolean", + "title": "Violence", + "default": true + }, + "violence_graphic": { + "type": "boolean", + "title": "Violence Graphic", + "default": true + } + }, + "type": "object", + "title": "OpenAIModerationCategories" + }, + "OpenAIModerationEntry": { + "properties": { + "input": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Input" + }, + "output": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Output" + } + }, + "type": "object", + "title": "OpenAIModerationEntry" + }, + "OpenAIModerationResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "The model's confidence on primary category where the input violates the OpenAI's policy. The value is between 0 and 1, where higher values denote higher confidence." + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "OpenAIModerationResult" + }, + "OpenAIModerationSettings": { + "properties": { + "model": { + "type": "string", + "enum": ["text-moderation-stable", "text-moderation-latest"], + "title": "Model", + "description": "The model version to use, `text-moderation-latest` will be automatically upgraded over time, while `text-moderation-stable` will only be updated with advanced notice by OpenAI.", + "default": "text-moderation-stable" + }, + "categories": { + "allOf": [ + { "$ref": "#/components/schemas/OpenAIModerationCategories" } + ], + "description": "The categories of content to check for moderation.", + "default": { + "harassment": true, + "harassment_threatening": true, + "hate": true, + "hate_threatening": true, + "self_harm": true, + "self_harm_instructions": true, + "self_harm_intent": true, + "sexual": true, + "sexual_minors": true, + "violence": true, + "violence_graphic": true + } + } + }, + "type": "object", + "title": "OpenAIModerationSettings" + }, + "ProductSentimentPolarityEntry": { + "properties": { "output": { "type": "string", "title": "Output" } }, + "type": "object", + "required": ["output"], + "title": "ProductSentimentPolarityEntry" + }, + "ProductSentimentPolarityResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { + "type": "number", + "title": "Score", + "description": "0 - very negative, 1 - subtly negative, 2 - subtly positive, 3 - very positive" + }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed", + "description": "Fails if subtly or very negative" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + }, + "raw_response": { + "type": "string", + "title": "Raw Response", + "default": "The detected sentiment polarity" + } + }, + "type": "object", + "required": ["score", "passed"], + "title": 
"ProductSentimentPolarityResult" + }, + "ProductSentimentPolaritySettings": { + "properties": {}, + "type": "object", + "title": "ProductSentimentPolaritySettings" + }, + "RagasAnswerRelevancyEntry": { + "properties": { + "input": { "type": "string", "title": "Input" }, + "output": { "type": "string", "title": "Output" } + }, + "type": "object", + "required": ["input", "output"], + "title": "RagasAnswerRelevancyEntry" + }, + "RagasContextPrecisionEntry": { + "properties": { + "input": { "type": "string", "title": "Input" }, + "contexts": { + "items": { "type": "string" }, + "type": "array", + "title": "Contexts" + }, + "expected_output": { "type": "string", "title": "Expected Output" } + }, + "type": "object", + "required": ["input", "contexts", "expected_output"], + "title": "RagasContextPrecisionEntry" + }, + "RagasContextRecallEntry": { + "properties": { + "contexts": { + "items": { "type": "string" }, + "type": "array", + "title": "Contexts" + }, + "expected_output": { "type": "string", "title": "Expected Output" } + }, + "type": "object", + "required": ["contexts", "expected_output"], + "title": "RagasContextRecallEntry" + }, + "RagasContextRelevancyEntry": { + "properties": { + "output": { "type": "string", "title": "Output" }, + "contexts": { + "items": { "type": "string" }, + "type": "array", + "title": "Contexts" + } + }, + "type": "object", + "required": ["output", "contexts"], + "title": "RagasContextRelevancyEntry" + }, + "RagasContextUtilizationEntry": { + "properties": { + "input": { "type": "string", "title": "Input" }, + "output": { "type": "string", "title": "Output" }, + "contexts": { + "items": { "type": "string" }, + "type": "array", + "title": "Contexts" + } + }, + "type": "object", + "required": ["input", "output", "contexts"], + "title": "RagasContextUtilizationEntry" + }, + "RagasFaithfulnessEntry": { + "properties": { + "output": { "type": "string", "title": "Output" }, + "contexts": { + "items": { "type": "string" }, + "type": "array", + "title": "Contexts" + } + }, + "type": "object", + "required": ["output", "contexts"], + "title": "RagasFaithfulnessEntry" + }, + "RagasResult": { + "properties": { + "status": { + "type": "string", + "enum": ["processed"], + "const": "processed", + "title": "Status", + "default": "processed" + }, + "score": { "type": "number", "title": "Score" }, + "passed": { + "anyOf": [{ "type": "boolean" }, { "type": "null" }], + "title": "Passed" + }, + "details": { + "anyOf": [{ "type": "string" }, { "type": "null" }], + "title": "Details", + "description": "Short human-readable description of the result" + }, + "cost": { + "anyOf": [ + { "$ref": "#/components/schemas/Money" }, + { "type": "null" } + ] + } + }, + "type": "object", + "required": ["score"], + "title": "RagasResult" + }, + "RagasSettings": { + "properties": { + "model": { + "type": "string", + "enum": [ + "openai/gpt-3.5-turbo-1106", + "openai/gpt-3.5-turbo-0125", + "openai/gpt-3.5-turbo-16k", + "openai/gpt-4-1106-preview", + "openai/gpt-4-0125-preview", + "openai/gpt-4o", + "azure/gpt-35-turbo-1106", + "azure/gpt-35-turbo-16k", + "azure/gpt-4-1106-preview", + "anthropic/claude-3-haiku-20240307" + ], + "title": "Model", + "description": "The model to use for evaluation.", + "default": "azure/gpt-35-turbo-16k" + }, + "embeddings_model": { + "type": "string", + "enum": [ + "openai/text-embedding-ada-002", + "azure/text-embedding-ada-002" + ], + "title": "Embeddings Model", + "description": "The model to use for embeddings.", + "default": "azure/text-embedding-ada-002" + }, + 
"max_tokens": { + "type": "integer", + "title": "Max Tokens", + "description": "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + "default": 2048 + } + }, + "type": "object", + "title": "RagasSettings" + }, + "ValidationError": { + "properties": { + "loc": { + "items": { "anyOf": [{ "type": "string" }, { "type": "integer" }] }, + "type": "array", + "title": "Location" + }, + "msg": { "type": "string", "title": "Message" }, + "type": { "type": "string", "title": "Error Type" } + }, + "type": "object", + "required": ["loc", "msg", "type"], + "title": "ValidationError" + }, + "__main____create_evaluator_routes___locals___Request__1": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/AWSComprehendPIIDetectionEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { + "$ref": "#/components/schemas/AWSComprehendPIIDetectionSettings" + }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__10": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/AzureJailbreakEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/AzureJailbreakSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__11": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/AzurePromptShieldEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/AzurePromptShieldSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__12": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/ExampleWordCountEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries 
to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/ExampleWordCountSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__13": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/CustomBasicEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/CustomBasicSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__14": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/CompetitorBlocklistEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/CompetitorBlocklistSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__15": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/CompetitorLLMEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/CompetitorLLMSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__16": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/CompetitorLLMFunctionCallEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field 
type for the necessary keys" + }, + "settings": { + "anyOf": [ + { + "$ref": "#/components/schemas/CompetitorLLMFunctionCallSettings" + }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__17": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/CustomLLMBooleanEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/CustomLLMBooleanSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__18": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/CustomLLMScoreEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/CustomLLMScoreSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__19": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/OffTopicEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/OffTopicSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__2": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/LinguaLanguageDetectionEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + 
"settings": { + "anyOf": [ + { + "$ref": "#/components/schemas/LinguaLanguageDetectionSettings" + }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__20": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/ProductSentimentPolarityEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { + "$ref": "#/components/schemas/ProductSentimentPolaritySettings" + }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__21": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/CustomSimilarityEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/CustomSimilaritySettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__22": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/OpenAIModerationEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/OpenAIModerationSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__23": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/LlamaGuardEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { 
+ "anyOf": [ + { "$ref": "#/components/schemas/LlamaGuardSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__24": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/HaystackFaithfulnessEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/HaystackFaithfulnessSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__25": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/GoogleCloudDLPPIIDetectionEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { + "$ref": "#/components/schemas/GoogleCloudDLPPIIDetectionSettings" + }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__3": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/RagasAnswerRelevancyEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__4": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/RagasContextPrecisionEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + 
"anyOf": [ + { "$ref": "#/components/schemas/RagasSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__5": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/RagasContextRecallEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__6": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/RagasContextRelevancyEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__7": { + "properties": { + "data": { + "items": { + "$ref": "#/components/schemas/RagasContextUtilizationEntry" + }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasSettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__8": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/RagasFaithfulnessEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/RagasSettings" }, + { 
"type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + }, + "__main____create_evaluator_routes___locals___Request__9": { + "properties": { + "data": { + "items": { "$ref": "#/components/schemas/AzureContentSafetyEntry" }, + "type": "array", + "title": "Data", + "description": "List of entries to be evaluated, check the field type for the necessary keys" + }, + "settings": { + "anyOf": [ + { "$ref": "#/components/schemas/AzureContentSafetySettings" }, + { "type": "null" } + ], + "description": "Evaluator settings, check the field type for what settings this evaluator supports" + }, + "env": { + "anyOf": [ + { + "additionalProperties": { "type": "string" }, + "type": "object" + }, + { "type": "null" } + ], + "title": "Env", + "description": "Optional environment variables to override the server ones", + "example": {} + } + }, + "additionalProperties": false, + "type": "object", + "required": ["data"], + "title": "Request" + } + } + } +} diff --git a/langevals/documentation/API-example.mdx b/langevals/documentation/API-example.mdx new file mode 100644 index 000000000..34bf4e024 --- /dev/null +++ b/langevals/documentation/API-example.mdx @@ -0,0 +1,43 @@ +--- +title: "API Example" +--- + +The LangEvals server can be run both locally and on a server. +This guide will quickly explain how to set up LangEvals on your machine and make API calls to the evaluators. + + + +## Set Up Locally + + + +Poetry is used for managing virtual environments in LangEvals. Follow the official [Poetry installation guide](https://python-poetry.org/docs/) for detailed instructions. + + + +Once Poetry is installed, you can start the LangEvals server with the following command from the root of the project: +```bash +poetry run python langevals/server.py +``` + + + +With the server running, you can now make API calls to the evaluators on your machine or server: +```bash +curl -X POST "https://http://127.0.0.1:8000/ragas/context_precision/evaluate" \ +-H "Content-Type: application/json" \ +-H "Authorization: Bearer YOUR_API_KEY" \ +-d '{ +"contexts": ["Context 1", "Context 2"], +"relevant_items": ["Relevant item 1", "Relevant item 2"] +}' +``` + + +Great job! Now you can easily use evaluators on your machine and assess your LLMs in the way you want it. +For detailed API documentation please refer to [API Reference](/api-reference/) + + + + + \ No newline at end of file diff --git a/langevals/documentation/batch-evaluation.mdx b/langevals/documentation/batch-evaluation.mdx new file mode 100644 index 000000000..178f7739b --- /dev/null +++ b/langevals/documentation/batch-evaluation.mdx @@ -0,0 +1,72 @@ +--- +title: "Batch Evaluation" +--- +When exploring, it is common to generate multiple outputs from your LLM and then evaluate their performance scores, for example, using a Jupyter Notebook. +LangEvals provides the `evaluate()` function to score these results in batch using diverse evaluators. +This section will guide you through batch evaluation with multiple evaluators and demonstrate how to conveniently access the results. 
+ +### Importing the Library + +First, import `langevals` along with the evaluators that you will use. + +```python +import langevals +from langevals_ragas.answer_relevancy import RagasAnswerRelevancyEvaluator +from langevals_langevals.competitor_blocklist import ( + CompetitorBlocklistEvaluator, + CompetitorBlocklistSettings, +) +``` + +### Creating Evaluation Dataset + +Next, create a pandas DataFrame with `input` and `output` columns. +These columns hold the inputs given to the LLM and the outputs it generated. +It is important to name these columns exactly as shown to ensure compatibility with the evaluators. +Some evaluators may require additional fields such as `contexts` and `expected_output`. + +```python +import pandas as pd + +entries = pd.DataFrame( + { + "input": ["hello", "how are you?", "what is your name?"], + "output": ["hi", "I am a chatbot, no feelings", "My name is Bob"], + } +) +``` + +### Run Evaluations + +With a single call to the `evaluate()` function, you can evaluate all data entries using the specified evaluators. +Note that certain evaluators require a `settings` parameter. +In this case, it is used to define the competitor's name to be blocklisted. +For further documentation, refer to [Evaluators](/langevals/documentation/evaluators). + +```python +results = langevals.evaluate( + entries, + [ + RagasAnswerRelevancyEvaluator(), + CompetitorBlocklistEvaluator( + settings=CompetitorBlocklistSettings(competitors=["Bob"]) + ), + ], +) +``` + +### Access the Results + +Finally, the results can be accessed as a pandas DataFrame. + +```python +results.to_pandas() +``` + +Results: + +| input | output | answer_relevancy | competitor_blocklist | competitor_blocklist_details | +| ------------------ | --------------------------- | ---------------- | -------------------- | ---------------------------- | +| hello | hi | 0.800714 | True | None | +| how are you? | I am a chatbot, no feelings | 0.813168 | True | None | +| what is your name? | My name is Bob | 0.971663 | False | Competitors mentioned: Bob | diff --git a/langevals/documentation/evaluators.mdx b/langevals/documentation/evaluators.mdx new file mode 100644 index 000000000..cc2de5a09 --- /dev/null +++ b/langevals/documentation/evaluators.mdx @@ -0,0 +1,8 @@ +LangEvals provides 20 versatile evaluators that are grouped according to their use cases. +Each evaluator is a purposeful piece of software that can be explored on this page. For an in-depth overview of an evaluator, follow the link to its dedicated page. + +## General + +import EvaluatorsList from "/snippets/evaluators-list.mdx" + + \ No newline at end of file diff --git a/langevals/documentation/getting-started.mdx b/langevals/documentation/getting-started.mdx new file mode 100644 index 000000000..481232b88 --- /dev/null +++ b/langevals/documentation/getting-started.mdx @@ -0,0 +1,248 @@ +--- +title: "Getting Started" +--- +### Running Batch Evaluations on Notebooks + +When exploring, it is common to generate a number of outputs from your LLM and then evaluate them all for a performance score, for example in a Jupyter Notebook.
You can use LangEvals `evaluate()` to score the results in batch using diverse evaluators: + +```python +import langevals +from langevals_ragas.answer_relevancy import RagasAnswerRelevancyEvaluator +from langevals_langevals.competitor_blocklist import ( + CompetitorBlocklistEvaluator, + CompetitorBlocklistSettings, +) +import pandas as pd + +entries = pd.DataFrame( + { + "input": ["hello", "how are you?", "what is your name?"], + "output": ["hi", "I am a chatbot, no feelings", "My name is Bob"], + } +) + +results = langevals.evaluate( + entries, + [ + RagasAnswerRelevancyEvaluator(), + CompetitorBlocklistEvaluator( + settings=CompetitorBlocklistSettings(competitors=["Bob"]) + ), + ], +) + +results.to_pandas() +``` + +Results: + +| input | output | answer_relevancy | competitor_blocklist | competitor_blocklist_details | +| ------------------ | --------------------------- | ---------------- | -------------------- | ---------------------------- | +| hello | hi | 0.800714 | True | None | +| how are you? | I am a chatbot, no feelings | 0.813168 | True | None | +| what is your name? | My name is Bob | 0.971663 | False | Competitors mentioned: Bob | + +### Unit Test Evaluations with PyTest + +Combining various pytest plugins with LangEvals makes for a powerful way to write unit tests for LLMs and prevent regressions. Due to the probabilistic nature of LLMs, some extra care is needed as you will see below. + +#### Simple assertions - entity extraction test example + +The first, simpler case is when the expected LLM output is fairly unambiguous, for example when extracting address entities from natural language text. In this example we use the [instructor library](https://github.com/jxnl/instructor) to have the LLM extract values into a Pydantic model, together with the [litellm](https://github.com/BerriAI/litellm) library to call multiple LLM models: + +```python + +from itertools import product +import pytest +import pandas as pd + +import instructor + +from litellm import completion +from pydantic import BaseModel + + +class Address(BaseModel): + number: int + street_name: str + city: str + country: str + + +entries = pd.DataFrame( + { + "input": [ + "Please send the package to 123 Main St, Springfield.", + "J'ai déménagé récemment à 56 Rue de l'Université, Paris.", + "A reunião será na Avenida Paulista, 900, São Paulo.", + ], + "expected_output": [ + Address( + number=123, street_name="Main St", city="Springfield", country="USA" + ).model_dump_json(), + Address( + number=56, + street_name="Rue de l'Université", + city="Paris", + country="France", + ).model_dump_json(), + Address( + number=900, + street_name="Avenida Paulista", + city="São Paulo", + country="Brazil", + ).model_dump_json(), + ], + } +) + +models = ["gpt-3.5-turbo", "gpt-4-turbo", "groq/llama3-70b-8192"] + +client = instructor.from_litellm(completion) + + +@pytest.mark.parametrize("entry, model", product(entries.itertuples(), models)) +@pytest.mark.flaky(max_runs=3) +@pytest.mark.pass_rate(0.6) +def test_extracts_the_right_address(entry, model): + address = client.chat.completions.create( + model=model, + response_model=Address, + messages=[ + {"role": "user", "content": entry.input}, + ], + temperature=0.0, + ) + + assert address.model_dump_json() == entry.expected_output +``` + +In the example above, our test actually becomes 9 tests, checking for address extraction correctness in each of the 3 samples against 3 different models `gpt-3.5-turbo`, `gpt-4-turbo` and `groq/llama3`.
This is done by the `@pytest.mark.parametrize` annotation and the `product` function to combine entries and models. The actual assertion is a simple `assert` with `==` comparison as you can see in the last line. + +Apart from `parametrize`, we also use the [flaky](https://github.com/box/flaky) library for retries with `@pytest.mark.flaky(max_runs=3)`; this effectively gives us 3-shot prompting with our LLM. If you wish, you can also ensure the majority of the attempts are correct by using `@pytest.mark.flaky(max_runs=3, min_passes=2)`. + +Lastly, we use the `@pytest.mark.pass_rate` annotation provided by LangEvals; this allows the test to pass even if some samples fail, as they do for example when the model guesses "United States" instead of "USA" for the country field. Since LLMs are probabilistic, this is necessary for bringing more stability to your test suite, while still ensuring a minimum threshold of accuracy, which in our case is defined as `0.6` (60%). + +#### Using LangEvals Evaluators - LLM-as-a-Judge + +As things get more nuanced and less objective, exact string matches are no longer possible. We can then rely on LangEvals evaluators for validating many aspects of the LLM inputs and outputs. For complete flexibility, we can use for example a custom LLM-as-a-judge, with `CustomLLMBooleanEvaluator`. In the example below we validate that more than 80% of the recipes generated are vegetarian: + +```python +from langevals import expect + +entries = pd.DataFrame( + { + "input": [ + "Generate me a recipe for a quick breakfast with bacon", + "Generate me a recipe for a lunch using lentils", + "Generate me a recipe for a vegetarian dessert", + ], + } +) + +@pytest.mark.parametrize("entry", entries.itertuples()) +@pytest.mark.flaky(max_runs=3) +@pytest.mark.pass_rate(0.8) +def test_llm_as_judge(entry): + response: ModelResponse = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "system", + "content": "You are a tweet-size recipe generator, just recipe name and ingredients, no yapping.", + }, + {"role": "user", "content": entry.input}, + ], + temperature=0.0, + ) # type: ignore + recipe = response.choices[0].message.content # type: ignore + + vegetarian_checker = CustomLLMBooleanEvaluator( + settings=CustomLLMBooleanSettings( + prompt="Is the recipe vegetarian?", + ) + ) + + expect(input=entry.input, output=recipe).to_pass(vegetarian_checker) +``` + +This test fails with a nice explanation from the LLM judge: + +```python +FAILED tests/test_llm_as_judge.py::test_llm_as_judge[entry0] - AssertionError: Custom LLM Boolean Evaluator to_pass FAILED - The recipe for a quick breakfast with bacon includes bacon strips, making it a non-vegetarian recipe. +``` + +Notice we use the `expect` assertion utility; it makes it easier to run the evaluation and prints a nice output with a detailed explanation in case of failures. The `expect` utility interface is modeled after Jest assertions, so you can expect a somewhat similar API if you are experienced with Jest.
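+ +If you want to run the examples above, they execute like any other pytest suite; the `flaky` retries and the LangEvals `pass_rate` threshold are applied through their pytest plugins, so no extra flags should be needed. The path below is only a placeholder for wherever your test files live: + +```bash +# Run the LLM unit tests verbosely; adjust the path to your own tests folder +pytest tests/ -v +```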
+ +#### Using LangEvals Evaluators - Out of the box evaluators + +Just like `CustomLLMBooleanEvaluator`, you can use any other evaluator available from LangEvals to prevent regressions on a variety of cases. For example, here we check that the LLM answers are always in English, regardless of the language used in the question, and we also measure how relevant the answers are to the question: + +```python +entries = pd.DataFrame( + { + "input": [ + "What's the connection between 'breaking the ice' and the Titanic's first voyage?", + "Comment la bataille de Verdun a-t-elle influencé la cuisine française?", + "¿Puede el musgo participar en la purificación del aire en espacios cerrados?", + ], + } +) + + +@pytest.mark.parametrize("entry", entries.itertuples()) +@pytest.mark.flaky(max_runs=3) +@pytest.mark.pass_rate(0.8) +def test_language_and_relevancy(entry): + response: ModelResponse = litellm.completion( + model="gpt-3.5-turbo", + messages=[ + { + "role": "system", + "content": "You reply to questions only in English, no matter the language the question was asked in", + }, + {"role": "user", "content": entry.input}, + ], + temperature=0.0, + ) # type: ignore + answer = response.choices[0].message.content # type: ignore + + language_checker = LinguaLanguageDetectionEvaluator( + settings=LinguaLanguageDetectionSettings( + check_for="output_matches_language", + expected_language="EN", + ) + ) + answer_relevancy_checker = RagasAnswerRelevancyEvaluator() + + expect(input=entry.input, output=answer).to_pass(language_checker) + expect(input=entry.input, output=answer).score( + answer_relevancy_checker + ).to_be_greater_than(0.8) +``` + +In this example we are now not only validating a boolean assertion, but also making sure that 80% of our samples keep an answer relevancy score above 0.8 from the Ragas Answer Relevancy Evaluator. + +# Contributing + +LangEvals is a monorepo and has many subpackages with different dependencies for each evaluator library or provider. We use Poetry to install all dependencies and create a virtual env for each sub-package to make sure they are fully isolated. Given this complexity, to make it easier to contribute to LangEvals we recommend using VS Code for development. Before opening it in VS Code though, you need to make sure all dependencies are installed, thus generating the .venv for each package: + +``` +make install +``` + +This will also generate the `langevals.code-workspace` file, creating a different workspace per evaluator and telling VS Code which venv to use for each. Then, open this file in VS Code and click the "Open Workspace" button. + +## Adding New Evaluators + +To add a completely new evaluator for a library or API that is not already implemented, copy the `evaluators/example` folder, and follow the `example/word_count.py` boilerplate to implement your own evaluator, adding the dependencies to `pyproject.toml`, and testing it properly, following the `test_word_count.py` example. + +If you want to add a new eval to an existing evaluator package (if OpenAI launches a new API, for example), simply create a new Python file next to the existing ones. + +To test it all together, run: + +``` +make lock +make install +make test +``` diff --git a/langevals/documentation/introduction.mdx b/langevals/documentation/introduction.mdx new file mode 100644 index 000000000..f43c01acc --- /dev/null +++ b/langevals/documentation/introduction.mdx @@ -0,0 +1,57 @@ +--- +title: LangEvals +sidebarTitle: Introduction +--- +
+
+ + LangEvals Repo + +
+ +
+ + LangEvals version + +
+
+ +[LangEvals](https://github.com/langwatch/langevals) is the standalone LLM evaluations framework that powers LangWatch evaluations. + +LangEvals integrates many APIs and other open-source evaluators under the same interface, to be used locally as a library. + +It can be used in notebooks for **exploration**, in pytest for writing **unit tests** or as a server API for **live-evaluations** and **guardrails**. +LangEvals is modular, including 20+ evaluators such as Ragas for RAG quality, OpenAI Moderation and Azure Jailbreak detection for safety, and many others, all under the same interface. + + + + + Start evaluating your LLMs in a few lines of code. + + + + Learn how to use our evaluators and how to make yours. + + + + Create comprehensive testing with extensive edge-case coverage. + + + + Learn how to evaluate your AI application from our own use cases. + + + \ No newline at end of file diff --git a/langevals/documentation/modular-architecture/base-evaluator.mdx b/langevals/documentation/modular-architecture/base-evaluator.mdx new file mode 100644 index 000000000..5738976d7 --- /dev/null +++ b/langevals/documentation/modular-architecture/base-evaluator.mdx @@ -0,0 +1,1059 @@ +# Module `module.name` Documentation + +## Class `BaseEvaluator` +Usage docs: https://docs.pydantic.dev/2.7/concepts/models/ + +A base class for creating Pydantic models. + +Attributes: + __class_vars__: The names of classvars defined on the model. + __private_attributes__: Metadata about the private attributes of the model. + __signature__: The signature for instantiating the model. + + __pydantic_complete__: Whether model building is completed, or if there are still undefined fields. + __pydantic_core_schema__: The pydantic-core schema used to build the SchemaValidator and SchemaSerializer. + __pydantic_custom_init__: Whether the model has a custom `__init__` function. + __pydantic_decorators__: Metadata containing the decorators defined on the model. + This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1. + __pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to + __args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these. + __pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models. + __pydantic_post_init__: The name of the post-init method for the model, if defined. + __pydantic_root_model__: Whether the model is a `RootModel`. + __pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model. + __pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model. + + __pydantic_extra__: An instance attribute with the values of extra fields from validation when + `model_config['extra'] == 'allow'`. + __pydantic_fields_set__: An instance attribute with the names of fields explicitly set. + __pydantic_private__: Instance attribute with the values of private attributes set on the model instance. + +### Method `__copy__` +Returns a shallow copy of the model. + +### Method `__deepcopy__` +Returns a deep copy of the model. + +### Method `__delattr__` +Implement delattr(self, name). + +### Method `__eq__` +Return self==value. + +### Method `__getattr__` +None + +### Method `__getstate__` +Helper for pickle. + +### Method `__init__` +None + +### Method `__iter__` +So `dict(model)` works. + +### Method `__pretty__` +Used by devtools (https://python-devtools.helpmanual.io/) to pretty print objects.
+ +### Method `__repr__` +Return repr(self). + +### Method `__repr_args__` +None + +### Method `__repr_name__` +Name of the instance's class, used in __repr__. + +### Method `__repr_str__` +None + +### Method `__rich_repr__` +Used by Rich (https://rich.readthedocs.io/en/stable/pretty.html) to pretty print objects. + +### Method `__setattr__` +Implement setattr(self, name, value). + +### Method `__setstate__` +None + +### Method `__str__` +Return str(self). + +### Method `_calculate_keys` +None + +### Method `_check_frozen` +None + +### Method `_copy_and_set_values` +None + +### Method `_evaluate_entry` +None + +### Method `_iter` +None + +### Method `copy` +Returns a copy of the model. + +!!! warning "Deprecated" + This method is now deprecated; use `model_copy` instead. + +If you need `include` or `exclude`, use: + +```py +data = self.model_dump(include=include, exclude=exclude, round_trip=True) +data = {**data, **(update or {})} +copied = self.model_validate(data) +``` + +Args: + include: Optional set or mapping specifying which fields to include in the copied model. + exclude: Optional set or mapping specifying which fields to exclude in the copied model. + update: Optional dictionary of field-value pairs to override field values in the copied model. + deep: If True, the values of fields that are Pydantic models will be deep-copied. + +Returns: + A copy of the model with included, excluded and updated fields as specified. + +### Method `dict` +None + +### Method `evaluate` +None + +### Method `evaluate_batch` +None + +### Method `get_env` +None + +### Method `json` +None + +### Method `model_copy` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#model_copy + +Returns a copy of the model. + +Args: + update: Values to change/add in the new model. Note: the data is not validated + before creating the new model. You should trust this data. + deep: Set to `True` to make a deep copy of the model. + +Returns: + New model instance. + +### Method `model_dump` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump + +Generate a dictionary representation of the model, optionally specifying which fields to include or exclude. + +Args: + mode: The mode in which `to_python` should run. + If mode is 'json', the output will only contain JSON serializable types. + If mode is 'python', the output may contain non-JSON-serializable Python objects. + include: A set of fields to include in the output. + exclude: A set of fields to exclude from the output. + context: Additional context to pass to the serializer. + by_alias: Whether to use the field's alias in the dictionary key if defined. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A dictionary representation of the model. + +### Method `model_dump_json` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump_json + +Generates a JSON representation of the model using Pydantic's `to_json` method. 
+ +Args: + indent: Indentation to use in the JSON output. If None is passed, the output will be compact. + include: Field(s) to include in the JSON output. + exclude: Field(s) to exclude from the JSON output. + context: Additional context to pass to the serializer. + by_alias: Whether to serialize using field aliases. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A JSON string representation of the model. + +### Method `model_post_init` +Override this method to perform additional initialization after `__init__` and `model_construct`. +This is useful if you want to do some validation that requires the entire model to be initialized. + +## Class `EnvMissingException` +EnvMissingException(message: str) + +### Method `__eq__` +None + +### Method `__init__` +None + +### Method `__repr__` +None + +## Class `EvaluationResult` +Evaluation result for a single entry that was successfully processed. +Score represents different things depending on the evaluator, it can be a percentage, a probability, a distance, etc. +Passed is a boolean that represents if the entry passed the evaluation or not, it can be None if the evaluator does not have a concept of passing or failing. +Details is an optional string that can be used to provide additional information about the evaluation result. + +### Method `__copy__` +Returns a shallow copy of the model. + +### Method `__deepcopy__` +Returns a deep copy of the model. + +### Method `__delattr__` +Implement delattr(self, name). + +### Method `__eq__` +Return self==value. + +### Method `__getattr__` +None + +### Method `__getstate__` +Helper for pickle. + +### Method `__init__` +Create a new model by parsing and validating input data from keyword arguments. + +Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be +validated to form a valid model. + +`self` is explicitly positional-only to allow `self` as a field name. + +### Method `__iter__` +So `dict(model)` works. + +### Method `__pretty__` +Used by devtools (https://python-devtools.helpmanual.io/) to pretty print objects. + +### Method `__repr__` +Return repr(self). + +### Method `__repr_args__` +None + +### Method `__repr_name__` +Name of the instance's class, used in __repr__. + +### Method `__repr_str__` +None + +### Method `__rich_repr__` +Used by Rich (https://rich.readthedocs.io/en/stable/pretty.html) to pretty print objects. + +### Method `__setattr__` +Implement setattr(self, name, value). + +### Method `__setstate__` +None + +### Method `__str__` +Return str(self). + +### Method `_calculate_keys` +None + +### Method `_check_frozen` +None + +### Method `_copy_and_set_values` +None + +### Method `_iter` +None + +### Method `copy` +Returns a copy of the model. + +!!! warning "Deprecated" + This method is now deprecated; use `model_copy` instead. 
+ +If you need `include` or `exclude`, use: + +```py +data = self.model_dump(include=include, exclude=exclude, round_trip=True) +data = {**data, **(update or {})} +copied = self.model_validate(data) +``` + +Args: + include: Optional set or mapping specifying which fields to include in the copied model. + exclude: Optional set or mapping specifying which fields to exclude in the copied model. + update: Optional dictionary of field-value pairs to override field values in the copied model. + deep: If True, the values of fields that are Pydantic models will be deep-copied. + +Returns: + A copy of the model with included, excluded and updated fields as specified. + +### Method `dict` +None + +### Method `json` +None + +### Method `model_copy` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#model_copy + +Returns a copy of the model. + +Args: + update: Values to change/add in the new model. Note: the data is not validated + before creating the new model. You should trust this data. + deep: Set to `True` to make a deep copy of the model. + +Returns: + New model instance. + +### Method `model_dump` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump + +Generate a dictionary representation of the model, optionally specifying which fields to include or exclude. + +Args: + mode: The mode in which `to_python` should run. + If mode is 'json', the output will only contain JSON serializable types. + If mode is 'python', the output may contain non-JSON-serializable Python objects. + include: A set of fields to include in the output. + exclude: A set of fields to exclude from the output. + context: Additional context to pass to the serializer. + by_alias: Whether to use the field's alias in the dictionary key if defined. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A dictionary representation of the model. + +### Method `model_dump_json` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump_json + +Generates a JSON representation of the model using Pydantic's `to_json` method. + +Args: + indent: Indentation to use in the JSON output. If None is passed, the output will be compact. + include: Field(s) to include in the JSON output. + exclude: Field(s) to exclude from the JSON output. + context: Additional context to pass to the serializer. + by_alias: Whether to serialize using field aliases. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. 
+ serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A JSON string representation of the model. + +### Method `model_post_init` +Override this method to perform additional initialization after `__init__` and `model_construct`. +This is useful if you want to do some validation that requires the entire model to be initialized. + +## Class `EvaluationResultError` +Evaluation result marking an entry that failed to be processed due to an error. + +### Method `__copy__` +Returns a shallow copy of the model. + +### Method `__deepcopy__` +Returns a deep copy of the model. + +### Method `__delattr__` +Implement delattr(self, name). + +### Method `__eq__` +Return self==value. + +### Method `__getattr__` +None + +### Method `__getstate__` +Helper for pickle. + +### Method `__init__` +Create a new model by parsing and validating input data from keyword arguments. + +Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be +validated to form a valid model. + +`self` is explicitly positional-only to allow `self` as a field name. + +### Method `__iter__` +So `dict(model)` works. + +### Method `__pretty__` +Used by devtools (https://python-devtools.helpmanual.io/) to pretty print objects. + +### Method `__repr__` +Return repr(self). + +### Method `__repr_args__` +None + +### Method `__repr_name__` +Name of the instance's class, used in __repr__. + +### Method `__repr_str__` +None + +### Method `__rich_repr__` +Used by Rich (https://rich.readthedocs.io/en/stable/pretty.html) to pretty print objects. + +### Method `__setattr__` +Implement setattr(self, name, value). + +### Method `__setstate__` +None + +### Method `__str__` +Return str(self). + +### Method `_calculate_keys` +None + +### Method `_check_frozen` +None + +### Method `_copy_and_set_values` +None + +### Method `_iter` +None + +### Method `copy` +Returns a copy of the model. + +!!! warning "Deprecated" + This method is now deprecated; use `model_copy` instead. + +If you need `include` or `exclude`, use: + +```py +data = self.model_dump(include=include, exclude=exclude, round_trip=True) +data = {**data, **(update or {})} +copied = self.model_validate(data) +``` + +Args: + include: Optional set or mapping specifying which fields to include in the copied model. + exclude: Optional set or mapping specifying which fields to exclude in the copied model. + update: Optional dictionary of field-value pairs to override field values in the copied model. + deep: If True, the values of fields that are Pydantic models will be deep-copied. + +Returns: + A copy of the model with included, excluded and updated fields as specified. + +### Method `dict` +None + +### Method `json` +None + +### Method `model_copy` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#model_copy + +Returns a copy of the model. + +Args: + update: Values to change/add in the new model. Note: the data is not validated + before creating the new model. You should trust this data. + deep: Set to `True` to make a deep copy of the model. + +Returns: + New model instance. + +### Method `model_dump` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump + +Generate a dictionary representation of the model, optionally specifying which fields to include or exclude. + +Args: + mode: The mode in which `to_python` should run. + If mode is 'json', the output will only contain JSON serializable types. + If mode is 'python', the output may contain non-JSON-serializable Python objects. 
+ include: A set of fields to include in the output. + exclude: A set of fields to exclude from the output. + context: Additional context to pass to the serializer. + by_alias: Whether to use the field's alias in the dictionary key if defined. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A dictionary representation of the model. + +### Method `model_dump_json` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump_json + +Generates a JSON representation of the model using Pydantic's `to_json` method. + +Args: + indent: Indentation to use in the JSON output. If None is passed, the output will be compact. + include: Field(s) to include in the JSON output. + exclude: Field(s) to exclude from the JSON output. + context: Additional context to pass to the serializer. + by_alias: Whether to serialize using field aliases. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A JSON string representation of the model. + +### Method `model_post_init` +Override this method to perform additional initialization after `__init__` and `model_construct`. +This is useful if you want to do some validation that requires the entire model to be initialized. + +## Class `EvaluationResultSkipped` +Evaluation result marking an entry that was skipped with an optional details explanation. + +### Method `__copy__` +Returns a shallow copy of the model. + +### Method `__deepcopy__` +Returns a deep copy of the model. + +### Method `__delattr__` +Implement delattr(self, name). + +### Method `__eq__` +Return self==value. + +### Method `__getattr__` +None + +### Method `__getstate__` +Helper for pickle. + +### Method `__init__` +Create a new model by parsing and validating input data from keyword arguments. + +Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be +validated to form a valid model. + +`self` is explicitly positional-only to allow `self` as a field name. + +### Method `__iter__` +So `dict(model)` works. + +### Method `__pretty__` +Used by devtools (https://python-devtools.helpmanual.io/) to pretty print objects. + +### Method `__repr__` +Return repr(self). + +### Method `__repr_args__` +None + +### Method `__repr_name__` +Name of the instance's class, used in __repr__. 
+ +### Method `__repr_str__` +None + +### Method `__rich_repr__` +Used by Rich (https://rich.readthedocs.io/en/stable/pretty.html) to pretty print objects. + +### Method `__setattr__` +Implement setattr(self, name, value). + +### Method `__setstate__` +None + +### Method `__str__` +Return str(self). + +### Method `_calculate_keys` +None + +### Method `_check_frozen` +None + +### Method `_copy_and_set_values` +None + +### Method `_iter` +None + +### Method `copy` +Returns a copy of the model. + +!!! warning "Deprecated" + This method is now deprecated; use `model_copy` instead. + +If you need `include` or `exclude`, use: + +```py +data = self.model_dump(include=include, exclude=exclude, round_trip=True) +data = {**data, **(update or {})} +copied = self.model_validate(data) +``` + +Args: + include: Optional set or mapping specifying which fields to include in the copied model. + exclude: Optional set or mapping specifying which fields to exclude in the copied model. + update: Optional dictionary of field-value pairs to override field values in the copied model. + deep: If True, the values of fields that are Pydantic models will be deep-copied. + +Returns: + A copy of the model with included, excluded and updated fields as specified. + +### Method `dict` +None + +### Method `json` +None + +### Method `model_copy` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#model_copy + +Returns a copy of the model. + +Args: + update: Values to change/add in the new model. Note: the data is not validated + before creating the new model. You should trust this data. + deep: Set to `True` to make a deep copy of the model. + +Returns: + New model instance. + +### Method `model_dump` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump + +Generate a dictionary representation of the model, optionally specifying which fields to include or exclude. + +Args: + mode: The mode in which `to_python` should run. + If mode is 'json', the output will only contain JSON serializable types. + If mode is 'python', the output may contain non-JSON-serializable Python objects. + include: A set of fields to include in the output. + exclude: A set of fields to exclude from the output. + context: Additional context to pass to the serializer. + by_alias: Whether to use the field's alias in the dictionary key if defined. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A dictionary representation of the model. + +### Method `model_dump_json` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump_json + +Generates a JSON representation of the model using Pydantic's `to_json` method. + +Args: + indent: Indentation to use in the JSON output. If None is passed, the output will be compact. + include: Field(s) to include in the JSON output. + exclude: Field(s) to exclude from the JSON output. + context: Additional context to pass to the serializer. 
+ by_alias: Whether to serialize using field aliases. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A JSON string representation of the model. + +### Method `model_post_init` +Override this method to perform additional initialization after `__init__` and `model_construct`. +This is useful if you want to do some validation that requires the entire model to be initialized. + +## Class `EvaluatorEntry` +Entry datapoint for an evaluator, it should contain all the necessary information for the evaluator to run. + +Available fields are: + +input: The user or LLM input given to the model +output: The LLM generated output +contexts: A list of strings of the contexts that were considered when generating the LLM response +expected_output: The ground truth of what the LLM should have generated, for comparison with the actual generated output + +### Method `__copy__` +Returns a shallow copy of the model. + +### Method `__deepcopy__` +Returns a deep copy of the model. + +### Method `__delattr__` +Implement delattr(self, name). + +### Method `__eq__` +Return self==value. + +### Method `__getattr__` +None + +### Method `__getstate__` +Helper for pickle. + +### Method `__init__` +Create a new model by parsing and validating input data from keyword arguments. + +Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be +validated to form a valid model. + +`self` is explicitly positional-only to allow `self` as a field name. + +### Method `__iter__` +So `dict(model)` works. + +### Method `__pretty__` +Used by devtools (https://python-devtools.helpmanual.io/) to pretty print objects. + +### Method `__repr__` +Return repr(self). + +### Method `__repr_args__` +None + +### Method `__repr_name__` +Name of the instance's class, used in __repr__. + +### Method `__repr_str__` +None + +### Method `__rich_repr__` +Used by Rich (https://rich.readthedocs.io/en/stable/pretty.html) to pretty print objects. + +### Method `__setattr__` +Implement setattr(self, name, value). + +### Method `__setstate__` +None + +### Method `__str__` +Return str(self). + +### Method `_calculate_keys` +None + +### Method `_check_frozen` +None + +### Method `_copy_and_set_values` +None + +### Method `_iter` +None + +### Method `copy` +Returns a copy of the model. + +!!! warning "Deprecated" + This method is now deprecated; use `model_copy` instead. + +If you need `include` or `exclude`, use: + +```py +data = self.model_dump(include=include, exclude=exclude, round_trip=True) +data = {**data, **(update or {})} +copied = self.model_validate(data) +``` + +Args: + include: Optional set or mapping specifying which fields to include in the copied model. + exclude: Optional set or mapping specifying which fields to exclude in the copied model. + update: Optional dictionary of field-value pairs to override field values in the copied model. + deep: If True, the values of fields that are Pydantic models will be deep-copied. 
+ +Returns: + A copy of the model with included, excluded and updated fields as specified. + +### Method `dict` +None + +### Method `json` +None + +### Method `model_copy` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#model_copy + +Returns a copy of the model. + +Args: + update: Values to change/add in the new model. Note: the data is not validated + before creating the new model. You should trust this data. + deep: Set to `True` to make a deep copy of the model. + +Returns: + New model instance. + +### Method `model_dump` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump + +Generate a dictionary representation of the model, optionally specifying which fields to include or exclude. + +Args: + mode: The mode in which `to_python` should run. + If mode is 'json', the output will only contain JSON serializable types. + If mode is 'python', the output may contain non-JSON-serializable Python objects. + include: A set of fields to include in the output. + exclude: A set of fields to exclude from the output. + context: Additional context to pass to the serializer. + by_alias: Whether to use the field's alias in the dictionary key if defined. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A dictionary representation of the model. + +### Method `model_dump_json` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump_json + +Generates a JSON representation of the model using Pydantic's `to_json` method. + +Args: + indent: Indentation to use in the JSON output. If None is passed, the output will be compact. + include: Field(s) to include in the JSON output. + exclude: Field(s) to exclude from the JSON output. + context: Additional context to pass to the serializer. + by_alias: Whether to serialize using field aliases. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A JSON string representation of the model. + +### Method `model_post_init` +Override this method to perform additional initialization after `__init__` and `model_construct`. +This is useful if you want to do some validation that requires the entire model to be initialized. + +## Class `Money` +Usage docs: https://docs.pydantic.dev/2.7/concepts/models/ + +A base class for creating Pydantic models. + +Attributes: + __class_vars__: The names of classvars defined on the model. 
+ __private_attributes__: Metadata about the private attributes of the model. + __signature__: The signature for instantiating the model. + + __pydantic_complete__: Whether model building is completed, or if there are still undefined fields. + __pydantic_core_schema__: The pydantic-core schema used to build the SchemaValidator and SchemaSerializer. + __pydantic_custom_init__: Whether the model has a custom `__init__` function. + __pydantic_decorators__: Metadata containing the decorators defined on the model. + This replaces `Model.__validators__` and `Model.__root_validators__` from Pydantic V1. + __pydantic_generic_metadata__: Metadata for generic models; contains data used for a similar purpose to + __args__, __origin__, __parameters__ in typing-module generics. May eventually be replaced by these. + __pydantic_parent_namespace__: Parent namespace of the model, used for automatic rebuilding of models. + __pydantic_post_init__: The name of the post-init method for the model, if defined. + __pydantic_root_model__: Whether the model is a `RootModel`. + __pydantic_serializer__: The pydantic-core SchemaSerializer used to dump instances of the model. + __pydantic_validator__: The pydantic-core SchemaValidator used to validate instances of the model. + + __pydantic_extra__: An instance attribute with the values of extra fields from validation when + `model_config['extra'] == 'allow'`. + __pydantic_fields_set__: An instance attribute with the names of fields explicitly set. + __pydantic_private__: Instance attribute with the values of private attributes set on the model instance. + +### Method `__copy__` +Returns a shallow copy of the model. + +### Method `__deepcopy__` +Returns a deep copy of the model. + +### Method `__delattr__` +Implement delattr(self, name). + +### Method `__eq__` +Return self==value. + +### Method `__getattr__` +None + +### Method `__getstate__` +Helper for pickle. + +### Method `__init__` +Create a new model by parsing and validating input data from keyword arguments. + +Raises [`ValidationError`][pydantic_core.ValidationError] if the input data cannot be +validated to form a valid model. + +`self` is explicitly positional-only to allow `self` as a field name. + +### Method `__iter__` +So `dict(model)` works. + +### Method `__pretty__` +Used by devtools (https://python-devtools.helpmanual.io/) to pretty print objects. + +### Method `__repr__` +Return repr(self). + +### Method `__repr_args__` +None + +### Method `__repr_name__` +Name of the instance's class, used in __repr__. + +### Method `__repr_str__` +None + +### Method `__rich_repr__` +Used by Rich (https://rich.readthedocs.io/en/stable/pretty.html) to pretty print objects. + +### Method `__setattr__` +Implement setattr(self, name, value). + +### Method `__setstate__` +None + +### Method `__str__` +Return str(self). + +### Method `_calculate_keys` +None + +### Method `_check_frozen` +None + +### Method `_copy_and_set_values` +None + +### Method `_iter` +None + +### Method `copy` +Returns a copy of the model. + +!!! warning "Deprecated" + This method is now deprecated; use `model_copy` instead. + +If you need `include` or `exclude`, use: + +```py +data = self.model_dump(include=include, exclude=exclude, round_trip=True) +data = {**data, **(update or {})} +copied = self.model_validate(data) +``` + +Args: + include: Optional set or mapping specifying which fields to include in the copied model. + exclude: Optional set or mapping specifying which fields to exclude in the copied model. 
+ update: Optional dictionary of field-value pairs to override field values in the copied model. + deep: If True, the values of fields that are Pydantic models will be deep-copied. + +Returns: + A copy of the model with included, excluded and updated fields as specified. + +### Method `dict` +None + +### Method `json` +None + +### Method `model_copy` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#model_copy + +Returns a copy of the model. + +Args: + update: Values to change/add in the new model. Note: the data is not validated + before creating the new model. You should trust this data. + deep: Set to `True` to make a deep copy of the model. + +Returns: + New model instance. + +### Method `model_dump` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump + +Generate a dictionary representation of the model, optionally specifying which fields to include or exclude. + +Args: + mode: The mode in which `to_python` should run. + If mode is 'json', the output will only contain JSON serializable types. + If mode is 'python', the output may contain non-JSON-serializable Python objects. + include: A set of fields to include in the output. + exclude: A set of fields to exclude from the output. + context: Additional context to pass to the serializer. + by_alias: Whether to use the field's alias in the dictionary key if defined. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A dictionary representation of the model. + +### Method `model_dump_json` +Usage docs: https://docs.pydantic.dev/2.7/concepts/serialization/#modelmodel_dump_json + +Generates a JSON representation of the model using Pydantic's `to_json` method. + +Args: + indent: Indentation to use in the JSON output. If None is passed, the output will be compact. + include: Field(s) to include in the JSON output. + exclude: Field(s) to exclude from the JSON output. + context: Additional context to pass to the serializer. + by_alias: Whether to serialize using field aliases. + exclude_unset: Whether to exclude fields that have not been explicitly set. + exclude_defaults: Whether to exclude fields that are set to their default value. + exclude_none: Whether to exclude fields that have a value of `None`. + round_trip: If True, dumped values should be valid as input for non-idempotent types such as Json[T]. + warnings: How to handle serialization errors. False/"none" ignores them, True/"warn" logs errors, + "error" raises a [`PydanticSerializationError`][pydantic_core.PydanticSerializationError]. + serialize_as_any: Whether to serialize fields with duck-typing serialization behavior. + +Returns: + A JSON string representation of the model. + +### Method `model_post_init` +Override this method to perform additional initialization after `__init__` and `model_construct`. +This is useful if you want to do some validation that requires the entire model to be initialized. 
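+
+Since all of the classes above are standard Pydantic models, the serialization helpers documented here can be used directly. A minimal sketch (the import path and the field values are illustrative, not taken from a real evaluation):
+
+```py
+from langevals_core.base_evaluator import Money  # assumed import path
+
+cost = Money(currency="USD", amount=0.0004315)
+
+print(cost.model_dump())                        # {'currency': 'USD', 'amount': 0.0004315}
+print(cost.model_dump_json(exclude_none=True))  # '{"currency":"USD","amount":0.0004315}'
+```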
+ diff --git a/langevals/documentation/modular-architecture/contributing.mdx b/langevals/documentation/modular-architecture/contributing.mdx new file mode 100644 index 000000000..7bad24857 --- /dev/null +++ b/langevals/documentation/modular-architecture/contributing.mdx @@ -0,0 +1,21 @@ +LangEvals is a monorepo and has many subpackages with different dependencies for each evaluator library or provider. We use poetry to install all dependencies and create a virtual env for each sub-package to make sure they are fully isolated. Given this complexity, to make it easier to contribute to LangEvals we recomend using VS Code for the development. Before opening up on VS Code though, you need to make sure to install all dependencies, generating thus the .venv for each package: + +``` +make install +``` + +This will also generate the `langevals.code-workspace` file, creating a different workspace per evaluator and telling VS Code which venv to use for each. Then, open this file on vscode and click the "Open Workspace" button + +## Adding New Evaluators + +To add a completely new evaluator for a library or API that is not already implemented, copy the `evaluators/example` folder, and follow the `example/word_count.py` boilerplate to implement your own evaluator, adding the dependencies on `pyproject.toml`, and testing it properly, following the `test_word_count.py` example. + +If you want to add a new eval to an existing evaluator package (say, if OpenAI launches a new API for example), simply create a new Python file next to the existing ones. + +To test it all together, run: + +``` +make lock +make install +make test +``` diff --git a/langevals/documentation/plan.mdx b/langevals/documentation/plan.mdx new file mode 100644 index 000000000..7264364fb --- /dev/null +++ b/langevals/documentation/plan.mdx @@ -0,0 +1,33 @@ +LangEval Docs + +- Introduction - DONE +- Quickstart - DONE +- Modular architecture of LangEval + - Contributing +- Available Evaluators + - Single evaluator usage + - Simple table, link to each of them + - Defining your Custom Evaluator + - Supported models +- Unit Tests - DONE + - Simple Assertion + - Unit Test Helpers + - CI/CD integration +- As an API - DONE + - Example + + +- Evaluators + - for i in evaluator do docs (entry needs, settings, env vars, descriptions etc) + + +- Tutorials + - Building and validating a text extraction pipeline with LangEval (eg. address extraction) + - Evaluating your RAG pipeline with LangEval + - Unit Tests + - Simple assertion (instructor example) + - Unit tests helpers (parametrize, flaky, pass_rate) + - Using Evaluators - LLM as Judge + - Using other evaluators + - Setting up CI/CD - maybe + - Notebooks Batch Evaluation \ No newline at end of file diff --git a/langevals/documentation/quickstart.mdx b/langevals/documentation/quickstart.mdx new file mode 100644 index 000000000..3325dd73b --- /dev/null +++ b/langevals/documentation/quickstart.mdx @@ -0,0 +1,62 @@ +Welcome to LangEvals! Lets get started and integrate LLM evaluators in a few quick steps! + +### Prerequisites + +1. Python 3.11 or higher +2. pip (Python package installer) + +### Installation and Usage + + + + +Run installation in the terminal. +```bash +pip install "langevals[all]" +# or select only the one you are interested on, e.g.: +pip install "langevals[azure,ragas,langevals]" +``` + + + +Import the evaluator that will be used. 
+```python
+from langevals_langevals.competitor_llm import (
+    CompetitorLLMEntry,
+    CompetitorLLMEvaluator,
+    CompetitorLLMSettings,
+)
+```
+
+
+Instantiate and run the evaluator.
+```python
+evaluator = CompetitorLLMEvaluator(settings=CompetitorLLMSettings(
+    name="LangWatch",
+    description="LLM monitoring and evaluation platform.",
+))
+result = evaluator.evaluate(entry=CompetitorLLMEntry(
+    input="Hey, where can I monitor and evaluate LLMs except LangWatch?"
+))
+```
+
+
+Print the results of the evaluation:
+```python
+print("Result passed LLM competitor check: ", result.passed)
+print(result.details)
+print("Cost of evaluation: ", result.cost)
+```
+Explore the results of the evaluation:
+```text
+Result passed LLM competitor check:  False
+0.9 - confidence score. Reasoning: The question explicitly mentions LangWatch as a monitoring and evaluation platform. The user is asking about alternatives to LangWatch, which implies a comparison with competitors.
+Cost of evaluation:  currency='USD' amount=0.0004315
+```
+
+
+
+
diff --git a/langevals/documentation/unit-tests.mdx b/langevals/documentation/unit-tests.mdx
new file mode 100644
index 000000000..a6a9fd601
--- /dev/null
+++ b/langevals/documentation/unit-tests.mdx
@@ -0,0 +1,29 @@
+---
+title: "Unit Tests"
+---
+LangEvals is a tool for developers, by developers.
+And we, as developers, strongly believe that software needs to be tested, and that includes LLMs.
+
+You can easily integrate LangEvals with [PyTest](https://pytest.org/) and leverage the power of evaluators combined with reproducible testing.
+This integration provides extensive coverage of edge cases and ensures an increased level of certainty in your chatbot's behavior.
+
+### Simple Assertions
+
+For straightforward cases where the expected output is clear, such as extracting an entity from text, LangEvals can help you assert the correctness of the output.
+This method is perfect for tests where outputs are unambiguous and easily comparable to expected results.
+
+### Unit Test Helpers
+
+You can easily combine LangEvals with decorators such as `parametrize`, `flaky` and `pass_rate` to grow a single
+test case into an extensive comparison across models with specific pass rates (see the minimal sketch at the end of this page).
+
+### Setting Up CI/CD
+
+Integrating LangEvals and PyTest into your CI/CD pipeline ensures continuous validation of your models.
+By running tests automatically on each commit, you can detect issues early and maintain model performance and reliability.
+This setup streamlines the development process and helps deliver robust, well-tested LLM applications.
+
+## Examples and Tutorials
+
+For more detailed examples and tutorials on writing unit tests and using LangEvals in your projects, please refer to our [Tutorials](/langevals/tutorials/extensive-unit-testing).
+There, you will find practical examples demonstrating how to use LangEvals with PyTest for various testing scenarios, from simple assertions to advanced evaluations using custom and out-of-the-box evaluators.
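+
+As a minimal sketch of how these pieces fit together (here `generate_answer` is a placeholder for your own application code):
+
+```python
+import pytest
+
+
+@pytest.mark.parametrize("question", ["What is LangEvals?", "How do I install LangEvals?"])
+@pytest.mark.flaky(max_runs=3)
+@pytest.mark.pass_rate(0.8)
+def test_mentions_langevals(question):
+    answer = generate_answer(question)  # placeholder for your own LLM call
+    assert "LangEvals" in answer  # simple assertion on the output
+```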
\ No newline at end of file
diff --git a/langevals/evaluators/lingua.mdx b/langevals/evaluators/lingua.mdx
new file mode 100644
index 000000000..e69de29bb
diff --git a/langevals/evaluators/openai-moderation.mdx b/langevals/evaluators/openai-moderation.mdx
new file mode 100644
index 000000000..e69de29bb
diff --git a/langevals/how-to-choose-your-evaluator.mdx b/langevals/how-to-choose-your-evaluator.mdx
new file mode 100644
index 000000000..c0d5a3023
--- /dev/null
+++ b/langevals/how-to-choose-your-evaluator.mdx
@@ -0,0 +1,88 @@
+---
+title: "How to Choose Your Evaluator Guide"
+---
+On this page, you can find a detailed explanation of how to choose the right evaluator for the issue you want to mitigate. Answer the questions in the proposed order and you will arrive at useful conclusions.
+
+## RAG Quality
+![How to choose RAGAS evaluator](/images/how-to-choose-ragas.png)
+Assuming you have an unsatisfactory response from an LLM, you can follow this schema to find the weak spot in your system. With proper LangWatch integration into your software, you can keep track of both the retrieved contexts and the generated responses. Those pieces are crucial for evaluating your RAG system.
+
+The first thing to check is whether the generated answer is faithful to the retrieved contexts. If not, your best option is the **Faithfulness** evaluator, which will fail on every generated answer that is not true to the contexts. This is an important evaluator because it shows whether your RAG actually works and whether your LLM is actually using the retrieved context to produce the answer. If this evaluation is failing, it can signal a bug where the contexts are not being properly used while generating the response.
+
+In case the responses are faithful but hallucination persists, the next question to ask is whether the answer is relevant to the question that was asked. If it is not, but the retrieved contexts are relevant to the question, then the problem is perhaps in the generation of the answer. Here, the **Answer Relevancy** evaluator comes in handy. It evaluates how relevant the generated response is to the given question.
+
+However, if the retrieved contexts are not relevant to the question, we need to check if there are any better contexts available in the knowledge base. If there are, the **Context Recall** evaluator comes into play, together with the expected outputs, also known as ground truths. With the help of the Context Recall evaluator and a set of predefined outputs, we can evaluate to what extent the retrieved contexts match the best possible answer. This gives a good insight into the retrieval capabilities of our RAG.
+
+At the same time, if there are no better contexts available, we need to check whether we are confusing the model by retrieving too many contexts. If that is the case, the **Context Precision** evaluator is the best option. Some other strategies to mitigate this problem would be to experiment with a different number of retrieved chunks of text or to change the length of each chunk while embedding them into a vector database.
+
+Finally, if you are sure that the amount of retrieved context is fine but the hallucination still persists, ask yourself whether a real human would be able to answer the question correctly. At this point you either need to switch the model and improve your prompt, or spend more time building a better knowledge base.
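+
+If you want to try one of these checks programmatically, the same evaluators are available in LangEvals. A minimal sketch of the Faithfulness check (the example strings are illustrative):
+
+```python
+from langevals_ragas.faithfulness import RagasFaithfulnessEntry, RagasFaithfulnessEvaluator
+
+result = RagasFaithfulnessEvaluator().evaluate(
+    entry=RagasFaithfulnessEntry(
+        output="Paris is the capital of France",
+        contexts=["France is a country in Europe", "Paris is the capital of France"],
+    )
+)
+print(result.score)  # low scores flag answers that are not grounded in the retrieved contexts
+```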
+
+## Security
+![How to choose safeguards](/images/how-to-choose-safeguards.jpg)
+If you have not yet put any security measures in place for your chatbot, start from the top.
+The first important security check to implement is the PII evaluation.
+If you do not want to share any personal data of your users with any third parties, you can use the PII Detection evaluator.
+The next step is to let your chatbot detect unsafe content.
+Most recent LLMs have built-in capabilities to prevent the production of unsafe content; however, some older versions have a certain level of vulnerability.
+You can build a layer that will prevent your LLM from generating any harmful content.
+
+Follow this schema to ensure your chatbot is safeguarded effectively:
+
+### Guide to Ensuring Your Chatbot is Safeguarded
+
+Assume that your chatbot is not safeguarded! Follow the steps below to determine the necessary measures to secure your chatbot effectively.
+
+#### Step 1: Detection of Personally Identifiable Information (PII)
+- **Question:** Does your chatbot detect Personally Identifiable Information?
+  - **Explanation:** Identifying and managing PII is crucial to protecting user privacy and complying with data protection regulations.
+  If your chatbot does not have this capability, it is essential to implement **PII Detection** to ensure sensitive information is handled appropriately. This prevents sharing real names, card numbers, email addresses and other personal data with third parties.
+
+#### Step 2: Detection of Unsafe Content
+- **Question:** Does your chatbot detect unsafe content?
+  - **Explanation:** Detecting unsafe content helps prevent the dissemination of harmful, offensive, or inappropriate information. Most modern LLMs have built-in capabilities to detect and ignore requests to produce harmful content. However, you have to pay extra attention to this if you are using an older version of an LLM or if you trained the model yourself.
+
+#### Step 3: Evaluating Content Safety and Moderation
+- **Question:** Do you need extra flexibility by adjusting the policy?
+  - **Explanation:** If your chatbot requires adaptable policies for handling content, implementing **Llama Guard** is recommended. If not, you should evaluate whether the severity level of unsafe content matters to you in order to decide on the necessary measures.
+
+  - **Question:** Does the severity level of unsafe content matter?
+    - **Explanation:** If the severity level is important, implement **Content Safety** measures to handle different levels of unsafe content severity. If it is not a concern, implement **Moderation** to ensure content is reviewed and handled appropriately.
+
+#### Step 4: Connection to Other Data Sources
+- **Question:** Does your chatbot have a connection to other data sources (e.g., databases, email inbox)?
+  - **Explanation:** Connecting to other data sources can introduce additional risks. If your chatbot has such connections and is also able to perform actions on the internet or inside your system, you are in the danger zone. Implementing **Prompt Injection** safeguards is crucial to protect against malicious data inputs.
+  This guardrail can prevent your chatbot from being hijacked through malicious prompts and content.
+
+#### Step 5: Protection Against User Jailbreaks
+- **Question:** Is your chatbot secure from user jailbreaks (forcing it to produce unethical or criminal content)?
+  - **Explanation:** Ensuring your chatbot is secure from user jailbreaks is essential for maintaining ethical standards and preventing misuse. If your chatbot is not secure, implementing **Jailbreak Detection** is necessary to prevent and mitigate attempts to bypass security measures. This guardrail differs from Prompt Injection in that it protects against attempts to force the chatbot to reveal its system prompt or act against its default settings.
+
+
+## Enterprise Readiness
+![How to choose enterprise evaluators](/images/how-to-choose-enterprise.jpg)
+Many businesses try to meet their goals with the help of chatbots. However, there are many aspects that need to be taken care of to make sure these goals can be reached in a coordinated way.
+
+### Guide to Ensuring Your Chatbot is Enterprise Ready
+
+First, let's assume that your chatbot is not enterprise ready! Follow the steps below to ensure it meets enterprise standards effectively.
+
+#### Step 1: Control Over Topics
+- **Question:** Can you control the topics that your chatbot talks about?
+  - **Explanation:** Controlling the topics your chatbot discusses is vital to maintaining focus and relevance.
+  If you cannot control the topics, use the **Off Topic Evaluator** to manage and restrict the chatbot's discussion topics.
+  This is helpful for preventing malicious users from exploiting your chatbot and its tokens for unrelated tasks such as code generation or helping with cooking recipes.
+
+#### Step 2: Competitor Discussions
+- **Question:** Does the chatbot answer questions about your competitors?
+  - **Explanation:** Answering questions about competitors can be risky.
+  If your chatbot does this, you need to assess whether you know all your competitors by name in order to implement the appropriate measures.
+
+
+  - **Question:** Do you know all your competitors by name?
+    - **Explanation:** Knowing all your competitors by name allows you to implement a **Competitor Blocklist**, restricting discussions about them. It will detect every mention of the predefined competitors with regex and block the message from appearing on the user's screen.
+    If you do not know all your competitors, or you work in an industry with heavy competition, you should use the **Competitor LLM Check** to identify and manage competitors automatically. This evaluator leverages the power of an LLM to determine whether the message explicitly or implicitly mentions a competitor of your business.
+
+#### Step 3: Positive Mentions of Your Company or Product
+- **Question:** Does your chatbot mention your company or product only in a positive sense?
+  - **Explanation:** Ensuring that your chatbot only mentions your company or product positively is crucial for maintaining a good reputation. If it does not, implement **Product Sentiment Polarity** to ensure positive mentions only. This evaluator is a LangWatch in-house development that was built with a curated dataset consisting of very positive, subtly positive, subtly negative, and very negative product reviews.
+
diff --git a/langevals/tutorials/ci-cd-pipeline-evaluation.mdx b/langevals/tutorials/ci-cd-pipeline-evaluation.mdx
new file mode 100644
index 000000000..3a363169c
--- /dev/null
+++ b/langevals/tutorials/ci-cd-pipeline-evaluation.mdx
@@ -0,0 +1,90 @@
+---
+title: "Evaluate on CI/CD Pipeline"
+---
+A good AI application deserves good software, and good software is built on a reliable CI/CD pipeline. Here we will explain
+how to test your LLM outputs at the level of the deployment pipeline.
+ +## Prepare the Test Suite +The first step is to create the test suite. You can add as many tests as you want and import required parts of your application. +In this example application we will create a simple test case with handcrafted examples. However, you can load a prepared dataset and test your application on it. +```python test_script.py +import pandas as pd +import pytest +from langevals import expect +from langevals_langevals.off_topic import ( + OffTopicSettings, + OffTopicEvaluator, + AllowedTopic, +) +import os +from dotenv import load_dotenv +load_dotenv() +api_key = os.getenv("OPENAI_API_KEY") + +entries = pd.DataFrame( + { + "input": [ + "What is unique about your product?", + "Write a python code that prints fibonacci numbers.", + "What are the core features of your platform?", + ], + } +) + + +@pytest.mark.parametrize("entry", entries.itertuples()) +def test_extracts_the_right_address(entry): + settings = OffTopicSettings( + allowed_topics=[ + AllowedTopic(topic="our_product", description="Questions about our company and product"), + AllowedTopic(topic="small_talk", description="Small talk questions"), + AllowedTopic(topic="book_a_call", description="Requests to book a call"), + ], + model="openai/gpt-3.5-turbo-1106", + ) + off_topic_evaluator = OffTopicEvaluator(settings=settings) + expect(input=entry.input).to_pass(off_topic_evaluator) +``` + +## Run with GitHub Actions +Second step is to create a GitHub Actions Workflow - specifying the details of your test. You can do that by creating `.github/workflows/` +folder and adding a `.yaml` file there. Here is an example script that will automatically run on every new push to the repo: + +```github Actions +name: Run LangEvals + +on: + push: + branches: + - main + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pandas pytest "langevals[all]" + + - name: Run tests + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: | + pytest ./test_script.py +``` + + +Pay attention - you need to import all of the libraries that are used in your test case and specify the path to your test file. + +After writing the script, you need to go to your GitHub repository and navigate to `Settings > Secrets and variables (left menu) > Actions` and press `New repository secret` button. If you want to use the evaluators employing LLM-as-a-Judge approach - you need to specify the API key of your LLM provider as a repository secret. +Now you can automatically evaluate if any changes to your prompt or change of LLM provider are degrading your application. diff --git a/langevals/tutorials/extensive-unit-testing.mdx b/langevals/tutorials/extensive-unit-testing.mdx new file mode 100644 index 000000000..18f997dbb --- /dev/null +++ b/langevals/tutorials/extensive-unit-testing.mdx @@ -0,0 +1,375 @@ +--- +title: "Extensive Unit Testing" +--- + +Welcome to the Extensive Unit Testing tutorial. This guide will explain how to create a comprehensive test suite for your LLM application using LangEvals. +Our first example use case will focus on the [**Entity Extraction**](https://en.wikipedia.org/wiki/Named-entity_recognition) task. Imagine you have a list of addresses in unstructured text format, and you want to use an LLM to transform it into a spreadsheet. 
There are many questions you might have, such as which model to choose, how to determine the best model, and how often the model fails to produce the expected results.
+
+## Prepare the Data
+The first step is to model our data using a Pydantic schema. This helps validate and structure the data, making it easier to serialize entries into JSON strings later.
+```python
+from pydantic import BaseModel
+
+class Address(BaseModel):
+    number: int
+    street_name: str
+    city: str
+    country: str
+```
+
+Once we have modeled our data format, we can create a small dataset with three examples.
+```python
+import pandas as pd
+
+entries = pd.DataFrame(
+    {
+        "input": [
+            "Please send the package to 123 Main St, Springfield.",
+            "J'ai déménagé récemment à 56 Rue de l'Université, Paris.",
+            "A reunião será na Avenida Paulista, 900, São Paulo.",
+        ],
+        "expected_output": [
+            Address(
+                number=123, street_name="Main St", city="Springfield", country="USA"
+            ).model_dump_json(),
+            Address(
+                number=56,
+                street_name="Rue de l'Université",
+                city="Paris",
+                country="France",
+            ).model_dump_json(),
+            Address(
+                number=900,
+                street_name="Avenida Paulista",
+                city="São Paulo",
+                country="Brazil",
+            ).model_dump_json(),
+        ],
+    }
+)
+```
+In this example `entries` is a [Pandas DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) object with two columns: input and expected_output. The expected_output column contains the expected results, which we will use to compare with the model's responses during evaluation.
+## Evaluate different models
+Now we can start our tests. Let's compare different models. We define an array with the models we're interested in and create a litellm client to perform the API calls to these models. Next, we create a test function and annotate it with `@pytest`.
+
+Our test function calls the LLM with `entry.input` and compares the response with `entry.expected_output`.
+
+```python
+from itertools import product
+import pytest
+import instructor
+from litellm import completion
+
+models = ["gpt-3.5-turbo", "gpt-4-turbo", "groq/llama3-70b-8192"]
+
+client = instructor.from_litellm(completion)
+
+
+@pytest.mark.parametrize("entry, model", product(entries.itertuples(), models))
+def test_extracts_the_right_address(entry, model):
+    address = client.chat.completions.create(
+        model=model,
+        response_model=Address,
+        messages=[
+            {"role": "user", "content": entry.input},
+        ],
+        temperature=0.0,
+    )
+
+    assert address.model_dump_json() == entry.expected_output
+
+```
+In this test we leverage `@pytest.mark.parametrize` to run the same test function with different parameters. Using itertools.product, we pair each model with each entry, resulting in 9 different test cases.
+
+Wow, right? Now you can see how each model performs on a larger scale.
+
+
+In the example test output below, we can see that `groq/llama3-70b-8192` underperforms on this particular task.
+```bash +..F..F..F [100%] +============================================= FAILURES ============================================= +___________________ test_extracts_the_right_address[entry2-groq/llama3-70b-8192] ___________________ + +entry = Pandas(Index=0, input='Please send the package to 123 Main St, Springfield.', expected_output='{"number":123,"street_name":"Main St","city":"Springfield","country":"USA"}') +model = 'groq/llama3-70b-8192' + + @pytest.mark.parametrize("entry, model", product(entries.itertuples(), models)) + def test_extracts_the_right_address(entry, model): + address = client.chat.completions.create( + model=model, + response_model=Address, + messages=[ + {"role": "user", "content": entry.input}, + ], + temperature=0.0, + ) + +> assert address.model_dump_json() == entry.expected_output +E assert '{"number":12..."country":""}' == '{"number":12...untry":"USA"}' +E +E Skipping 60 identical leading characters in diff, use -v to show +E - country":"USA"} +E ? --- +E + country":""} +... +FAILED t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry2-groq/llama3-70b-8192] - assert '{"number":12..."country":""}' == '{"number":12...untry":"USA"}' +FAILED t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry5-groq/llama3-70b-8192] - assert '{"number":56..."country":""}' == '{"number":56...ry":"France"}' +FAILED t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry8-groq/llama3-70b-8192] - assert '{"number":90..."country":""}' == '{"number":90...ry":"Brazil"}' +``` + + + +## Evaluate with a Pass Rate +LLMs are probabilistic by nature, meaning the results of the same test with the same input can vary. However, you can set a `pass_rate` threshold to make the test suite pass even if some tests fail. +```python +@pytest.mark.parametrize("entry, model", product(entries.itertuples(), models)) +@pytest.mark.pass_rate(0.6) +def test_extracts_the_right_address(entry, model): + address = client.chat.completions.create( + model=model, + response_model=Address, + messages=[ + {"role": "user", "content": entry.input}, + ], + temperature=0.0, + ) + + assert address.model_dump_json() == entry.expected_output +``` +In this example we added the second `@pytest` decorator that allows the test result to be a PASS even if only 60% of the tests are successful. For instance, if the LLM sometimes returns โ€œUnited Statesโ€ instead of โ€œUSAโ€, we can still consider it a pass if it meets our acceptable level of uncertainty. + + +In this example, 3 test runs failed (same as in the previous output), marked with `x`. Despite the failures, the overall test suite passes because the `pass_rate` is set to 0.6, and 6 out of 9 tests passed. 
+```bash +..x..x..x [100%] +========================================= warnings summary ========================================= +.venv/lib/python3.12/site-packages/_pytest/config/__init__.py:1285 + /Users/zhenyabudnyk/DevProjects/langwatch-saas/langevals/notebooks/.venv/lib/python3.12/site-packages/_pytest/config/__init__.py:1285: PytestAssertRewriteWarning: Module already imported so cannot be rewritten: anyio + self._mark_plugins_for_rewrite(hook) + +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry0-gpt-3.5-turbo] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry1-gpt-4-turbo] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry2-groq/llama3-70b-8192] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry3-gpt-3.5-turbo] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry4-gpt-4-turbo] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry5-groq/llama3-70b-8192] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry6-gpt-3.5-turbo] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry7-gpt-4-turbo] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry8-groq/llama3-70b-8192] + /Users/zhenyabudnyk/DevProjects/langwatch-saas/langevals/notebooks/.venv/lib/python3.12/site-packages/instructor/process_response.py:222: DeprecationWarning: FUNCTIONS is deprecated and will be removed in future versions + if mode == Mode.FUNCTIONS: + +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry0-gpt-3.5-turbo] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry1-gpt-4-turbo] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry2-groq/llama3-70b-8192] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry3-gpt-3.5-turbo] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry4-gpt-4-turbo] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry5-groq/llama3-70b-8192] +t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry6-gpt-3.5-turbo] +... + [, , , , , , , , , , , , , , , , , , , , , , , , , , , ] + +===End Flaky Test Report=== +6 passed, 3 xfailed, 19 warnings in 12.66s +``` + + + +## Evaluate with Flaky + +[Flaky](https://github.com/box/flaky) is a special PyTest extension designed for testing software systems that depend on non-deterministic tools such as network communication or AI/ML algorithms. + +```python +@pytest.mark.parametrize("entry, model", product(entries.itertuples(), models)) +@pytest.mark.flaky(max_runs=3) +def test_extracts_the_right_address(entry, model): + address = client.chat.completions.create( + model=model, + response_model=Address, + messages=[ + {"role": "user", "content": entry.input}, + ], + temperature=0.0, + ) + + assert address.model_dump_json() == entry.expected_output + ``` +In this case, each combination of entry and model that fails during its test will be retried up to 2 more times before being marked as a failure. You can also specify the minimum number of passes required before marking the test as a PASS using - `@pytest.mark.flaky(max_runs=3, min_passes=2)`. + + + +Notice the total testing runtime at the bottom of the snippet. It took 34.99 seconds to run these tests with retries compared to 10-12 seconds in the previous versions. +```bash +..F..F.. 
[100%] [100%]F [100%]
+============================================= FAILURES =============================================
+___________________ test_extracts_the_right_address[entry2-groq/llama3-70b-8192] ___________________
+
+entry = Pandas(Index=0, input='Please send the package to 123 Main St, Springfield.', expected_output='{"number":123,"street_name":"Main St","city":"Springfield","country":"USA"}')
+model = 'groq/llama3-70b-8192'
+
+    @pytest.mark.parametrize("entry, model", product(entries.itertuples(), models))
+    @pytest.mark.flaky(max_runs=3)
+    def test_extracts_the_right_address(entry, model):
+        address = client.chat.completions.create(
+            model=model,
+            response_model=Address,
+            messages=[
+                {"role": "user", "content": entry.input},
+            ],
+            temperature=0.0,
+        )
+
+>       assert address.model_dump_json() == entry.expected_output
+E       assert '{"number":12..."country":""}' == '{"number":12...untry":"USA"}'
+E
+E         Skipping 60 identical leading characters in diff, use -v to show
+E         - country":"USA"}
+E         ?               ---
+...
+FAILED t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry2-groq/llama3-70b-8192] - assert '{"number":12..."country":""}' == '{"number":12...untry":"USA"}'
+FAILED t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry5-groq/llama3-70b-8192] - assert '{"number":56..."country":""}' == '{"number":56...ry":"France"}'
+FAILED t_25254c19b35c4a58a520690924724e02.py::test_extracts_the_right_address[entry8-groq/llama3-70b-8192] - assert '{"number":90..."country":""}' == '{"number":90...ry":"Brazil"}'
+3 failed, 6 passed, 35 warnings in 34.99s
+```
+
+
+
+## LLM-as-a-Judge and `expect`
+
+Let's take another use case: recipe generation. As the task becomes more nuanced, it also becomes harder to properly evaluate the quality of the LLM's response.
+The LLM-as-a-Judge approach comes in handy in such situations. For example, you can use `CustomLLMBooleanEvaluator` to check if the generated recipes are all vegetarian.
+
+```python
+from langevals import expect
+from langevals_langevals.llm_boolean import (
+    CustomLLMBooleanEvaluator,
+    CustomLLMBooleanSettings,
+)
+import litellm
+from litellm import ModelResponse
+import pandas as pd
+import pytest
+
+entries = pd.DataFrame(
+    {
+        "input": [
+            "Generate me a recipe for a quick breakfast with bacon",
+            "Generate me a recipe for a lunch using lentils",
+            "Generate me a recipe for a vegetarian dessert",
+        ],
+    }
+)
+
+@pytest.mark.parametrize("entry", entries.itertuples())
+@pytest.mark.flaky(max_runs=3)
+@pytest.mark.pass_rate(0.8)
+def test_generate_tweet_recipes(entry):
+    response: ModelResponse = litellm.completion(
+        model="gpt-3.5-turbo",
+        messages=[
+            {
+                "role": "system",
+                "content": "You are a tweet-size recipe generator, just recipe name and ingredients, no yapping.",
+            },
+            {"role": "user", "content": entry.input},
+        ],
+        temperature=0.0,
+    )  # type: ignore
+    recipe = response.choices[0].message.content  # type: ignore
+
+    vegetarian_checker = CustomLLMBooleanEvaluator(
+        settings=CustomLLMBooleanSettings(
+            prompt="Is the recipe vegetarian?",
+        )
+    )
+
+    expect(input=entry.input, output=recipe).to_pass(vegetarian_checker)
+```
+
+Notice how we use `expect` at the end of our test. This is a special assertion utility function that simplifies the
+evaluation run and prints a nice output with a detailed explanation in case of failures.
+The `expect` utility interface is modeled after Jest assertions, so you can expect a somewhat similar API if you are experienced with Jest.
+
+
+
+```bash
+FAILED tests/test_llm_as_judge.py::test_llm_as_judge[entry0] - AssertionError: Custom LLM Boolean Evaluator to_pass FAILED - The recipe for a quick breakfast with bacon includes bacon strips, making it a non-vegetarian recipe.
+```
+
+
+
+## Other Evaluators
+Just like `CustomLLMBooleanEvaluator`, you can use any other evaluator available in LangEvals to prevent regressions on a variety of cases.
+For example, here we check that the LLM answers are always in English, regardless of the language used in the question, and we also measure how relevant the answers are to the question:
+```python
+import litellm
+from litellm import ModelResponse
+from langevals_lingua.language_detection import (
+    LinguaLanguageDetectionEvaluator,
+    LinguaLanguageDetectionSettings,
+)
+from langevals_ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
+from langevals import expect
+import pandas as pd
+import pytest
+
+entries = pd.DataFrame(
+    {
+        "input": [
+            "What's the connection between 'breaking the ice' and the Titanic's first voyage?",
+            "Comment la bataille de Verdun a-t-elle influencé la cuisine française?",
+            "¿Puede el musgo participar en la purificación del aire en espacios cerrados?",
+        ],
+    }
+)
+
+
+@pytest.mark.parametrize("entry", entries.itertuples())
+@pytest.mark.flaky(max_runs=3)
+@pytest.mark.pass_rate(0.8)
+def test_language_and_relevancy(entry):
+    response: ModelResponse = litellm.completion(
+        model="gpt-3.5-turbo",
+        messages=[
+            {
+                "role": "system",
+                "content": "You reply to questions only in English, no matter the language the question was asked in",
+            },
+            {"role": "user", "content": entry.input},
+        ],
+        temperature=0.0,
+    )  # type: ignore
+    answer = response.choices[0].message.content  # type: ignore
+
+    language_checker = LinguaLanguageDetectionEvaluator(
+        settings=LinguaLanguageDetectionSettings(
+            check_for="output_matches_language",
+            expected_language="EN",
+        )
+    )
+    answer_relevancy_checker = RagasAnswerRelevancyEvaluator()
+
+    expect(input=entry.input, output=answer).to_pass(language_checker)
+    expect(input=entry.input, output=answer).score(
+        answer_relevancy_checker
+    ).to_be_greater_than(0.8)
+```
+In this example, we are now not only validating a boolean assertion,
+but also making sure that 80% of our samples keep an answer relevancy score above 0.8 from the Ragas Answer Relevancy Evaluator.
+
+
+
+```bash
+===Flaky Test Report===
+
+test_language_and_relevancy[entry0] passed 1 out of the required 1 times. Success!
+test_language_and_relevancy[entry1] passed 1 out of the required 1 times. Success!
+test_language_and_relevancy[entry2] passed 1 out of the required 1 times. Success!
+
+===End Flaky Test Report===
+```
+
+
+
+
+
+You can access and run the code yourself in Jupyter Notebook
+
+
+
+
+
diff --git a/langevals/tutorials/rag-evaluation.mdx b/langevals/tutorials/rag-evaluation.mdx
new file mode 100644
index 000000000..5dca5c927
--- /dev/null
+++ b/langevals/tutorials/rag-evaluation.mdx
@@ -0,0 +1,54 @@
+---
+title: "RAG Evaluation"
+---
+
+In this tutorial we will show how you can evaluate your RAG application with the help of LangEvals.
+Check [RAGs Context Tracking](/integration/rags-context-tracking) to learn how to integrate your application with LangWatch and trace your RAG calls.
+
+
+## LangEvals RAG Evaluators
+You can easily import RAG evaluators from the `langevals_ragas` module and quickly make use of their features.
+```python +from langevals_ragas.context_relevancy import RagasContextRelevancyEntry, RagasContextRelevancyEvaluator +from langevals_ragas.faithfulness import RagasFaithfulnessEntry, RagasFaithfulnessEvaluator +from langevals_ragas.answer_relevancy import RagasAnswerRelevancyEntry, RagasAnswerRelevancyEvaluator + +entry1 = RagasAnswerRelevancyEntry(input="What is the capital of France?", output="Paris is the capital of France") + +entry2 = RagasContextRelevancyEntry(output="Paris is the capital of France", contexts=[ + "Water can evaporate or turn into ice", + "Dogs and Cats can be friends", + "The sun is shining today" +]) + +entry3 = RagasFaithfulnessEntry(output="Paris is the capital of France", contexts=[ + "France is a country in Europe", + "Lyon, Paris and Bordeaux are cities in France", + "Paris is the capital of France" +]) + +result1 = RagasAnswerRelevancyEvaluator().evaluate(entry=entry1) +result2 = RagasContextRelevancyEvaluator().evaluate(entry=entry2) +result3 = RagasFaithfulnessEvaluator().evaluate(entry=entry3) +``` +In the example above we import 3 evaluators and create 3 corresponding entries that are evaluated at the end. +Each entry requires different set of input parameters, depending on what we want to evaluate. Pay attention to the `entry2`, +how do you think, what will be the result of its evaluation on context relevancy? + + +As you can see the result of the evaluation is an object with a few attributes. It returns the status of the evaluation, +the score of the evaluation and the cost (as far as RAG evaluators use another LLM to perform evaluation). +```bash +status='processed' score=1.0 passed=None details=None cost=Money(currency='USD', amount=0.002036) +status='processed' score=0.3333333333333333 passed=None details=None cost=Money(currency='USD', amount=0.00033600000000000004) +status='processed' score=1.0 passed=None details=None cost=Money(currency='USD', amount=0.0038910000000000004) +``` +The second result corresponds to the `entry2` being evaluated on context relevancy. With no surprise we observe a low score of 0.333, +as the output is absolutely unrelated to the given contexts. This might be a sign that your RAG is retrieving the wrong documents or that +it generates responses that are not based on the given contexts. + + + + + You can access and run the code yourself in Jupyter Notebook + \ No newline at end of file diff --git a/llms.txt b/llms.txt new file mode 100644 index 000000000..7c11abd21 --- /dev/null +++ b/llms.txt @@ -0,0 +1,2557 @@ +# LangWatch + +# FILE: ./introduction.mdx + +--- +title: Introduction +--- + +Welcome to LangWatch, the all-in-one [open-source](https://github.com/langwatch/langwatch) LLMops platform. + +LangWatch allows you to track, monitor, guardrail and evaluate your LLMs apps for measuring quality and alert on issues. + +For domain experts, it allows you to easily sift through conversations, see topics being discussed and annotate and score messages +for improvement in a collaborative manner with the development team. + +For developers, it allows you to debug, build datasets, prompt engineer on the playground and +run batch evaluations or [DSPy experiments](./dspy-visualization/quickstart) to continuously improve the product. + +Finally, for the business, it allows you to track conversation metrics and give full user and quality analytics, cost tracking, build +custom dashboards and even integrate it back on your own platform for reporting to your customers. 
+
+You can [sign up](https://app.langwatch.ai/) and already start the integration on our free tier by following the guides below:
+
+You can also [open the demo project](https://app.langwatch.ai/demo) or check out a [video](https://www.loom.com/share/17f827b1f5a648298779b36e2dc959e6) on our platform.
+
+## Get in touch
+
+Feel free to reach out to us directly at [support@langwatch.ai](mailto:support@langwatch.ai). You can also open a [GitHub issue](https://github.com/langwatch/langwatch/issues)
+to report bugs and request features, or join our [Discord](https://discord.gg/kT4PhDS2gH) channel and ask questions directly to the community and the core team.
+---
+
+# FILE: ./concepts.mdx
+
+---
+title: Concepts
+---
+
+Understanding LangWatch concepts can be made easier with two practical examples: an AI travel assistant and a tool for generating blog posts. Let's dive into how each core concept of LangWatch applies to these examples.
+
+Imagine you've created an AI travel assistant that helps users plan their trips by conversing with them to suggest destinations, find the best prices for flights, and assist with bookings. On the other hand, you also have a platform that assists users in generating and refining blog posts, including SEO optimization.
+
+### Threads
+
+Field: `thread_id`
+
+A **thread** in the context of the AI travel assistant represents a complete conversation, that is, the group of all traces. It's the entire chat that groups all back-and-forth messages as the user inquires about different aspects of their travel plan. For the blog post tool, a thread could be, for example, the creation process of a new blog post, encapsulating all interactions that contribute to its completion, from headline generation to the final SEO adjustments.
+
+### Traces
+
+Field: `trace_id`
+
+A **trace** in the travel assistant's example is each distinct message, for example when a user asks for the best prices for a destination, or asks if pets are allowed in the hotel.
+
+In the blog post tool case, a trace could be, for example, each generation of a catchy headline option, the generation of a draft for the body, or the SEO keywords generation.
+
+It does not matter how many steps are inside; each trace is a full end-to-end generation handled by the AI.
+
+The `trace_id` is randomly generated by default if you don't provide one; however, to keep control of your traces and connect them to events like [Thumbs Up/Down](./user-events/thumbs-up-down), we recommend generating a random id on your side, using, for example, the [nanoid](https://pypi.org/project/nanoid/) library.
+
+### Spans
+
+Field: `span_id`
+
+Within each trace, **spans** represent the individual steps taken to achieve the outcome. In the travel bot scenario, a span could be a call to the LLM to suggest potential destinations, another span for querying the airline price API, and a final span for formatting the response to present to the user. For the blog post tool, one span might be the initial text generation, followed by a subsequent span for the LLM to self-critique the content, and another span for the third LLM call refining the text based on the critique.
+
+### User ID
+
+Field: `user_id`
+
+The **user id** identifies the final user of the product. In the context of both the AI travel assistant and the tool for generating blog posts, it's the ID that identifies the person using the app, usually their user account ID. This allows LangWatch to track how end users are using the product.
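+To make these fields concrete, here is a minimal sketch of how the `thread_id` and `user_id` described above are typically attached to a trace with the Python SDK (the same metadata fields are documented in the integration guides later in this file); the function name and ids are placeholders:
+
+```python
+import langwatch
+
+@langwatch.trace()
+def answer_travel_question(question: str):
+    # group this message into its conversation and attribute it to the end user
+    langwatch.get_current_trace().update(
+        metadata={
+            "thread_id": "thread_abc123",  # placeholder conversation id
+            "user_id": "user_42",          # placeholder end-user account id
+        }
+    )
+    ...  # your LLM pipeline
+```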
+ +### Customer ID + +Field: `customer_id` + +The **customer id** is used when you provide a platform for your customers to build LLM apps for their end users. For example, it would be if your are building a platform that allow _others_ to build AI assistants for _their_ users. Having the **customer id** allows LangWatch to group all metrics and messages per customer, which allows you to access LangWatch data through our APIs to build a custom analytics dashboard for your customers, so they can see how their own LLM assistants are behaving. + +### Labels + +Field: `labels` + +You can use **labels** to organize and compare the traces sent to LangWatch for any comparison you want to do. You can for example apply different labels for different actions, for example a label `blogpost_title` for generating the blog post title and another `blogpost_keywords`, for generating keywords. You can use it for versioning as well, for example label the first implementation +version as `v1.0.0`, then do a prompt engineering to improve the AI travel planner itenerary builder, and label it as `v1.0.1`. This way you can easily focus on each different functionality or compare versions on LangWatch dashboard. + +--- + +# FILE: ./integration/overview.mdx + +Integrating LangWatch into your projects is designed to be a straightforward process. Regardless of the language or LLM model you are using, you can set up LangWatch with minimal configuration and start gathering valuable insights into your LLM's performance and user interactions. + + + + + + + +--- + +# FILE: ./integration/python/reference.mdx + +--- +title: Python SDK Reference +sidebarTitle: Reference +--- + +This page contains the low-level reference for the Python SDK components, for guide on integrating LangWatch into your Python project, see [Python Integration Guide](/integration/python/guide). + +## Trace + +The trace is the basic unit of work in LangWatch. It is a collection of spans that are grouped together to form a single unit of work, you can create a trace in three manners: + +```python +import langwatch + +# As a decorator: +@langwatch.trace() +def my_function(): + pass + + +# As a context manager +with langwatch.trace(): + pass + + +# As a function +trace = langwatch.trace() +``` + +All three ways will create the same trace objects, but for the last one you manually need to call `trace.deferred_send_spans()` or `trace.send_spans()` to send the spans to the LangWatch API. + +The first two will also set the trace to the context, which you can retrieve by: + +``` +trace = langwatch.get_current_trace() +``` + +Both on the trace creation function and `.update()` you can set trace_id, metadata and api_key to be used by the trace. + +| Parameter | Type | Description | +| :-------- | :--- | :---------- | +| trace_id | `str` | The trace id to use for the trace, a random one is generated by default, but you can also pass your own to connect with your internal message id if you have it. | +| metadata | `dict` | The object holding metadata for the trace, it contains a few fields listed below. 
| +| metadata.user_id | `str` | The user id that is triggering the generation on your LLM pipeline | +| metadata.thread_id | `str` | A thread id can be used to virtually group together all the different traces in a single thread or workflow | +| metadata.labels | `list[str]` | A list of labels to categorize the trace which you can use to filter on later on LangWatch dashboard, trigger evaluations and alerts | +| api_key | `str` | The api key to use for the trace, can be set to override the LANGWATCH_API_KEY environment variable. | + + +## Span + +A Span is a single unit of work in a trace, it is the smallest unit of work in LangWatch. Similar to traces, you can create it in three different manners: + +```python +import langwatch + +# As a decorator +@langwatch.span() +def my_function(): + pass + +# As a context manager +with langwatch.span(): + pass + +# As a function +span = langwatch.span() +``` + +All three ways will create the same span objects, but for the last one you need to manually end the span by calling `span.end()`, which may also take parameters for updating the span data: + +```python +span.end(output="sunny") +``` + +The first two will also set the span to the context, which you can retrieve by: + +``` +span = langwatch.get_current_span() +``` + +By default, when the span is created it becomes the child of the current span in context, but you can also explicitly create a children span from a trace or from another span by initiating them from the parent, for example: + +```python +trace = langwatch.trace() # or langwatch.get_current_trace() + +# Direct child of the trace +span = trace.span(name="child") + +# Child of another span, granchild of the trace +subspan = span.span(name="grandchild") + +subspan.end() +span.end() + +trace.deferred_send_spans() +``` + +Both on the span creation function, `.update()` and `.end()` functions you can set span parameters: + +| Parameter | Type | Description | +| :-------- | :--- | :---------- | +| span_id | `str` | The span id to use for the span, a random one is generated by default. | +| name | `str` | The name of the span, automatically inferred from the function when using the `@langwatch.span()` decorator. | +| type | `"span" \| "rag" \| "llm" \| "chain" \| "tool" \| "agent" \| "guardrail"` | The type of the span, defaults to `span`, with `rag` and `llm` spans allowing for some extra parameters. | +| parent | `ContextSpan` | The parent span to use for the span, if not set, the current span in context is used as the parent. | +| capture_input | `bool` | Available only on the `@langwatch.span()` decorator, whether to capture the input of the function, defaults to `True`. | +| capture_output | `bool` | Available only on the `@langwatch.span()` decorator, whether to capture the output of the function, defaults to `True`. | +| input | `str \| list[ChatMessage] \| SpanInputOutput` | The span input, it can be either a string, or a list of OpenAI-compatible chat messages format dicts, or a `SpanInputOutput` object, which captures other generic types such as `{ "type": "json", "value": {...} }`. | +| output | `str \| list[ChatMessage] \| SpanInputOutput` | The span output, it can be either a string, or a list of OpenAI-compatible chat messages format dicts, or a `SpanInputOutput` object, which captures other generic types such as `{ "type": "json", "value": {...} }`. | +| error | `Exception` | The error that occurred during the function execution, if any. It is automatically captured with the `@langwatch.span()` decorator and context manager. 
| +| timestamps | `SpanTimestamps` | The timestamps of the span, tracked by default when using the `@langwatch.span()` decorator and context manager. | +| timestamps.started_at | `int` | The start time of the span in milliseconds, the current time is used by default when the span starts. | +| timestamps.first_token_at | `int` | The time when the first token was generated in milliseconds, automatically tracked for streaming LLMs when using framework integrations. | +| timestamps.finished_at | `int` | The time when the span finished in milliseconds, the current time is used by default when the span ends. | +| contexts | `list[str] \| list[RAGChunk]` | **RAG only:** The list of contexts retrieved by the RAG, manually captured to be used later as the context source for RAG evaluators. Check out the [Capturing a RAG Span](/integration/python/guide#capturing-a-rag-span) guide for more information. | +| model | `str` | **LLM only:** The model used for the LLM in the `"vendor/model"` format (e.g. `"openai/gpt-3.5-turbo"`), automatically captured when using framework integrations, otherwise important to manually set it for correct tokens and costs tracking. | +| params | `LLMSpanParams` | **LLM only:** The parameters used for the LLM, on which parameters were used by the LLM call, automatically captured when using framework integrations | +| params.temperature | `float` | **LLM only:** The temperature used for the LLM | +| params.stream | `bool` | **LLM only:** Whether the LLM is streaming or not | +| params.tools | `list[dict]` | **LLM only:** OpenAI-compatible tools list available to the LLM | +| params.tool_choice | `str` | **LLM only:** The OpenAI-compatible tool_choice setting for the LLM | +| metrics | `LLMSpanMetrics` | **LLM only:** The metrics of the LLM span, automatically captured when using framework integrations | +| metrics.prompt_tokens | `int` | **LLM only:** The number of prompt tokens used by the LLM | +| metrics.completion_tokens | `int` | **LLM only:** The number of completion tokens used by the LLM | +--- + +# FILE: ./integration/python/guide.mdx + +--- +title: Python Integration Guide +sidebarTitle: Guide +--- + +
+[badges: LangWatch Python Repo | LangWatch Python SDK version]
+ +LangWatch library is the easiest way to integrate your Python application with LangWatch, the messages are synced on the background so it doesn't intercept or block your LLM calls. + +#### Prerequisites + +- Obtain your `LANGWATCH_API_KEY` from the [LangWatch dashboard](https://app.langwatch.com/). + +#### Installation + +```sh +pip install langwatch +``` + +#### Configuration + +Ensure `LANGWATCH_API_KEY` is set: + + +### Environment variable + +```bash +export LANGWATCH_API_KEY='your_api_key_here' +``` + +### Runtime + +You can set `LANGWATCH_API_KEY` globally at runtime: + +```python +import langwatch +import os + +langwatch.api_key = os.getenv("LANGWATCH_API_KEY") +``` + +Or on the specific trace being tracked: + +```python +import langwatch +import os + +@langwatch.trace(api_key=os.getenv("LANGWATCH_API_KEY")) +def main(): + ... +``` + +## Capturing Messages + +- Each message triggering your LLM pipeline as a whole is captured with a [Trace](/concepts#traces). +- A [Trace](/concepts#traces) contains multiple [Spans](/concepts#spans), which are the steps inside your pipeline. + - A span can be an LLM call, a database query for a RAG retrieval, or a simple function transformation. + - Different types of [Spans](/concepts#spans) capture different parameters. + - [Spans](/concepts#spans) can be nested to capture the pipeline structure. +- [Traces](/concepts#traces) can be grouped together on LangWatch Dashboard by having the same [`thread_id`](/concepts#threads) in their metadata, making the individual messages become part of a conversation. + - It is also recommended to provide the [`user_id`](/concepts#user-id) metadata to track user analytics. + +## Create a Trace + +To capture traces and spans, start by adding the `@langwatch.trace()` decorator to the function that starts your LLM pipeline. Here it is represented by the `main()` function, but it can be your endpoint call or your class method that triggers the whole generation. + +```python +import langwatch + +@langwatch.trace() +def main(): + ... +``` + +This is the main entry point for your trace, and all spans called from here will be collected automatically to LangWatch in the background. + + +On short-live environments like Lambdas or Serverless Functions, be sure to call
`langwatch.get_current_trace().send_spans()` before your trace function ends to wait for all pending requests to be sent before the runtime is destroyed. +
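+For example, a minimal sketch of what this could look like in a serverless-style handler (the handler signature is illustrative, not part of the SDK):
+
+```python
+import langwatch
+
+@langwatch.trace()
+def handler(event, context):  # illustrative serverless entrypoint
+    ...  # your LLM pipeline runs here
+
+    # flush all pending spans before the function returns and the runtime is frozen or destroyed
+    langwatch.get_current_trace().send_spans()
+```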
+ +## Capturing LLM Spans + +LangWatch provides some utilities to automatically capture spans for popular LLM frameworks. + + +### OpenAI + +For OpenAI, you can use the `autotrack_openai_calls()` function to automatically capture LLM spans for OpenAI calls for the current trace. + +```python +import langwatch +from openai import OpenAI + +client = OpenAI() + +@langwatch.trace() +def main(): + langwatch.get_current_trace().autotrack_openai_calls(client) + ... +``` + +That's enough to have your OpenAI calls collected and visible on LangWatch dashboard: + +![OpenAI Spans](/images/integration/openai.png) + +### Azure + +For Azure OpenAI, you can use the `autotrack_openai_calls()` function to automatically capture LLM spans for Azure OpenAI calls for the current trace. + +```python +import langwatch +from openai import AzureOpenAI + +client = AzureOpenAI() + +@langwatch.trace() +def main(): + langwatch.get_current_trace().autotrack_openai_calls(client) + ... +``` + +That's enough to have your Azure OpenAI calls collected and visible on LangWatch dashboard: + +![Azure OpenAI Spans](/images/integration/azure.png) + +### LiteLLM + +You can use [LiteLLM](https://github.com/BerriAI/litellm) to call OpenAI, Anthropic, Gemini, Groq Llama 3 and over 100+ LLM models. + +And for tracking it all with LangWatch, you can use the `autotrack_litellm_calls()` function to automatically capture LLM spans for LiteLLM calls for the current trace. + +```python +import langwatch +import litellm + +@langwatch.trace() +def main(): + langwatch.get_current_trace().autotrack_litellm_calls(litellm) + + response = litellm.completion( + ... + ) +``` + + +Since we patch the `completion` method of the `litellm` module, you must use `litellm.completion()` instead of just `completion()` when calling your LLM, otherwise LangWatch will not be able to capture the spans. + + +That's enough to have your LiteLLM calls collected and visible on LangWatch dashboard: + +![LiteLLM Spans](/images/integration/litellm.png) + +### DSPy + +[DSPy](https://github.com/stanfordnlp/dspy) is the LLM framework that automatically optimizes prompts, you can use LangWatch both for [visualizing](/dspy-visualization/quickstart) the +optimization process, and for tracking the calls during inference as this guide shows. + +To track DSPy programs, you can use the `autotrack_dspy()` function to automatically capture DSPy modules forward pass, retrievers and LLM calls for the current trace. + +```python +import langwatch +import dspy + +@langwatch.trace() +def main(): + langwatch.get_current_trace().autotrack_dspy() + + program = MyDspyProgram() + response = program( + ... + ) +``` + +That's enough to have your DSPy traces collected and visible on LangWatch dashboard: + +![DSPy Spans](/images/integration/dspy.png) + +### LangChain + +For LangChain, you can automatically capture every step of your chain as a span by getting a LangChain callback for the current trace with `get_langchain_callback()`. + +```python +import langwatch + +@langwatch.trace() +def main(): + ... + chain.invoke( + {"input": user_input}, + # Add the LangWatch callback when invoking your chain + {"callbacks": [langwatch.get_current_trace().get_langchain_callback()]}, + ) +``` + +That's enough to have your LangChain calls collected and visible on LangWatch dashboard: + +![LangChain Spans](/images/integration/langchain.png) + +Check out for more python integration examples on the [examples folder on our GitHub repo](https://github.com/langwatch/langwatch/tree/main/python-sdk/examples). 
+ +## Adding metadata + +You can add metadata to track the user_id and current conversation thread_id, this is highly recommended to unlock better conversation grouping and user analytics on LangWatch. + +```python +import langwatch + +@langwatch.trace() +def main(): + langwatch.get_current_trace().update(metadata={"user_id": "user_id", "thread_id": "thread_id"}) + ... +``` + +You can also add custom labels to your trace to help you better filter and group your traces, or even trigger specific evaluations and alerts. + +```python +import langwatch + +@langwatch.trace() +def main(): + langwatch.get_current_trace().update(metadata={"labels": ["production"]}) + ... +``` + +Check out the [reference](./reference#trace) to see all the available trace properties. + +## Changing the Message Input and Output + +By default, the main input and output of the trace displayed on LangWatch is captured from the arguments and return value of +the top-level decorated function and heuristics try to extract the human-readable message from it automatically. + +However, sometimes more complex structures are used and the messages might not end up very human-readable on LangWatch, for example: + +![Raw Input and Output](/images/integration/message-raw-input-output.png) + +To make the messages really easy to read in the list and through the whole conversation, you can manually set what +should the input and output of the trace be, by calling `.update(input=...)` and `.update(output=...)` on the current trace: + +```python +import langwatch + +@langwatch.trace() +def main(inputs): + # Update the input of the trace with the user message or any other human-readable text + langwatch.get_current_trace().update(input=inputs.question) + + ... + + # Then, before returning, update the output of the trace with final response + langwatch.get_current_trace().update(output=response) + + return response +``` + +This will make the messages on LangWatch look like this: + +![Custom Input and Output](/images/integration/message-custom-input-output.png) + +## Capturing a RAG span + +RAG is a combination of a retrieval and a generation step, LangWatch provides a special span type for RAG that captures both steps separately which allows to capture the `contexts` being used by the LLM on your pipeline. +By capturing the `contexts`, you unlock various uses of it on LangWatch, like RAG evaluators such as Faitfhfulness and Context Relevancy, and analytics on which documents are being used the most. 
+ + +### RAG Span + +To capture a RAG span, you can use the `@langwatch.span(type="rag")` decorator, along with a call to `.update()` to add the `contexts` to the span: + +```python +@langwatch.span(type="rag") +def rag_retrieval(): + # the documents you retrieved from your vector database + search_results = ["France is a country in Europe.", "Paris is the capital of France."] + + # capture them on the span contexts before returning + langwatch.get_current_span().update(contexts=search_results) + + return search_results +``` + +If you have document or chunk ids from the results, we recommend you can to capture them along with the id using `RAGChunk`, as this allows them to be grouped together and generate documents analytics on LangWatch dashboard: + +```python +from langwatch.types import RAGChunk + +@langwatch.span(type="rag") +def rag_retrieval(): + # the documents you retrieved from your vector database + search_results = [ + { + "id": "doc-1", + "content": "France is a country in Europe.", + }, + { + "id": "doc-2", + "content": "Paris is the capital of France.", + }, + ] + + # capture then on the span contexts with RAGChunk before returning + langwatch.get_current_span().update( + contexts=[ + RAGChunk( + document_id=document["id"], + content=document["content"], + ) + for document in search_results + ] + ) + + return search_results +``` + +Then you'll be able to see the captured contexts that will also be used later on for evaluatios on LangWatch dashboard: + +![RAG Spans](/images/integration/rag.png) + +### LangChain + +When using LangChain, generally your RAG happens by calling a [`Retriever`](https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/). + +We provide a utility `langwatch.langchain.capture_rag_from_retriever` to capture the documents found by the retriever and convert it into a LangWatch compatible format for tracking. For that you need to pass the retriever as first argument, and then a function to map each document to a `RAGChunk`, like in the example below: + +```python +import langwatch +from langwatch.types import RAGChunk + +@langwatch.trace() +def main(): + retriever = ... + retriever_tool = create_retriever_tool( + langwatch.langchain.capture_rag_from_retriever( + retriever, + lambda document: RAGChunk( + document_id=document.metadata["source"], + content=document.page_content + ), + ), + "langwatch_search", + "Search for information about LangWatch. For any questions about LangWatch, use this tool if you didn't already", + ) + + tools = [retriever_tool] + model = ChatOpenAI(streaming=True) + prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are a helpful assistant that only reply in short tweet-like responses, using lots of emojis and use tools only once.\n\n{agent_scratchpad}", + ), + ("human", "{question}"), + ] + ) + agent = create_tool_calling_agent(model, tools, prompt) + executor = AgentExecutor(agent=agent, tools=tools, verbose=True) + return executor.invoke(user_input, config=RunnableConfig( + callbacks=[langwatch.get_current_trace().get_langchain_callback()] + )) +``` + +Alternatively, if you don't use retrievers, but still want to capture the context for example from a tool call that you do, we also provide a utility `langwatch.langchain.capture_rag_from_tool` to capture RAG contexts around a tool. 
For that you need to pass the tool as first argument, and then a function to map the tool's output to `RAGChunk`s, like in the example below: + +```python +import langwatch +from langwatch.types import RAGChunk + +@langwatch.trace() +def main(): + my_custom_tool = ... + wrapped_tool = langwatch.langchain.capture_rag_from_tool( + my_custom_tool, lambda response: [ + RAGChunk( + document_id=response["id"], # optional + chunk_id=response["chunk_id"], # optional + content=response["content"] + ) + ] + ) + + tools = [wrapped_tool] # use the new wrapped tool in your agent instead of the original one + model = ChatOpenAI(streaming=True) + prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are a helpful assistant that only reply in short tweet-like responses, using lots of emojis and use tools only once.\n\n{agent_scratchpad}", + ), + ("human", "{question}"), + ] + ) + agent = create_tool_calling_agent(model, tools, prompt) + executor = AgentExecutor(agent=agent, tools=tools, verbose=True) + return executor.invoke(user_input, config=RunnableConfig( + callbacks=[langWatchCallback] + )) +``` + +Then you'll be able to see the captured contexts that will also be used later on for evaluatios on LangWatch dashboard: + +![RAG Spans](/images/integration/langchain-rag.png) + +## Capturing other spans + +To be able to inspect and debug each step of your pipeline along with the LLM calls, you can use the `@langwatch.span()` decorator. You can pass in different `type`s to categorize your spans. + +```python +import langwatch + +@langwatch.span() +def database_query(): + ... + +@langwatch.span(type="tool") +def weather_forecast(city: str): + ... + +@langwatch.span(type="rag") +def rag_retrieval(): + ... + +# You can manually track llm calls too if the automatic capture is not enough for your use case +@langwatch.span(type="llm") +def llm_call(): + ... + +@langwatch.trace() +def main(): + ... +``` + +The input and output of the decorated function are automatically captured in the span, to disable that, you can set `capture_input` and `capture_output` to `False`: + +```python +@langwatch.span(capture_input=False, capture_output=False) +def database_query(): + ... +``` + +You can also modify the current spans attributes, either on the decorator by using `.update()` on the current span: + +```python +@langwatch.span(type="llm", name="custom_name") +def llm_call(): + langwatch.get_current_span().update(model="my-custom-model") + ... +``` + +Check out the [reference](./reference#span) to see all the available span properties. + +## Capturing custom evaluation results + +[LangWatch Evaluators](/evaluations/overview) can run automatically on your traces, but if you have an in-house custom evaluator, you can also capture the evaluation +results of your custom evaluator on the current trace or span by using the `.add_evaluation` method: + +```python +import langwatch + +@langwatch.span(type="evaluation") +def evaluation_step(): + ... # your custom evaluation logic + + langwatch.get_current_span().add_evaluation( + name="custom evaluation", # required + passed=True, + score=0.5, + label="category_detected", + details="explanation of the evaluation results", + ) +``` + +The evaluation `name` is required and must be a string. The other fields are optional, but at least one of `passed`, `score` or `label` must be provided. 
+ +## Synchronizing your message IDs with LangWatch traces + +If you store the messages in a database on your side as well, you set the `trace_id` of the current trace to the same one of the message on your side, this way your system will be in sync with LangWatch traces, making it easier to investigate later on. + +```python +@langwatch.trace() +def main(): + ... + langwatch.get_current_trace().update(trace_id=message_id) + ... +``` +--- + +# FILE: ./integration/typescript/guide.mdx + +--- +title: TypeScript Integration Guide +sidebarTitle: Guide +--- + +
+[badges: LangWatch TypeScript Repo | LangWatch TypeScript SDK version]
+ +LangWatch library is the easiest way to integrate your TypeScript application with LangWatch, the messages are synced on the background so it doesn't intercept or block your LLM calls. + +#### Prerequisites + +- Obtain your `LANGWATCH_API_KEY` from the [LangWatch dashboard](https://app.langwatch.com/). + +#### Installation + +```sh +npm install langwatch +``` + +#### Configuration + +Ensure `LANGWATCH_API_KEY` is set: + + +### Environment variable + +```bash .env +LANGWATCH_API_KEY='your_api_key_here' +``` + +### Client parameters + +```typescript +import { LangWatch } from 'langwatch'; + +const langwatch = new LangWatch({ + apiKey: 'your_api_key_here', +}); +``` + +## Basic Concepts + +- Each message triggering your LLM pipeline as a whole is captured with a [Trace](/concepts#traces). +- A [Trace](/concepts#traces) contains multiple [Spans](/concepts#spans), which are the steps inside your pipeline. + - A span can be an LLM call, a database query for a RAG retrieval, or a simple function transformation. + - Different types of [Spans](/concepts#spans) capture different parameters. + - [Spans](/concepts#spans) can be nested to capture the pipeline structure. +- [Traces](/concepts#traces) can be grouped together on LangWatch Dashboard by having the same [`thread_id`](/concepts#threads) in their metadata, making the individual messages become part of a conversation. + - It is also recommended to provide the [`user_id`](/concepts#user-id) metadata to track user analytics. + + +## Integration + + +### Vercel AI SDK + +The Vercel AI SDK supports tracing via Next.js OpenTelemetry integration. By using the `LangWatchExporter`, you can automatically collect those traces to LangWatch. + +First, you need to install the necessary dependencies: + +```bash +npm install @vercel/otel langwatch @opentelemetry/api-logs @opentelemetry/instrumentation @opentelemetry/sdk-logs +``` + +Then, set up the OpenTelemetry for your application, follow one of the tabs below depending whether you are using AI SDK with Next.js or on Node.js: + + +### Next.js + +You need to enable the `instrumentationHook` in your `next.config.js` file if you haven't already: + +```javascript +/** @type {import('next').NextConfig} */ +const nextConfig = { + experimental: { + instrumentationHook: true, + }, +}; + +module.exports = nextConfig; +``` + +Next, you need to create a file named `instrumentation.ts` (or `.js`) in the __root directory__ of the project (or inside `src` folder if using one), with `LangWatchExporter` as the traceExporter: + +```typescript +import { registerOTel } from '@vercel/otel' +import { LangWatchExporter } from 'langwatch' + +export function register() { + registerOTel({ + serviceName: 'next-app', + traceExporter: new LangWatchExporter({ + apiKey: process.env.LANGWATCH_API_KEY + }) + }) +} +``` + +(Read more about Next.js OpenTelemetry configuration [on the official guide](https://nextjs.org/docs/app/building-your-application/optimizing/open-telemetry#manual-opentelemetry-configuration)) + +Finally, enable `experimental_telemetry` tracking on the AI SDK calls you want to trace: + +```typescript +const result = await generateText({ + model: openai('gpt-4o-mini'), + prompt: 'Explain why a chicken would make a terrible astronaut, be creative and humorous about it.', + experimental_telemetry: { + isEnabled: true, + // optional metadata + metadata: { + userId: "myuser-123", + threadId: "mythread-123", + }, + }, +}); +``` + +### Node.js + +For Node.js, start by following the official OpenTelemetry guide: + +- 
[OpenTelemetry Node.js Getting Started](https://opentelemetry.io/docs/languages/js/getting-started/nodejs/) + +Once you have set up OpenTelemetry, you can use the `LangWatchExporter` to automatically send your traces to LangWatch: + +```typescript +import { LangWatchExporter } from 'langwatch' + +const sdk = new NodeSDK({ + traceExporter: new LangWatchExporter({ + apiKey: process.env.LANGWATCH_API_KEY + }), + // ... +}); +``` + +That's it! Your messages will now be visible on LangWatch: + +![Vercel AI SDK](/images/integration/vercel-ai-sdk.png) + +## Example Project + +You can find a full example project with a more complex pipeline and Vercel AI SDK and LangWatch integration [on our GitHub](https://github.com/langwatch/langwatch/blob/main/typescript-sdk/example/lib/chat/vercel-ai.tsx). + +## Manual Integration + +The docs from here below are for manual integration, in case you are not using the Vercel AI SDK OpenTelemetry integration, +you can manually start a trace to capture your messages: + +```typescript +import { LangWatch } from 'langwatch'; + +const langwatch = new LangWatch(); + +const trace = langwatch.getTrace({ + metadata: { threadId: "mythread-123", userId: "myuser-123" }, +}); +``` + +Then, you can start an LLM span inside the trace with the input about to be sent to the LLM. + +```typescript +import { convertFromVercelAIMessages } from 'langwatch' + +const span = trace.startLLMSpan({ + name: "llm", + model: model, + input: { + type: "chat_messages", + value: convertFromVercelAIMessages(messages) + }, +}); +``` + +This will capture the LLM input and register the time the call started. Once the LLM call is done, end the span to get the finish timestamp to be registered, and capture the output and the token metrics, which will be used for cost calculation, e.g.: + +```typescript +span.end({ + output: { + type: "chat_messages", + value: convertFromVercelAIMessages(output), // assuming output is Message[] + }, + metrics: { + promptTokens: chatCompletion.usage?.prompt_tokens, + completionTokens: chatCompletion.usage?.completion_tokens, + }, +}); +``` + + +### OpenAI + +Start by initializing LangWatch client and creating a new trace to capture your messages: + +```typescript +import { LangWatch } from 'langwatch'; + +const langwatch = new LangWatch(); + +const trace = langwatch.getTrace({ + metadata: { threadId: "mythread-123", userId: "myuser-123" }, +}); +``` + +Then to capture your LLM calls, you can start an LLM span inside the trace with the input about to be sent to the LLM. + +First, define the model and the messages you are going to use for your LLM call separately, so you can capture them: + +```typescript +import { OpenAI } from "openai"; + +// Model to be used and messages that will be sent to the LLM +const model = "gpt-4o" +const messages : OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: "You are a helpful assistant." }, + { + role: "user", + content: "Write a tweet-size vegetarian lasagna recipe for 4 people.", + }, +] +``` + +Then, start the LLM span from the trace, giving it the model and input messages: + +```typescript +const span = trace.startLLMSpan({ + name: "llm", + model: model, + input: { + type: "chat_messages", + value: messages + }, +}); +``` + +This will capture the LLM input and register the time the call started. 
Now, continue with the LLM call normally, using the same parameters: + +```typescript +const openai = new OpenAI(); +const chatCompletion = await openai.chat.completions.create({ + messages: messages, + model: model, +}); +``` + +Finally, after the OpenAI call is done, end the span to get the finish timestamp to be registered, and capture the output and the token metrics, which will be used for cost calculation: + +```typescript +span.end({ + output: { + type: "chat_messages", + value: [chatCompletion.choices[0]!.message], + }, + metrics: { + promptTokens: chatCompletion.usage?.prompt_tokens, + completionTokens: chatCompletion.usage?.completion_tokens, + }, +}); +``` + +### Azure + +Start by initializing LangWatch client and creating a new trace to capture your messages: + +```typescript +import { LangWatch } from 'langwatch'; + +const langwatch = new LangWatch(); + +const trace = langwatch.getTrace({ + metadata: { threadId: "mythread-123", userId: "myuser-123" }, +}); +``` + +Then to capture your LLM calls, you can start an LLM span inside the trace with the input about to be sent to the LLM. + +First, define the model and the messages you are going to use for your LLM call separately, so you can capture them: + +```typescript +import { AzureOpenAI } from "openai"; + +// Model to be used and messages that will be sent to the LLM +const model = "gpt-4-turbo-2024-04-09" +const messages : OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: "You are a helpful assistant." }, + { + role: "user", + content: "Write a tweet-size vegetarian lasagna recipe for 4 people.", + }, +] +``` + +Then, start the LLM span from the trace, giving it the model and input messages: + +```typescript +const span = trace.startLLMSpan({ + name: "llm", + model: model, + input: { + type: "chat_messages", + value: messages + }, +}); +``` + +This will capture the LLM input and register the time the call started. Now, continue with the LLM call normally, using the same parameters: + +```typescript +const openai = new AzureOpenAI({ + apiKey: process.env.AZURE_OPENAI_API_KEY, + apiVersion: "2024-02-01", + endpoint: process.env.AZURE_OPENAI_ENDPOINT, +}); +const chatCompletion = await openai.chat.completions.create({ + messages: messages, + model: model, +}); +``` + +Finally, after the OpenAI call is done, end the span to get the finish timestamp to be registered, and capture the output and the token metrics, which will be used for cost calculation: + +```typescript +span.end({ + output: { + type: "chat_messages", + value: [chatCompletion.choices[0]!.message], + }, + metrics: { + promptTokens: chatCompletion.usage?.prompt_tokens, + completionTokens: chatCompletion.usage?.completion_tokens, + }, +}); +``` + +### LangChain.js + +Start by initializing LangWatch client and creating a new trace to capture your chain: + +```typescript +import { LangWatch } from 'langwatch'; + +const langwatch = new LangWatch(); + +const trace = langwatch.getTrace({ + metadata: { threadId: "mythread-123", userId: "myuser-123" }, +}); +``` + +Then, to capture your LLM calls and all other chain steps, LangWatch provides a callback hook for LangChain.js that automatically tracks everything for you. 
+ +First, define your chain as you would normally do: + +```typescript +import { StringOutputParser } from '@langchain/core/output_parsers' +import { ChatPromptTemplate } from '@langchain/core/prompts' +import { ChatOpenAI } from '@langchain/openai' + +const prompt = ChatPromptTemplate.fromMessages([ + ['system', 'Translate the following from English into Italian'], + ['human', '{input}'] +]) +const model = new ChatOpenAI({ model: 'gpt-3.5-turbo' }) +const outputParser = new StringOutputParser() + +const chain = prompt.pipe(model).pipe(outputParser) +``` + +Now, when calling your chain either with `invoke` or `stream`, pass in `trace.getLangChainCallback()` as one of the callbacks: + +```typescript +const stream = await chain.stream( + { input: message }, + { callbacks: [trace.getLangChainCallback()] } +) +``` + +That's it! The full trace with all spans for each chain step will be sent automatically to LangWatch in the background on periodic intervals. After capturing your first LLM Span, go to [LangWatch Dashboard](https://app.langwatch.com), your message should be there! + + +On short-live environments like Lambdas or Serverless Functions, be sure to call
`await trace.sendSpans();` to wait for all pending requests to be sent before the runtime is destroyed. +
+ +## Capture a RAG Span + +Appart from LLM spans, another very used type of span is the RAG span. This is used to capture the retrieved contexts from a RAG that will be used by the LLM, and enables a whole new set of RAG-based features evaluations for RAG quality on LangWatch. + +To capture a RAG, you can simply start a RAG span inside the trace, giving it the input query being used: + +```typescript +const ragSpan = trace.startRAGSpan({ + name: "my-vectordb-retrieval", // optional + input: { type: "text", value: "search query" }, +}); + +// proceed to do the retrieval normally +``` + +Then, after doing the retrieval, you can end the RAG span with the contexts that were retrieved and will be used by the LLM: + +```typescript +ragSpan.end({ + contexts: [ + { + documentId: "doc1", + content: "document chunk 1", + }, + { + documentId: "doc2", + content: "document chunk 2", + }, + ], +}); +``` + + +On LangChain.js, RAG spans are captured automatically by the LangWatch callback when using LangChain Retrievers, with `source` as the documentId. + + +## Capture an arbritary Span + +You can also use generic spans to capture any type of operation, its inputs and outputs, for example for a function call: + +```typescript +// before the function starts +const span = trace.startSpan({ + name: "weather_function", + input: { + type: "json", + value: { + city: "Tokyo", + }, + }, +}); + +// ...after the function ends +span.end({ + output: { + type: "json", + value: { + weather: "sunny", + }, + }, +}); +``` + +You can also nest spans one inside the other, capturing your pipeline structure, for example: + +```typescript +const span = trace.startSpan({ + name: "pipeline", +}); + +const nestedSpan = span.startSpan({ + name: "nested_pipeline", +}); + +nestedSpan.end() + +span.end() +``` + +Both LLM and RAG spans can also be nested like any arbritary span. + +## Capturing Exceptions + +To capture also when your code throws an exception, you can simply wrap your code around a try/catch, and update or end the span with the exception: + +```typescript +try { + throw new Error("unexpected error"); +} catch (error) { + span.end({ + error: error, + }); +} +``` + +## Capturing custom evaluation results + +[LangWatch Evaluators](/evaluations/overview) can run automatically on your traces, but if you have an in-house custom evaluator, you can also capture the evaluation +results of your custom evaluator on the current trace or span by using the `.addEvaluation` method: + +```typescript +import { type LangWatchTrace } from "langwatch"; + +async function llmStep({ message, trace }: { message: string, trace: LangWatchTrace }): Promise { + const span = trace.startLLMSpan({ name: "llmStep" }); + + // ... your existing code + + span.addEvaluation({ + name: "custom evaluation", + passed: true, + score: 0.5, + label: "category_detected", + details: "explanation of the evaluation results", + }); +} +``` + +The evaluation `name` is required and must be a string. The other fields are optional, but at least one of `passed`, `score` or `label` must be provided. +--- + +# FILE: ./integration/rest-api.mdx + +--- +title: REST API Integration +--- + +If your preferred programming language or platform is not directly supported by the existing LangWatch libraries, you can use the REST API with `curl` to send trace data. This guide will walk you through how to integrate LangWatch with any system that allows HTTP requests. + +**Prerequisites:** + +- Ensure you have `curl` installed on your system. 
+ +**Configuration:** + +Set the `LANGWATCH_API_KEY` environment variable in your environment: + +```bash +export LANGWATCH_API_KEY='your_api_key_here' +``` + +**Usage:** + +You will need to prepare your span data in accordance with the Span type definitions provided by LangWatch. Below is an example of how to send span data using curl: + + 1. Prepare your JSON data. Make sure it's properly formatted as expected by LangWatch. + 2. Use the curl command to send your trace data. Here is a basic template: + +```bash +# Set your API key and endpoint URL +LANGWATCH_API_KEY="your_langwatch_api_key" +LANGWATCH_ENDPOINT="https://app.langwatch.ai" + +# Use curl to send the POST request, e.g.: +curl -X POST "$LANGWATCH_ENDPOINT/api/collector" \ + -H "X-Auth-Token: $LANGWATCH_API_KEY" \ + -H "Content-Type: application/json" \ + -d @- < +On LangChain.js, RAG spans are captured automatically by the LangWatch callback when using LangChain Retrievers, with `source` as the documentId. + + +### REST API + +To track the RAG context when using the REST API, add a new span of type `rag`, you may also refer the LLM generation as the child of it: + +```bash +curl -X POST "https://app.langwatch.ai/api/collector" \\ + -H "X-Auth-Token: $API_KEY" \\ + -H "Content-Type: application/json" \\ + -d @- < + +In the below screenshot you will see where you can select the dataset you want to evaluate on as well as selecting which evaluations you would like to run. Each tab has different evaluation you can choose from. + +LangWatch + +In the screenshot below, you'll find a Python code snippet ready for execution to perform your batch processing. The parameters passed into the `BatchEvaluation` include your chosen dataset and an array of selected evaluations to run against it. + +LangWatch + +We've streamlined the process by setting up pandas for you, enabling seamless evaluation of datasets directly on the results object. This means you can leverage the power of pandas' data manipulation and analysis capabilities effortlessly within your evaluation workflow. With pandas at your disposal, you can efficiently explore, analyze, and manipulate your data to derive valuable insights without the need for additional setup or configuration. + +### Python snippet + +When executing the snippet, you'll encounter a callback function at your disposal. This function contains the original entry data, allowing you to run it against your own Large Language Model (LLM). You can utilize this response to compare results within your evaluation process. + +Ensure that you return the `output` as some evaluations may require it. As you create your code snippet in the evaluations tab, you'll notice indications of which evaluations necessitate particular information. Utilize this guidance as a reference to kickstart your workflow effectively. + +--- + +# FILE: ./features/triggers.mdx + +--- +title: Triggers +--- + +## Create triggers based on LangWatch filters + +LangWatch offers you the possibility to create triggers based on your selected filters. You can use these triggers to send notifications to either Slack or selected team email adresses. + +#### Usage + +To create a trigger in the LangWatch dashboard, follow these steps: + +- Click the filter button located at the top right of the LangWatch dashboard. +- After creating a filter, a trigger button will appear. +- Click the trigger button to open a popout drawer. +- In the drawer, you can configure your trigger with the desired settings. 
+ +LangWatch +**Trigger actions** +LangWatch + +Once the trigger is created, you will receive an alert whenever a message meets the criteria of the trigger. These trigger checks are run on the minute but not instantaneously, as the data needs time to be processed. You can find the created triggers under the Settings section, where you can deactivate or delete a trigger to stop receiving notifications. + +**Trigger settings** + +LangWatch + +--- + +# FILE: ./features/embedded-analytics.mdx + +--- +title: Embedded Analytics +--- + +## Export Analytics with REST Endpoint + +LangWatch offers you the possibility to build and integrate LangWatch graph's on your own systems and applications, to display it to your customers in another interface. + +On LangWatch dashboard, you can use our powerful custom chart builder tool, to plot any data collected and generated by LangWatch, and customize the way you want to display it. You can then use our REST API to fetch the graph data. + +**Usage:** +You will need to obtain your JSON payload from the custom graph section in our application. You can find this on the Analytics page > Custom Reports > Add chart. + + 1. Pick the custom graph you want to get the analytics for. + 2. Prepare your JSON data. Make sure it's is the same format that is showing in the LangWatch application. + 3. Use the `curl` command to get you analytics data. Here is a basic template: + +```bash +# Set your API key and endpoint URL +API_KEY="your_langwatch_api_key" +ENDPOINT="https://app.langwatch.ai/api/analytics" + +# Use curl to send the POST request, e.g.: +curl -X POST "$ENDPOINT" \ + -H "X-Auth-Token: $API_KEY" \ + -H "Content-Type: application/json" \ + -d @- < + +Within this modal, you'll find the JSON payload required for the precise custom analytics +data. Simply copy this payload and paste it into the body of your REST POST request. + +LangWatch +Now you're fully prepared to access your customized analytics and seamlessly integrate +them into your specific use cases. + +If you encounter any hurdles or have questions, our support team is eager to assist you. + +--- + +# FILE: ./features/annotations.mdx + +--- +title: Annotations +--- + +## Create annotations on messages + +With annotations, you can add additional information to messages. This can be useful to comment on or add any other information that you want to add to a message for further analysis. + +We have also implemented the option to add a scoring system for each annotation, more information about this can be found in the [Annotation Scoring](/features/annotations#annotation-scoring) section + +If you want to add an annotation to a queue, you can do so by clicking on the add to queue button to send the messages to the queue for later analysis. You can create queues and add members to them on the the main annotations page. More information about this can be found in the [Annotation Queues](/features/annotations#annotation-queues) section. + +#### Usage + +To create an annotation, follow these steps: + +1) Click the message you want to annotate on and a [Trace](/concepts#traces) details drawer will open. +2) On the top right, click the annotation button. +3) Here you will be able to add a comment, a link or any other information that you want to add to the message. + + +LangWatch + + +Once you have created an annotation, you will see it next to to the message. + + +LangWatch + + +# Annotation Scoring + +We have developed a customized scoring system for each annotation. 
To get started, you will need to create your scores on the settings page. + +There are two types of score data you can choose from: + +- **Checkbox**: To add multiple selectable options. +- **Multiple Choice**: To add a single selectable option. + + +LangWatch + +After you have created your scores, you can activate or deactivate them on the settings page. + +LangWatch + +Once your scores are activated, you will see them in the annotations tab. For each annotation you create, the score options will be available, allowing you to add more detailed information to your annotations. +When annotating a message, you will see the score options below the comment input. Once you have added a score, you will be asked for an optional reason for the score. + +
+ +Thats it! You can now annotate messages and add your custom score metrics to them. + +# Annotation Queues + +To get started with annotation queues, follow these steps: + +1) Go to the annotations page. +2) Click the plus button to create a new queue. +3) Add a name for your queue, description, members and click on the "Save" button. + +LangWatch + +Once you have created your queue, you will be able to select this when creating an annotation and send the messages to the queue or directly to a project member for later analysis. + +LangWatch + +Once you add an item to the queue, you can view it in the annotations section, whether it's in a queue or sent directly to you. + +LangWatch + +When clicking on a queue item, you will be directed to the message where you can add an annotation. Once happy with your annotation, you can click on the "Done" button and move on to the next item. + +LangWatch + +Once youโ€™ve completed the final item in the queue, youโ€™ll see that all tasks are done. Thatโ€™s it! Happy annotating! + +LangWatch + + +--- + +# FILE: ./features/datasets.mdx + +--- +title: Datasets +--- + +## Create datasets + +LangWatch offers you the possibility to create datasets on your LLM messages. These datasets can be used to train your own models or to do further analysis on the data. +We offer the possibility to create datasets with the following data types; + +- **Input**: The message input string. +- **Expected Output**: The gold-standard expected output for the given input, + useful for output-comparison metrics +- **Contexts**: The contexts provided if your are doing RAG, useful + for RAG-metric evaluations +- **[Spans](/concepts#spans)**: A JSON with all the spans contained in the message + trace, that is, all the steps in your pipeline, for + more complex evaluations +- **LLM Input**: The input the LLM received, in LLM chat history json + format +- **Expected LLM Output**: The gold-standard expected output for the given input, + in LLM chat history json format. +- **Annotation Scores**: The scores of the annotations, useful for annotation-comparison metrics +- **Evaluation Metrics**: The evaluation metrics for the dataset, useful for evaluation-comparison metrics + +#### Usage + +To create a dataset, simply go to the datasets page and click the "Create New Dataset" button. You will be able to select the type of dataset you want as well as the columns you want to include. + +LangWatch + +There are a couple ways to add data to a dataset; + +- **Manually**: You can add data on a per message basis. +- **Group selection**: You can fill the dataset by selecting a group of messages. +- **CSV Upload**: You can fill the dataset by uploading a CSV file. + +### Manually + +To add data manually, click the "Add to Dataset" button on the messages page after selecting a message. You will then be able to choose the dataset type and preview the data that will be added. + +LangWatch + +### Group selection + +To add data by selecting a group, simply click the "Add to Dataset" button after choosing the desired messages in the table view. You'll then be able to select the type of dataset you wish to add to and preview the data that will be included. + +LangWatch +### CSV Upload + +To add data by CSV upload, go to your datasets page and select the dataset you want to update. Click the "Upload CSV" button and upload your CSV file. You can then map the columns from your CSV file to the appropriate fields in the dataset based on the dataset type. 
+ +LangWatch + +--- + +# FILE: ./evaluations/overview.mdx + +--- +title: Evaluations +--- + +LangWatch offers an extensive library of evaluators to help you evaluate the quality and guarantee the safety of your LLM apps. +Those are very easy to set up on [LangWatch dashboard](https://app.langwatch.com/). + +![Evaluators](/images/screenshot-evaluators.png) + +## Evaluators List + + + + | Evaluator | Description | + | -----------------------------------------|----------------------------| + | [Azure Jailbreak Detection](/langevals/api-reference/endpoint/azure-jailbreak-detection) | This evaluator checks for jailbreak-attempt in the input using Azure's Content Safety API. | + | [Azure Content Safety](/langevals/api-reference/endpoint/content-safety) | This evaluator detects potentially unsafe content in text, including hate speech, self-harm, sexual content, and violence. It allows customization of the severity threshold and the specific categories to check. | + | [Google Cloud DLP PII Detection](/langevals/api-reference/endpoint/google-cloud-dlp-pii-detection) | Google DLP PII detects personally identifiable information in text, including phone numbers, email addresses, and social security numbers. It allows customization of the detection threshold and the specific types of PII to check. | + | [Llama Guard](/langevals/api-reference/endpoint/llama-guard) | This evaluator is a special version of Llama trained strictly for acting as a guardrail, following customizable guidelines. It can work both as a safety evaluator and as policy enforcement. | + | [OpenAI Moderation](/langevals/api-reference/endpoint/openai-moderation) | This evaluator uses OpenAI's moderation API to detect potentially harmful content in text, including harassment, hate speech, self-harm, sexual content, and violence. | + + + | Evaluator | Description | + | -----------------------------------------|----------------------------| + | [Competitor LLM check](/langevals/api-reference/endpoint/competitor-detection-llm) | This evaluator use an LLM-as-judge to check if the conversation is related to competitors, without having to name them explicitly | + | [Off Topic Evaluator](/langevals/api-reference/endpoint/off-topic-detection) | This evaluator checks if the user message is concerning one of the allowed topics of the chatbot | + | [Competitor Blocklist](/langevals/api-reference/endpoint/competitor-blocklist) | This evaluator checks if any of the specified competitors was mentioned | + | Product Sentiment Polarity | For messages about products, this evaluator checks for the nuanced sentiment direction of the LLM output, either very positive, subtly positive, subtly negative, or very negative. | + + + | Evaluator | Description | + | -----------------------------------------|----------------------------| + | [Lingua Language Detection](/langevals/api-reference/endpoint/lingua-language-detection) | This evaluator detects the language of the input and output text to check for example if the generated answer is in the same language as the prompt, or if it's in a specific expected language. | + | Query Resolution | This evaluator checks if all the user queries in the conversation were resolved. Useful to detect when the bot doesn't know how to answer or can't help the user. | + | [Ragas Context Recall](/langevals/api-reference/endpoint/ragas-context-recall) | This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance. 
| + | [Ragas Faithfulness](/langevals/api-reference/endpoint/ragas-faithfulness) | This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context. | + | [Ragas Context Utilization](/langevals/api-reference/endpoint/ragas-context-utilization) | This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization. | + | [Ragas Context Relevancy](/langevals/api-reference/endpoint/ragas-context-relevancy) | This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy. | + | [Ragas Context Precision](/langevals/api-reference/endpoint/ragas-context-precision) | This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision. | + | [Ragas Answer Relevancy](/langevals/api-reference/endpoint/ragas-answer-relevancy) | This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy. | + + + + | Evaluator | Description | + | -----------------------------------------|----------------------------| + | [Semantic Similarity Evaluator](/langevals/api-reference/endpoint/llm-similarity-evaluator) | Allows you to check for semantic similarity or dissimilarity between input and output and a target value, so you can avoid sentences that you don't want to be present without having to match on the exact text. | + | [Custom Basic Evaluator](/langevals/api-reference/endpoint/llm-basic-evaluator) | Allows you to check for simple text matches or regex evaluation. | + | [Custom LLM Boolean Evaluator](/langevals/api-reference/endpoint/llm-boolean-evaluator) | Use an LLM as a judge with a custom prompt to do a true/false boolean evaluation of the message. | + | [Custom LLM Score Evaluator](/langevals/api-reference/endpoint/llm-score-evaluator) | Use an LLM as a judge with custom prompt to do a numeric score evaluation of the message. | + + + +## Custom Evaluator Integration + +If you have a custom evaluator built in-house, you can follow the guide below to integrate. + + + + + +--- + +# FILE: ./evaluations/custom-evaluator-integration.mdx + +--- +title: Custom Evaluator Integration +--- + +If you have a custom evaluator built in-house which run on your own code, either during the LLM pipeline or after, you can still capture the evaluation results +and connect it back to the trace to visualize it together with the other LangWatch evaluators. + +### Python + + +You can capture the evaluation results of your custom evaluator on the current trace or span by using the `.add_evaluation` method: + +```python +import langwatch + +@langwatch.span(type="evaluation") +def evaluation_step(): + ... # your custom evaluation logic + + langwatch.get_current_span().add_evaluation( + name="custom evaluation", # required + passed=True, + score=0.5, + label="category_detected", + details="explanation of the evaluation results", + ) +``` + +The evaluation `name` is required and must be a string. The other fields are optional, but at least one of `passed`, `score` or `label` must be provided. 
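For context, here is a minimal sketch of how the `evaluation_step` function decorated above could be wired into a traced pipeline so the evaluation results land on the same trace; `my_llm_pipeline` is a placeholder for your own code, not a LangWatch API:

```python
import langwatch

def my_llm_pipeline(question: str) -> str:
    # placeholder for your own LLM call, not part of LangWatch
    return "some generated answer"

@langwatch.trace()
def main():
    output = my_llm_pipeline("user question")

    # evaluation_step() is the function decorated with @langwatch.span(type="evaluation") above;
    # calling it inside the trace attaches the evaluation results to this trace on LangWatch
    evaluation_step()

    return output
```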
+ + +### TypeScript + + +You can capture the evaluation results of your custom evaluator on the current trace or span by using the `.addEvaluation` method: + +```typescript +import { type LangWatchTrace } from "langwatch"; + +async function llmStep({ message, trace }: { message: string, trace: LangWatchTrace }): Promise { + const span = trace.startLLMSpan({ name: "llmStep" }); + + // ... your existing code + + span.addEvaluation({ + name: "custom evaluation", + passed: true, + score: 0.5, + label: "category_detected", + details: "explanation of the evaluation results", + }); +} +``` + +The evaluation `name` is required and must be a string. The other fields are optional, but at least one of `passed`, `score` or `label` must be provided. + + +### REST API + + +## REST API Specification + +### Endpoint + +`POST /api/collector` + +### Headers + +- `X-Auth-Token`: Your LangWatch API key. + +### Request Body + +```javascript +{ + "trace_id": "id of the message the evaluation was run on", + "evaluations": [{ + "evaluation_id": "evaluation-id-123", // optional unique id for identifying the evaluation, if not provided, a random id will be generated + "name": "custom evaluation", // required + "passed": true, // optional + "score": 0.5, // optional + "label": "category_detected", // optional + "details": "explanation of the evaluation results", // optional + "error": { // optional to capture error details in case evaluation had an error + "message": "error message", + "stacktrace": [], + }, + "timestamps": { // optional + "created_at": "1723411698506", // unix timestamp in milliseconds + "updated_at": "1723411698506" // unix timestamp in milliseconds + } + }] +} +``` + + +--- + +# FILE: ./guardrails/overview.mdx + +--- +title: Overview +--- + +Learn how you can protect your LLM application from costly mistakes by setting up guardrails. + + + + + +--- + +# FILE: ./guardrails/setting-up-guardrails.mdx + +--- +title: Setting Up Guardrails +--- + +Guardrails are protections you can add around your LLM calls, either before calling the LLM, for example to prevent jailbreaking; after calling an LLM, for example to verify if the generated output does not contain toxic language or leaking PII; or to steer the LLM in a different direction, for example when detecting a user is going off-topic or talking about competition, in which you might want to throw them in a different flow. + +Setting up Guardrails is quite easy, first, go to the Evaluation and Guardrails area on your [LangWatch dashboard](https://app.langwatch.ai), press + Add, and look for evaluators with the shield icon, those evaluators are the ones that support acting as Guardrails: + +Guardrails + +Then, change the Execution Mode to "As a Guardrail", on the page itself, you will see the instructions on how to integrate the guardrail to your code, after following the instructions, don't forget to click "Save" to create the Guardrail before trying it out. + +Guardrails + +Back to the Guardrail setup, you can also try it out on the messages already on LangWatch, to verify if the Guardrail is working well, of it some adjustments are needed, using the Try it out section: + +Guardrails + +You are now ready to keep your LLM protected and steer the conversation in the right direction with LangWatch Guardrails! Follow the next guides for examples on how to use Guardrails for handling different situations, and more advanced use cases. + +## What's next? 
+ +- (In progress) Using guardrails to prevent bad inputs from the LLM +- (In progress) Using guardrails to prevent bad outputs from the LLM to the user +- (In progress) Steering the conversation with another LLM call from the guardrail +- (In progress) Handling multiple guardrail calls in parallel +- (In progress) Speculative execution of the LLM in parallel to the guardrail call + +--- diff --git a/llms.txt.json b/llms.txt.json new file mode 100644 index 000000000..612603129 --- /dev/null +++ b/llms.txt.json @@ -0,0 +1,16 @@ +{ + "includePaths": [ + "introduction.mdx", + "concepts.mdx", + "integration/overview.mdx", + "integration/python/*.mdx", + "integration/typescript/*.mdx", + "integration/rest-api.mdx", + "integration/opentelemetry/*.mdx", + "integration/rags-context-tracking.mdx", + "features/*.mdx", + "evaluations/*.mdx", + "guardrails/*.mdx" + ], + "excludePaths": ["**/node_modules/**"] +} diff --git a/llms.txt.sh b/llms.txt.sh new file mode 100755 index 000000000..6d24306d0 --- /dev/null +++ b/llms.txt.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env node + +const fs = require('fs'); +const path = require('path'); +const { execSync } = require('child_process'); + +// Read the configuration from llms.txt.json +const config = JSON.parse(fs.readFileSync('llms.txt.json', 'utf8')); +const includePaths = config.includePaths; +const excludePaths = config.excludePaths || []; + +// Output file +const outputFile = 'llms.txt'; + +// Clear the output file if it exists +fs.writeFileSync(outputFile, "# LangWatch\n\n"); + +// Function to process imports in an MDX file +function processImports(content, filePath) { + // Find all import statements + const importRegex = /import\s+(\w+)\s+from\s+["']([^"']+)["'];?/g; + let modifiedContent = content; + const imports = {}; + + // Extract all imports + let match; + while ((match = importRegex.exec(content)) !== null) { + const importName = match[1]; + const importPath = match[2]; + + // Handle only imports from /snippets + if (importPath.startsWith('/snippets/')) { + const absoluteImportPath = path.join(process.cwd(), importPath.substring(1)); + + try { + if (fs.existsSync(absoluteImportPath)) { + // Read the imported file + const importedContent = fs.readFileSync(absoluteImportPath, 'utf8'); + imports[importName] = importedContent; + if (importName == "LLMsTxtProtip") { + imports[importName] = "" + } + } else { + console.warn(`Warning: Import file not found: ${absoluteImportPath}`); + } + } catch (err) { + console.error(`Error reading import file ${absoluteImportPath}: ${err.message}`); + } + } + } + + // Replace component references with their content + Object.keys(imports).forEach(componentName => { + // Simple replacement for pattern + const componentRegex = new RegExp(`<${componentName}\\s*\\/>`, 'g'); + modifiedContent = modifiedContent.replace(componentRegex, imports[componentName]); + }); + + // Remove import statements + modifiedContent = modifiedContent.replace(importRegex, ''); + + // Replace with ### X + modifiedContent = modifiedContent.replace(//g, '### $1\n'); + + // Remove tags + modifiedContent = modifiedContent.replace(/<\/Tab>/g, ''); + + // Remove and tags + modifiedContent = modifiedContent.replace(/|<\/Tabs>/g, ''); + + // Remove too many sequential newlines + modifiedContent = modifiedContent.replace(/\n\n\n\n+/g, '\n\n'); + + return modifiedContent; +} + +// Process each include path +includePaths.forEach(includePath => { + try { + // Create a find command to locate the files + let findCmd = `find . 
-type f -path "./${includePath}" 2>/dev/null || echo ""`; + + // Add exclude patterns if any + if (excludePaths.length > 0) { + excludePaths.forEach(excludePath => { + findCmd += ` | grep -v "${excludePath}"`; + }); + } + + // Execute the find command + const files = execSync(findCmd) + .toString() + .trim() + .split('\n') + .filter(file => file); // Remove empty lines + + // Process each matching file + files.forEach(file => { + console.log(`Processing: ${file}`); + try { + let content = fs.readFileSync(file, 'utf8'); + + // Process imports for MDX files + if (file.endsWith('.mdx')) { + content = processImports(content, file); + } + + // Remove trailing whitespaces + content = content.replace(/[ \t]+$/gm, ''); + + // Append to output file + fs.appendFileSync(outputFile, `# FILE: ${file}\n\n`); + fs.appendFileSync(outputFile, content); + fs.appendFileSync(outputFile, '\n---\n\n'); + } catch (err) { + console.error(`Error reading ${file}: ${err.message}`); + } + }); + } catch (error) { + // If there's an error with the command, log and continue + console.log(`Error with pattern: ${includePath}: ${error.message}`); + } +}); + +// Remove extra blank line at EOF +let finalContent = fs.readFileSync(outputFile, 'utf8'); +if (finalContent.endsWith('\n\n')) { + finalContent = finalContent.substring(0, finalContent.length - 1); + fs.writeFileSync(outputFile, finalContent); +} + +console.log(`Done! All matching files have been merged into ${outputFile}`); diff --git a/logo/dark.svg b/logo/dark.svg index a6283786c..6f2b41212 100644 --- a/logo/dark.svg +++ b/logo/dark.svg @@ -1,55 +1,13 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + diff --git a/logo/light.svg b/logo/light.svg index 582b3b95f..0cf260fa3 100644 --- a/logo/light.svg +++ b/logo/light.svg @@ -1,51 +1,13 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + diff --git a/mint.json b/mint.json index d6f7e33d6..1c9a9908f 100644 --- a/mint.json +++ b/mint.json @@ -1,91 +1,230 @@ { "$schema": "https://mintlify.com/schema.json", - "name": "Starter Kit", + "name": "LangWatch", "logo": { "dark": "/logo/dark.svg", "light": "/logo/light.svg" }, "favicon": "/favicon.svg", "colors": { - "primary": "#0D9373", - "light": "#07C983", - "dark": "#0D9373", + "primary": "#ed8926", + "light": "#f2a65a", + "dark": "#ed8926", "anchors": { - "from": "#0D9373", - "to": "#07C983" + "from": "#ed8926", + "to": "#ed8926" } }, "topbarLinks": [ + { + "name": "llms.txt", + "url": "https://docs.langwatch.ai/llms.txt" + }, { "name": "Support", - "url": "mailto:hi@mintlify.com" + "url": "/support#email-support" + }, + { + "name": "Dashboard", + "url": "https://app.langwatch.ai" } ], "topbarCtaButton": { - "name": "Dashboard", - "url": "https://dashboard.mintlify.com" + "type": "github", + "url": "https://github.com/langwatch/langwatch" }, "tabs": [ { - "name": "API Reference", - "url": "api-reference" + "name": "LangEvals", + "url": "langevals" } ], "anchors": [ { - "name": "Documentation", - "icon": "book-open-cover", - "url": "https://mintlify.com/docs" - }, - { - "name": "Community", - "icon": "slack", - "url": "https://mintlify.com/community" + "name": "Open Dashboard", + "icon": "link", + "url": "https://app.langwatch.ai" }, { - "name": "Blog", - "icon": "newspaper", - "url": "https://mintlify.com/blog" + "name": "GitHub Repo", + "icon": "github", + "url": "https://github.com/langwatch/langwatch" } ], 
"navigation": [ { - "group": "Get Started", + "group": "", + "pages": ["introduction", "self-hosting"] + }, + { + "group": "Optimization Studio", "pages": [ - "introduction", - "quickstart", - "development" + "optimization-studio/overview", + "optimization-studio/llm-nodes", + "optimization-studio/datasets", + "optimization-studio/evaluating", + "optimization-studio/optimizing" ] }, { - "group": "Essentials", + "group": "Monitoring Integration", "pages": [ - "essentials/markdown", - "essentials/code", - "essentials/images", - "essentials/settings", - "essentials/navigation", - "essentials/reusable-snippets" + "integration/overview", + { + "group": "Python", + "pages": ["integration/python/guide", "integration/python/reference"] + }, + { + "group": "TypeScript", + "pages": ["integration/typescript/guide"] + }, + { + "group": "OpenTelemetry", + "pages": ["integration/opentelemetry/guide"] + }, + "integration/langflow", + "integration/flowise", + "integration/rest-api", + "integration/rags-context-tracking", + "concepts", + "integration/cookbooks", + "integration/mcp" ] }, { - "group": "API Documentation", + "group": "Evaluations", "pages": [ - "api-reference/introduction" + "evaluations/overview", + "evaluations/custom-evaluator-integration" ] }, { - "group": "Endpoint Examples", + "group": "Guardrails", + "pages": ["guardrails/overview", "guardrails/setting-up-guardrails"] + }, + { + "group": "User Events", "pages": [ - "api-reference/endpoint/get", - "api-reference/endpoint/create", - "api-reference/endpoint/delete" + "user-events/overview", + { + "group": "Events", + "pages": [ + "user-events/thumbs-up-down", + "user-events/waited-to-finish", + "user-events/selected-text", + "user-events/custom" + ] + } ] + }, + { + "group": "DSPy Visualization", + "pages": [ + "dspy-visualization/quickstart", + "dspy-visualization/custom-optimizer", + "dspy-visualization/rag-visualization" + ] + }, + { + "group": "More Features", + "pages": [ + "features/triggers", + "features/annotations", + "features/datasets", + "features/embedded-analytics" + ] + }, + { + "group": "API Endpoints", + "pages": [ + { + "group": "Traces", + "pages": [ + "api-reference/traces/overview", + "api-reference/traces/get-trace-details", + "api-reference/traces/search-traces", + "api-reference/traces/create-public-trace-path", + "api-reference/traces/delete-public-trace-path" + ] + }, + { + "group": "Annotations", + "pages": [ + "api-reference/annotations/overview", + "api-reference/annotations/get-annotation", + "api-reference/annotations/get-single-annotation", + "api-reference/annotations/delete-annotation", + "api-reference/annotations/patch-annotation", + "api-reference/annotations/get-all-annotations-trace", + "api-reference/annotations/create-annotation-trace" + ] + }, + { + "group": "Datasets", + "pages": [ + "api-reference/datasets/post-dataset-entries" + ] + } + ] + }, + { + "group": "Documentation", + "pages": [ + "langevals/documentation/introduction", + "langevals/documentation/evaluators", + "langevals/documentation/unit-tests", + "langevals/documentation/API-example", + "langevals/how-to-choose-your-evaluator", + { + "group": "Modular Architecture", + "pages": ["langevals/documentation/modular-architecture/contributing"] + } + ] + }, + { + "group": "Tutorials", + "pages": [ + "langevals/tutorials/extensive-unit-testing", + "langevals/tutorials/rag-evaluation", + "langevals/tutorials/ci-cd-pipeline-evaluation" + ] + }, + { + "group": "API Endpoints", + "pages": [ + 
"langevals/api-reference/endpoint/lingua-language-detection", + "langevals/api-reference/endpoint/openai-moderation", + "langevals/api-reference/endpoint/google-cloud-dlp-pii-detection", + "langevals/api-reference/endpoint/content-safety", + "langevals/api-reference/endpoint/azure-jailbreak-detection", + "langevals/api-reference/endpoint/azure-prompt-injection-detection", + "langevals/api-reference/endpoint/competitor-detection-llm", + "langevals/api-reference/endpoint/competitor-blocklist", + "langevals/api-reference/endpoint/off-topic-detection", + "langevals/api-reference/endpoint/ragas-answer-relevancy", + "langevals/api-reference/endpoint/ragas-context-precision", + "langevals/api-reference/endpoint/ragas-context-recall", + "langevals/api-reference/endpoint/ragas-context-relevancy", + "langevals/api-reference/endpoint/ragas-context-utilization", + "langevals/api-reference/endpoint/ragas-faithfulness", + "langevals/api-reference/endpoint/haystack-faithfulness", + "langevals/api-reference/endpoint/llm-boolean-evaluator", + "langevals/api-reference/endpoint/llm-score-evaluator", + "langevals/api-reference/endpoint/llm-basic-evaluator", + "langevals/api-reference/endpoint/llm-similarity-evaluator", + "langevals/api-reference/endpoint/llama-guard" + ] + }, + { + "group": "Support", + "pages": ["support", "status"] } ], "footerSocials": { - "x": "https://x.com/mintlify", - "github": "https://github.com/mintlify", - "linkedin": "https://www.linkedin.com/company/mintsearch" + "github": "https://github.com/langwatch/langwatch", + "linkedin": "https://www.linkedin.com/company/langwatch/" + }, + "modeToggle": { + "default": "light" } -} \ No newline at end of file +} diff --git a/optimization-studio/datasets.mdx b/optimization-studio/datasets.mdx new file mode 100644 index 000000000..7ceeef1ff --- /dev/null +++ b/optimization-studio/datasets.mdx @@ -0,0 +1,84 @@ +--- +title: Datasets +--- + + + +## Understanding the Role of Datasets + +Datasets are at the core of the Optimization Studio's functionality. When working with non-deterministic systems like LLMs, running your tests across multiple examples is crucial for confidence in your results. While you might get lucky with a single successful test, running your LLM against hundreds of examples provides much more reliable validation of your solution. + +The good news is that you don't need an enormous dataset to get started. As little as 20 examples can already provide meaningful results with the DSPy optimizers, thanks to their intelligent use of LLM capabilities. + +## Creating and Managing Datasets [(0:50)](https://www.youtube.com/watch?v=BnHQkZlCJLw&t=50s) + +If you already use LangWatch for monitoring, you can import the production data generated by your LLMs as a dataset, otherwise, you can also create or import a new dataset on optimization studio directly. + +### Creating and Editing Datasets + +Access the dataset editor by double-clicking on the dataset in the node or sidebar, this provides a spreadsheet-like interface where you can: +- Add new records manually or modify existing entries +- Add or remove columns +- Make real-time changes to experiment on your workflow +- Collaborate with team members and domain experts + + +### Importing Existing Data [(1:45)](https://www.youtube.com/watch?v=BnHQkZlCJLw&t=105s) +If you already have data in CSV format, you can easily import it: +1. Use the upload CSV option +2. Configure column types and formats +3. Add additional columns as needed +4. 
Save and immediately use in your workflows + +## Dataset Configuration [(2:23)](https://www.youtube.com/watch?v=BnHQkZlCJLw&t=143s) + +### Manual Test Entry Settings +The "manual test entry" setting controls which data point is used during manual execution: +- "Random" (default): Picks a different entry each time you run +- "First Entry": Always uses the same entry for consistent testing +- This setting only affects manual testing, not full evaluations + +### Dataset Splitting [(2:52)](https://www.youtube.com/watch?v=BnHQkZlCJLw&t=172s) + +One of the most important aspects of working with datasets is how they're split for optimization and testing: + +#### Default 80-20 Split +- Optimization Set (80%): Used for training and improving your LLM pipeline +- Test Set (20%): Reserved for validation to ensure your optimizations generalize well + +You can adjust this split based on your needs: +- Use fixed numbers instead of percentages +- Modify the split ratio for different use cases +- Balance between optimization data and test data + +### Shuffle Seed Configuration [(3:51)](https://www.youtube.com/watch?v=BnHQkZlCJLw&t=231s) + +The shuffle seed is crucial for maintaining consistent, unbiased testing: +- Prevents dataset ordering bias +- Ensures consistent splitting across runs +- Can be modified to test resilience to different data arrangements +- The default 42 seed can be changed to any number for randomization + +## Evaluation Basics [(4:56)](https://www.youtube.com/watch?v=BnHQkZlCJLw&t=296s) + +While detailed evaluation is covered in later tutorials, the basic workflow involves: +1. Clicking the Evaluate button +2. Documenting changes made to your pipeline +3. Selecting which dataset partition to evaluate against +4. Adding necessary LLM API keys + +The evaluation panel provides: +- Total entries processed +- Average cost per entry +- Total runtime +- Overall experiment costs + +This foundation in dataset management sets you up for evaluating the quality and running automated optimizations, which are covered in subsequent tutorials. \ No newline at end of file diff --git a/optimization-studio/evaluating.mdx b/optimization-studio/evaluating.mdx new file mode 100644 index 000000000..47e14f645 --- /dev/null +++ b/optimization-studio/evaluating.mdx @@ -0,0 +1,79 @@ +--- +title: Evaluating +--- + + + +## The Importance of Evaluation + +Evaluators are essential tools for measuring LLM output quality. 
When you have a reliable way to measure quality, it becomes much easier to: +- Compare different LLM models +- Test prompt variations +- Validate feature additions +- Ensure quality remains consistent during upgrades + +## Types of Evaluators + +On the video, a few evaluators are introduced: + +### Exact Match Evaluator [(0:56)](https://www.youtube.com/watch?v=-sQikz38yBI&t=56s) +The simplest form of evaluation, perfect for classification tasks: +- Compares LLM output directly with expected output +- Uses straightforward string matching +- Ideal for categorical outputs where precision is crucial +- Works well when you need strict matching + +### Answer Correctness Evaluator [(4:44)](https://www.youtube.com/watch?v=-sQikz38yBI&t=264s) +Comparison with golden answers for factual accuracy: +- Uses another LLM to assess if answers are factually equivalent +- Looks beyond exact wording to evaluate semantic meaning +- Particularly useful for QA systems and knowledge-based tasks +- Can handle variations in phrasing while maintaining accuracy checking + +### LLM as Judge Evaluator [(7:01)](https://www.youtube.com/watch?v=-sQikz38yBI&t=421s) +Flexible evaluation for custom criteria: +- Allows custom prompts to define evaluation criteria +- Useful when you don't have expected outputs +- Can evaluate subjective qualities (conciseness, tone, style) +- Returns boolean (true/false) or scored (0-1) results + +## Working with Evaluators + +### Setting Up Evaluators [(1:32)](https://www.youtube.com/watch?v=-sQikz38yBI&t=92s) +To implement an evaluator: +1. Drag and drop the desired evaluator onto your workflow +2. Connect appropriate inputs (output from LLM, expected output from dataset) +3. Configure any additional parameters or criteria +4. Run evaluation on individual examples or full test sets + +### Running Evaluations [(2:28)](https://www.youtube.com/watch?v=-sQikz38yBI&t=148s) +The evaluation process: +1. Select your test dataset +2. Choose appropriate evaluator +3. Run evaluation across all test examples +4. Review accuracy scores and individual results + +### Improving Results [(9:14)](https://www.youtube.com/watch?v=-sQikz38yBI&t=554s) +After setting up evaluation: +- Make incremental changes to your workflow +- Test impact immediately through re-evaluation +- Track improvements in accuracy scores +- Iterate on prompts and parameters based on results + +## Summary + +- Choose evaluators that match your quality criteria +- Use multiple evaluators for different aspects of quality +- Start with simple evaluators before moving to complex ones +- Consider both strict and semantic matching depending on your use case +- Use evaluation results to guide optimization efforts + +The ability to properly evaluate LLM outputs sets the foundation for automated optimization, which will be covered in the next tutorial. \ No newline at end of file diff --git a/optimization-studio/llm-nodes.mdx b/optimization-studio/llm-nodes.mdx new file mode 100644 index 000000000..1f34724a9 --- /dev/null +++ b/optimization-studio/llm-nodes.mdx @@ -0,0 +1,86 @@ +--- +title: LLM Nodes +--- + + + +## Getting Started + +To begin working with LLM nodes, first create a new workflow by navigating to the workflows page and clicking "Create New Workflow." You can choose from available templates, but for learning purposes, the blank template is a good starting point. After naming your workflow, the system automatically creates three basic blocks: an entry node, an LLM call node, and an end node. 
+ +## Understanding the LLM Node [(0:34)](https://www.youtube.com/watch?v=fbNmm6qoZFw&t=34s) + +The LLM node is where the actual language model interaction happens. Each node has configurable properties accessible through the right sidebar, including: +- LLM provider selection +- LLM Instructions +- Input and output fields +- Few-shot demonstrations + +You can quickly test an LLM node by using the "Run with manual input" option, which allows you to input test queries and see immediate results. The system will show you both the cost and duration of each execution. + +## Configuring Input and Output Fields [(1:31)](https://www.youtube.com/watch?v=fbNmm6qoZFw&t=91s) + +One of the most important aspects of the LLM node is how you configure its inputs and outputs. The field names are meaningful as they're passed directly to the LLM. You can: +- Add multiple input fields (such as 'purchase' and 'amount') +- Create custom output fields for different types of responses +- Rename fields to better represent their purpose + +## Working with Datasets [(2:10)](https://www.youtube.com/watch?v=fbNmm6qoZFw&t=130s) + +LLM nodes become particularly powerful when connected to datasets. Through the entry node, you can: +- Select and load your datasets +- Map dataset fields to LLM input fields +- Test your workflow using random samples from your dataset +- Connect multiple dataset fields to provide richer context to your LLM + +## Improving Results with Instructions [(2:58)](https://www.youtube.com/watch?v=fbNmm6qoZFw&t=178s) + +To get better responses from your LLM, you can add specific instructions in the node properties. These instructions help guide the LLM's behavior and can include: +- Expected output categories +- Format specifications +- Processing guidelines +- Context information + +## Creating Complex Workflows [(4:25)](https://www.youtube.com/watch?v=fbNmm6qoZFw&t=265s) + +You're not limited to single LLM nodes. You can create sophisticated workflows by: +- Connecting multiple LLM nodes in sequence +- Passing outputs from one node as inputs to another +- Using different LLM models for different tasks +- Adjusting temperature and other parameters independently for each node + +## Monitoring and Tracking [(5:54)](https://www.youtube.com/watch?v=fbNmm6qoZFw&t=354s) + +Every LLM node execution is tracked in detail. You can: +- View the full execution trace in LangWatch trace monitoring +- Examine system prompts and user requests +- Track costs and performance metrics +- Analyze the complete message flow + +## Using Demonstrations [(6:58)](https://www.youtube.com/watch?v=fbNmm6qoZFw&t=418s) + +To improve your LLM's performance, you can provide example cases through demonstrations. In the node properties, you can: +- Add input-output pairs as examples +- Save demonstrations for reuse +- Test how different examples affect results + +## Custom LLM Providers [(7:27)](https://www.youtube.com/watch?v=fbNmm6qoZFw&t=447s) + +You're not limited to default LLM providers. You can set up custom providers by: +1. Accessing the "Configure available model" settings +2. Enabling custom settings +3. Adding your API keys +4. Configuring custom or fine-tuned models + +Appart from the main providers (OpenAI, Anthropic, Google, Groq, etc) the system also supports any OpenAI-compatible APIs, for example on your own self-hosted Llama. + + +This is just the beginning of the Optimization Studio. 
The LLM Node serves as the foundation for more advanced features like image processing, evaluation, and automatic optimization, which are covered in the next tutorials. diff --git a/optimization-studio/optimizing.mdx b/optimization-studio/optimizing.mdx new file mode 100644 index 000000000..e4476f001 --- /dev/null +++ b/optimization-studio/optimizing.mdx @@ -0,0 +1,99 @@ +--- +title: Optimizing +--- + + + +## Optimizing LLM Workflows in LangWatch + +The Optimization Studio provides the power of DSPy optimizers to improve your LLM workflow performance. Starting from a basic setup with baseline performance, you can significantly enhance results through automated optimization techniques. + +## Getting Started with Optimization + +To begin optimization: +1. Set up your basic workflow with an LLM node +2. Connect your dataset +3. Add appropriate evaluators +4. Click the "Optimize" button + +## Optimization Options [(0:47)](https://www.youtube.com/watch?v=QGUTiAX64aE&t=47s) + +The platform offers different optimization strategies such as: +- Improving prompts and demonstrations with MIPROv2 +- Prompt-only optimization with MIPROv2 +- Demonstrations optimization with BootstrapFewShotWithRandomSearch + +## Configuration Settings [(1:07)](https://www.youtube.com/watch?v=QGUTiAX64aE&t=67s) + +Key optimization parameters include: +- Number of prompts to generate +- Number of demonstrations to bootstrap +- Teacher LLM selection (can use a more powerful LLM to teach a cheaper one) +- Optimization budget and constraints + +## Monitoring Optimization Progress [(1:55)](https://www.youtube.com/watch?v=QGUTiAX64aE&t=115s) + +During optimization: +- View real-time progress in the optimization window +- Monitor score improvements +- Access detailed logs of the optimization process +- Track cost and performance metrics + +## Understanding Results [(2:14)](https://www.youtube.com/watch?v=QGUTiAX64aE&t=134s) + +The optimization process typically shows: +- Initial baseline performance +- Progressive improvements +- Final optimized results +- Detailed breakdown of changes made + +## Applying and Managing Optimizations [(2:29)](https://www.youtube.com/watch?v=QGUTiAX64aE&t=149s) + +After optimization: +- Apply optimized settings with one click +- Review new instructions and demonstrations +- Test individual examples +- Run evaluation on test set to validate improvements + +## Advanced Optimization Strategies [(4:11)](https://www.youtube.com/watch?v=QGUTiAX64aE&t=251s) + +To further improve results: +- Try different LLM models +- Add prompting techniques (like chain of thought) +- Combine multiple optimization approaches +- Experiment with different demonstration sets + +## Cost Considerations [(7:37)](https://www.youtube.com/watch?v=QGUTiAX64aE&t=457s) + +Important factors to consider: +- Optimization costs vs. inference costs +- Trade-offs between model performance and expense +- Tracking costs per call +- Balancing quality and budget requirements + +## Best Practices [(8:08)](https://www.youtube.com/watch?v=QGUTiAX64aE&t=488s) + +For optimal results: +1. Start with smaller datasets and lighter models +2. Gradually increase complexity +3. Monitor costs and performance metrics +4. Test different model combinations +5. 
Use optimization results to make informed decisions about model selection + +## Tips for Success + +- Begin with a clear baseline measurement +- Use appropriate evaluators for your use case +- Consider both quality and cost metrics +- Iterate and experiment with different approaches +- Keep track of optimization history for comparison + +The Optimization Studio provides a systematic way to improve your LLM workflows, allowing you to find the optimal balance between performance and cost for your specific use case. \ No newline at end of file diff --git a/optimization-studio/overview.mdx new file mode 100644 index 000000000..7fdad0876 --- /dev/null +++ b/optimization-studio/overview.mdx @@ -0,0 +1,20 @@ +The Optimization Studio is your laboratory to create, evaluate, and optimize your LLM workflows. Check out the introduction video below and the video tutorials on the next page to learn how to really measure the quality of your LLM applications and optimize them to extract the best performance possible. + + +
+ + + + + + + diff --git a/quickstart.mdx b/quickstart.mdx deleted file mode 100644 index d7f348678..000000000 --- a/quickstart.mdx +++ /dev/null @@ -1,86 +0,0 @@ ---- -title: 'Quickstart' -description: 'Start building awesome documentation in under 5 minutes' ---- - -## Setup your development - -Learn how to update your docs locally and and deploy them to the public. - -### Edit and preview - - - - During the onboarding process, we created a repository on your Github with - your docs content. You can find this repository on our - [dashboard](https://dashboard.mintlify.com). To clone the repository - locally, follow these - [instructions](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) - in your terminal. - - - Previewing helps you make sure your changes look as intended. We built a - command line interface to render these changes locally. 1. Install the - [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the - documentation changes locally with this command: ``` npm i -g mintlify ``` - 2. Run the following command at the root of your documentation (where - `mint.json` is): ``` mintlify dev ``` - - - -### Deploy your changes - - - - - Our Github app automatically deploys your changes to your docs site, so you - don't need to manage deployments yourself. You can find the link to install on - your [dashboard](https://dashboard.mintlify.com). Once the bot has been - successfully installed, there should be a check mark next to the commit hash - of the repo. - - - [Commit and push your changes to - Git](https://docs.github.com/en/get-started/using-git/pushing-commits-to-a-remote-repository#about-git-push) - for your changes to update in your docs site. If you push and don't see that - the Github app successfully deployed your changes, you can also manually - update your docs through our [dashboard](https://dashboard.mintlify.com). - - - - -## Update your docs - -Add content directly in your files with MDX syntax and React components. You can use any of our components, or even build your own. - - - - - Add flair to your docs with personalized branding. - - - - Implement your OpenAPI spec and enable API user interaction. - - - - Draw insights from user interactions with your documentation. - - - - Keep your docs on your own website's subdomain. - - - diff --git a/self-hosting.mdx b/self-hosting.mdx new file mode 100644 index 000000000..8ff956889 --- /dev/null +++ b/self-hosting.mdx @@ -0,0 +1,48 @@ +--- +title: Self-Hosting +--- + +LangWatch offers a fully self-hosted version of the platform for companies that require strict data control and compliance. + +LangWatch On Prem + +## Enterprise OnPrem Solution + +For organizations that require the data to never leave their own infrastructure, we offer a managed on-premise service. + +If you need to manage the installation yourself for compliance reasons, you can follow our detailed deployment instructions which we will provide after an [onboarding call](https://meetings-eu1.hubspot.com/manouk-draisma). However, granting our team temporary access to an isolated area of your infrastructure can streamline the process, ensuring a quick and efficient setup of LangWatch on your premises. +On Prem Clouds + +### Key Features + +- Self-hosted on your AWS, Google Cloud, or Azure instances +- Full control over your data +- Feature parity with the SaaS version +- Scalable to enterprise needs +- Hosted in preferred region (e.g. 
eu-central-1 for GDPR compliance) +- Installation and maintenance service +- Dedicated support agent to help you optimize your deployment +- Purchase and billing option through AWS Marketplace to facilitate procurement + +### Setup + +Please schedule a free consultation with our team to get started on the On-Prem setup: + +
+[Schedule a Call](https://get.langwatch.ai/request-a-demo) +
+ +## Local Testing Environment + +For testing purposes, we provide a docker-compose setup for a local testing environment that can be installed on your local machine. This allows you to test LangWatch with sensitive data without having to install it in your production environment. + +**Note:** This setup is not scalable for production workloads. For production deployments, please consider our [Cloud](https://app.langwatch.ai) or [Enterprise OnPrem solution](#enterprise-onprem-solution). + +### Setup + +At this moment, we are looking for feedback from our users on the docker-compose setup and want to understand your use case. +Please schedule a call with our team to get a docker-compose setup for your local machine: + +
+[Schedule a Call](https://get.langwatch.ai/request-a-demo) +
\ No newline at end of file diff --git a/snippets/evaluators-list.mdx b/snippets/evaluators-list.mdx new file mode 100644 index 000000000..064beb693 --- /dev/null +++ b/snippets/evaluators-list.mdx @@ -0,0 +1,40 @@ + + + | Evaluator | Description | + | -----------------------------------------|----------------------------| + | [Azure Jailbreak Detection](/langevals/api-reference/endpoint/azure-jailbreak-detection) | This evaluator checks for jailbreak-attempt in the input using Azure's Content Safety API. | + | [Azure Content Safety](/langevals/api-reference/endpoint/content-safety) | This evaluator detects potentially unsafe content in text, including hate speech, self-harm, sexual content, and violence. It allows customization of the severity threshold and the specific categories to check. | + | [Google Cloud DLP PII Detection](/langevals/api-reference/endpoint/google-cloud-dlp-pii-detection) | Google DLP PII detects personally identifiable information in text, including phone numbers, email addresses, and social security numbers. It allows customization of the detection threshold and the specific types of PII to check. | + | [Llama Guard](/langevals/api-reference/endpoint/llama-guard) | This evaluator is a special version of Llama trained strictly for acting as a guardrail, following customizable guidelines. It can work both as a safety evaluator and as policy enforcement. | + | [OpenAI Moderation](/langevals/api-reference/endpoint/openai-moderation) | This evaluator uses OpenAI's moderation API to detect potentially harmful content in text, including harassment, hate speech, self-harm, sexual content, and violence. | + + + | Evaluator | Description | + | -----------------------------------------|----------------------------| + | [Competitor LLM check](/langevals/api-reference/endpoint/competitor-detection-llm) | This evaluator use an LLM-as-judge to check if the conversation is related to competitors, without having to name them explicitly | + | [Off Topic Evaluator](/langevals/api-reference/endpoint/off-topic-detection) | This evaluator checks if the user message is concerning one of the allowed topics of the chatbot | + | [Competitor Blocklist](/langevals/api-reference/endpoint/competitor-blocklist) | This evaluator checks if any of the specified competitors was mentioned | + | Product Sentiment Polarity | For messages about products, this evaluator checks for the nuanced sentiment direction of the LLM output, either very positive, subtly positive, subtly negative, or very negative. | + + + | Evaluator | Description | + | -----------------------------------------|----------------------------| + | [Lingua Language Detection](/langevals/api-reference/endpoint/lingua-language-detection) | This evaluator detects the language of the input and output text to check for example if the generated answer is in the same language as the prompt, or if it's in a specific expected language. | + | Query Resolution | This evaluator checks if all the user queries in the conversation were resolved. Useful to detect when the bot doesn't know how to answer or can't help the user. | + | [Ragas Context Recall](/langevals/api-reference/endpoint/ragas-context-recall) | This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance. 
| + | [Ragas Faithfulness](/langevals/api-reference/endpoint/ragas-faithfulness) | This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context. | + | [Ragas Context Utilization](/langevals/api-reference/endpoint/ragas-context-utilization) | This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization. | + | [Ragas Context Relevancy](/langevals/api-reference/endpoint/ragas-context-relevancy) | This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy. | + | [Ragas Context Precision](/langevals/api-reference/endpoint/ragas-context-precision) | This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision. | + | [Ragas Answer Relevancy](/langevals/api-reference/endpoint/ragas-answer-relevancy) | This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy. | + + + + | Evaluator | Description | + | -----------------------------------------|----------------------------| + | [Semantic Similarity Evaluator](/langevals/api-reference/endpoint/llm-similarity-evaluator) | Allows you to check for semantic similarity or dissimilarity between input and output and a target value, so you can avoid sentences that you don't want to be present without having to match on the exact text. | + | [Custom Basic Evaluator](/langevals/api-reference/endpoint/llm-basic-evaluator) | Allows you to check for simple text matches or regex evaluation. | + | [Custom LLM Boolean Evaluator](/langevals/api-reference/endpoint/llm-boolean-evaluator) | Use an LLM as a judge with a custom prompt to do a true/false boolean evaluation of the message. | + | [Custom LLM Score Evaluator](/langevals/api-reference/endpoint/llm-score-evaluator) | Use an LLM as a judge with custom prompt to do a numeric score evaluation of the message. | + + \ No newline at end of file diff --git a/snippets/llms-txt-protip.mdx b/snippets/llms-txt-protip.mdx new file mode 100644 index 000000000..4dd3a31d1 --- /dev/null +++ b/snippets/llms-txt-protip.mdx @@ -0,0 +1 @@ +Protip: wanna to get started even faster? Copy our llms.txt and ask an AI to do this integration \ No newline at end of file diff --git a/snippets/metadata.mdx b/snippets/metadata.mdx new file mode 100644 index 000000000..8e50f45a4 --- /dev/null +++ b/snippets/metadata.mdx @@ -0,0 +1,4 @@ +It's optional but highly recommended to pass the `user_id` on +the metadata if you want to leverage user-specific analytics and the +`thread_id` to group related traces together. To connect it to +an event later on. 
Read more about those and other concepts [here](../concepts) diff --git a/snippets/openinference-metadata.mdx b/snippets/openinference-metadata.mdx new file mode 100644 index 000000000..34dca77fe --- /dev/null +++ b/snippets/openinference-metadata.mdx @@ -0,0 +1,16 @@ +## Capturing Metadata + +You can use OpenInference's `using_attributes` context manager to capture additional information for your LLM calls, such as the user_id, session_id (equivalent to thread id), tags and metadata: + +```python +from openinference.instrumentation import using_attributes + +def main(): + with using_attributes( + session_id="my-test-session", + user_id="my-test-user", + tags=["tag-1", "tag-2"], + metadata={"foo": "bar"}, + ): + # Your LLM call +``` \ No newline at end of file diff --git a/snippets/prerequests-ts.mdx b/snippets/prerequests-ts.mdx new file mode 100644 index 000000000..56ab7d57f --- /dev/null +++ b/snippets/prerequests-ts.mdx @@ -0,0 +1,30 @@ +#### Prerequisites + +- Obtain your `LANGWATCH_API_KEY` from the [LangWatch dashboard](https://app.langwatch.com/). + +#### Installation + +```sh +npm install langwatch +``` + +#### Configuration + +Ensure `LANGWATCH_API_KEY` is set: + + + +```bash .env +LANGWATCH_API_KEY='your_api_key_here' +``` + + +```typescript +import { LangWatch } from 'langwatch'; + +const langwatch = new LangWatch({ + apiKey: 'your_api_key_here', +}); +``` + + diff --git a/snippets/prerequests.mdx b/snippets/prerequests.mdx new file mode 100644 index 000000000..5874eb6e8 --- /dev/null +++ b/snippets/prerequests.mdx @@ -0,0 +1,42 @@ +#### Prerequisites + +- Obtain your `LANGWATCH_API_KEY` from the [LangWatch dashboard](https://app.langwatch.com/). + +#### Installation + +```sh +pip install langwatch +``` + +#### Configuration + +Ensure `LANGWATCH_API_KEY` is set: + + + +```bash +export LANGWATCH_API_KEY='your_api_key_here' +``` + + +You can set `LANGWATCH_API_KEY` globally at runtime: + +```python +import langwatch +import os + +langwatch.api_key = os.getenv("LANGWATCH_API_KEY") +``` + +Or on the specific trace being tracked: + +```python +import langwatch +import os + +@langwatch.trace(api_key=os.getenv("LANGWATCH_API_KEY")) +def main(): + ... +``` + + diff --git a/snippets/python-custom-evaluation.mdx b/snippets/python-custom-evaluation.mdx new file mode 100644 index 000000000..8e3753f25 --- /dev/null +++ b/snippets/python-custom-evaluation.mdx @@ -0,0 +1,17 @@ +```python +import langwatch + +@langwatch.span(type="evaluation") +def evaluation_step(): + ... # your custom evaluation logic + + langwatch.get_current_span().add_evaluation( + name="custom evaluation", # required + passed=True, + score=0.5, + label="category_detected", + details="explanation of the evaluation results", + ) +``` + +The evaluation `name` is required and must be a string. The other fields are optional, but at least one of `passed`, `score` or `label` must be provided. \ No newline at end of file diff --git a/snippets/python-langchain-rag.mdx b/snippets/python-langchain-rag.mdx new file mode 100644 index 000000000..a42bfc4be --- /dev/null +++ b/snippets/python-langchain-rag.mdx @@ -0,0 +1,81 @@ +When using LangChain, generally your RAG happens by calling a [`Retriever`](https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/). + +We provide a utility `langwatch.langchain.capture_rag_from_retriever` to capture the documents found by the retriever and convert it into a LangWatch compatible format for tracking. 
For that you need to pass the retriever as first argument, and then a function to map each document to a `RAGChunk`, like in the example below: + +```python +import langwatch +from langwatch.types import RAGChunk + +@langwatch.trace() +def main(): + retriever = ... + retriever_tool = create_retriever_tool( + langwatch.langchain.capture_rag_from_retriever( + retriever, + lambda document: RAGChunk( + document_id=document.metadata["source"], + content=document.page_content + ), + ), + "langwatch_search", + "Search for information about LangWatch. For any questions about LangWatch, use this tool if you didn't already", + ) + + tools = [retriever_tool] + model = ChatOpenAI(streaming=True) + prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are a helpful assistant that only reply in short tweet-like responses, using lots of emojis and use tools only once.\n\n{agent_scratchpad}", + ), + ("human", "{question}"), + ] + ) + agent = create_tool_calling_agent(model, tools, prompt) + executor = AgentExecutor(agent=agent, tools=tools, verbose=True) + return executor.invoke(user_input, config=RunnableConfig( + callbacks=[langwatch.get_current_trace().get_langchain_callback()] + )) +``` + +Alternatively, if you don't use retrievers, but still want to capture the context for example from a tool call that you do, we also provide a utility `langwatch.langchain.capture_rag_from_tool` to capture RAG contexts around a tool. For that you need to pass the tool as first argument, and then a function to map the tool's output to `RAGChunk`s, like in the example below: + +```python +import langwatch +from langwatch.types import RAGChunk + +@langwatch.trace() +def main(): + my_custom_tool = ... + wrapped_tool = langwatch.langchain.capture_rag_from_tool( + my_custom_tool, lambda response: [ + RAGChunk( + document_id=response["id"], # optional + chunk_id=response["chunk_id"], # optional + content=response["content"] + ) + ] + ) + + tools = [wrapped_tool] # use the new wrapped tool in your agent instead of the original one + model = ChatOpenAI(streaming=True) + prompt = ChatPromptTemplate.from_messages( + [ + ( + "system", + "You are a helpful assistant that only reply in short tweet-like responses, using lots of emojis and use tools only once.\n\n{agent_scratchpad}", + ), + ("human", "{question}"), + ] + ) + agent = create_tool_calling_agent(model, tools, prompt) + executor = AgentExecutor(agent=agent, tools=tools, verbose=True) + return executor.invoke(user_input, config=RunnableConfig( + callbacks=[langWatchCallback] + )) +``` + +Then you'll be able to see the captured contexts that will also be used later on for evaluatios on LangWatch dashboard: + +![RAG Spans](/images/integration/langchain-rag.png) \ No newline at end of file diff --git a/snippets/python-rag-span.mdx b/snippets/python-rag-span.mdx new file mode 100644 index 000000000..fb977cedc --- /dev/null +++ b/snippets/python-rag-span.mdx @@ -0,0 +1,50 @@ +To capture a RAG span, you can use the `@langwatch.span(type="rag")` decorator, along with a call to `.update()` to add the `contexts` to the span: + +```python +@langwatch.span(type="rag") +def rag_retrieval(): + # the documents you retrieved from your vector database + search_results = ["France is a country in Europe.", "Paris is the capital of France."] + + # capture them on the span contexts before returning + langwatch.get_current_span().update(contexts=search_results) + + return search_results +``` + +If you have document or chunk ids from the results, we recommend you can to capture 
them along with the id using `RAGChunk`, as this allows them to be grouped together and generate documents analytics on LangWatch dashboard: + +```python +from langwatch.types import RAGChunk + +@langwatch.span(type="rag") +def rag_retrieval(): + # the documents you retrieved from your vector database + search_results = [ + { + "id": "doc-1", + "content": "France is a country in Europe.", + }, + { + "id": "doc-2", + "content": "Paris is the capital of France.", + }, + ] + + # capture then on the span contexts with RAGChunk before returning + langwatch.get_current_span().update( + contexts=[ + RAGChunk( + document_id=document["id"], + content=document["content"], + ) + for document in search_results + ] + ) + + return search_results +``` + +Then you'll be able to see the captured contexts that will also be used later on for evaluatios on LangWatch dashboard: + +![RAG Spans](/images/integration/rag.png) \ No newline at end of file diff --git a/snippets/typescript-custom-evaluation.mdx b/snippets/typescript-custom-evaluation.mdx new file mode 100644 index 000000000..abf2d28c3 --- /dev/null +++ b/snippets/typescript-custom-evaluation.mdx @@ -0,0 +1,19 @@ +```typescript +import { type LangWatchTrace } from "langwatch"; + +async function llmStep({ message, trace }: { message: string, trace: LangWatchTrace }): Promise { + const span = trace.startLLMSpan({ name: "llmStep" }); + + // ... your existing code + + span.addEvaluation({ + name: "custom evaluation", + passed: true, + score: 0.5, + label: "category_detected", + details: "explanation of the evaluation results", + }); +} +``` + +The evaluation `name` is required and must be a string. The other fields are optional, but at least one of `passed`, `score` or `label` must be provided. \ No newline at end of file diff --git a/snippets/typescript-rag.mdx b/snippets/typescript-rag.mdx new file mode 100644 index 000000000..f22218ffb --- /dev/null +++ b/snippets/typescript-rag.mdx @@ -0,0 +1,31 @@ +To capture a RAG, you can simply start a RAG span inside the trace, giving it the input query being used: + +```typescript +const ragSpan = trace.startRAGSpan({ + name: "my-vectordb-retrieval", // optional + input: { type: "text", value: "search query" }, +}); + +// proceed to do the retrieval normally +``` + +Then, after doing the retrieval, you can end the RAG span with the contexts that were retrieved and will be used by the LLM: + +```typescript +ragSpan.end({ + contexts: [ + { + documentId: "doc1", + content: "document chunk 1", + }, + { + documentId: "doc2", + content: "document chunk 2", + }, + ], +}); +``` + + +On LangChain.js, RAG spans are captured automatically by the LangWatch callback when using LangChain Retrievers, with `source` as the documentId. + \ No newline at end of file diff --git a/status.mdx b/status.mdx new file mode 100644 index 000000000..97201bfb7 --- /dev/null +++ b/status.mdx @@ -0,0 +1,5 @@ +--- +title: Status Page +--- + +Visit the [status page](https://status.langwatch.ai) to see the current status of LangWatch and any ongoing maintenance or incidents. 
\ No newline at end of file diff --git a/style.css b/style.css new file mode 100644 index 000000000..0150bb052 --- /dev/null +++ b/style.css @@ -0,0 +1,7 @@ +.tabs .overflow-x-auto { + overflow-x: visible !important; +} + +a[href="https://github.com/langwatch/langwatch"] .text-sm { + white-space: nowrap; +} diff --git a/support.mdx b/support.mdx new file mode 100644 index 000000000..5e49d12dc --- /dev/null +++ b/support.mdx @@ -0,0 +1,66 @@ +--- +title: Troubleshooting and Support +--- + +While using LangWatch, you may encounter issues or have questions that need further assistance. This page outlines the steps you can take to troubleshoot problems and how to reach out for support when you need it. + +## Troubleshooting Steps + +Before reaching out for support, here are a few steps you can take to resolve common issues: + +1. **Check Environment Variables**: Ensure that your `LANGWATCH_API_KEY` is correctly set in your environment. This is a common issue that can prevent LangWatch from functioning properly. + +2. **Review Integration Code**: Go through your integration code to ensure that you've followed the steps outlined in the [Getting Started](#) documentation. Sometimes, issues arise from a missed step or incorrect implementation. + +3. **Consult the Documentation**: Look through the documentation for sections relevant to the issue you're facing. There might be specific notes or sections that address your problem. + +4. **Update LangWatch**: Ensure that you are using the latest version of the `langwatch` library. Outdated versions may contain bugs that have been resolved in newer releases. + + You can update the library using pip: + + ```sh + pip install --upgrade langwatch + ``` + + Or if you are using typescript: + + ```sh + npm install langwatch@latest + ``` + +5. **Examine Error Messages**: If there are error messages being returned from LangWatch or your LLM provider, inspect them closely as they often provide clues to the root cause. + +## Opening an Issue + +If the troubleshooting steps do not resolve your issue, you can open an issue on our GitHub repository: + +[https://github.com/langwatch/langwatch/issues](https://github.com/langwatch/langwatch/issues) + +When opening an issue, please include the following: + +- A clear and descriptive title. +- A detailed description of the issue. Include any error messages you're seeing, and steps to reproduce the problem if possible. +- The version of the `langwatch` library you're using. +- Any relevant snippets of your code (avoid sharing your API keys or sensitive information). + +Our community and maintainers will look into the issue and work with you to find a solution. + +## Email Support + +If you are prefer direct assistance or have inquiries that require privacy, you can reach out to our support team via email at support@langwatch.ai. + +When writing to support, please provide: + +- A summary of the issue or your question. +- Details about your environment, such as the LLM provider and language you're using. +- Any relevant logs or error messages (with sensitive information redacted). + +Our support team aims to respond to inquiries as quickly as possible, typically within one business day. + +## Discord Channel + +You can also join our [Discord](https://discord.gg/kT4PhDS2gH) channel and ask questions directly for the community and the core team. + +## We're Here to Help + +At LangWatch, we're committed to ensuring that your experience is as smooth and beneficial as possible. 
Whether it's a technical hiccup or a general question, our team is here to support your journey in harnessing the full potential of LLMs with our platform. diff --git a/user-events/custom.mdx b/user-events/custom.mdx new file mode 100644 index 000000000..3527208e9 --- /dev/null +++ b/user-events/custom.mdx @@ -0,0 +1,59 @@ +--- +title: Custom Events +--- + +Apart from the reserved pre-defined events, you can also define your own events relevant to your business, which are captured and correlated with your LLM messages and threads to measure your product performance. + +Custom events allow you to track any user interactions with your LLM application by sending numeric metrics and capturing additional details about the event. You can define any name for the event on the `event_type` field, and any metric names you want on `metrics` with numeric values, plus any extra details you want to capture on `event_details` with string values; just keep them consistent so they can be visualized on the dashboard, where you can customize the display later on. + +## REST API Specification + +### Endpoint + +`POST /api/track_event` + +### Headers + +- `X-Auth-Token`: Your LangWatch API key. + +### Request Body + +```javascript +{ + "trace_id": "id of the message the event occurred on", + "event_type": "your_custom_event_type", + "metrics": { + "your_metric_key": 123 // Any numeric metric + }, + "event_details": { + "your_detail_key": "Any string detail" + }, + "timestamp": 1617981376000 // Unix timestamp in milliseconds +} +``` + +### Example + +```bash +curl -X POST "https://app.langwatch.ai/api/track_event" \\ + -H "X-Auth-Token: your_api_key" \\ + -H "Content-Type: application/json" \\ + -d '{ + "trace_id": "trace_Yy0XWu6BOwwnrkLtQh9Ji", + "event_type": "add_to_cart", + "metrics": { + "amount": 17.5 + }, + "event_details": { + "product_id": "sku_123", + "referral_source": "bot_suggested" + }, + "timestamp": 1617981376000 + }' +``` + +You can send any event type with corresponding numeric metrics and string details. This flexibility allows you to tailor event tracking to your specific needs. + +On the dashboard, you can visualize the tracked events on the "Events" tab when opening the trace details. + +Custom Events details table \ No newline at end of file diff --git a/user-events/overview.mdx b/user-events/overview.mdx new file mode 100644 index 000000000..cfd09c9d3 --- /dev/null +++ b/user-events/overview.mdx @@ -0,0 +1,8 @@ +Learn how to track user interactions with your LLM applications using the LangWatch REST API. This section provides detailed guides for predefined events such as thumbs up/down, text selection, and waiting times, as well as instructions for custom event tracking. + + + + + + + diff --git a/user-events/selected-text.mdx b/user-events/selected-text.mdx new file mode 100644 index 000000000..f89154457 --- /dev/null +++ b/user-events/selected-text.mdx @@ -0,0 +1,52 @@ +--- +title: Selected Text Events +--- + +Selected text events track when a user selects text generated by your LLM application, indicating the response was useful enough to be copied and used elsewhere. + +## REST API Specification + +### Endpoint + +`POST /api/track_event` + +### Headers + +- `X-Auth-Token`: Your LangWatch API key.
+ +### Request Body + +```javascript +{ + "trace_id": "id of the message the user selected", + "event_type": "selected_text", + "metrics": { + "text_length": 120 // Length of the selected text in characters + }, + "event_details": { + "selected_text": "The selected text content" + }, + "timestamp": 1617981376000 // Unix timestamp in milliseconds +} +``` + +### Example + +```bash +curl -X POST "https://app.langwatch.ai/api/track_event" \\ + -H "X-Auth-Token: your_api_key" \\ + -H "Content-Type: application/json" \\ + -d '{ + "trace_id": "trace_Yy0XWu6BOwwnrkLtQh9Ji", + "event_type": "selected_text", + "metrics": { + "text_length": 120 + }, + "event_details": { + "selected_text": "The capital of France is Paris." + }, + "timestamp": 1617981376000 + }' +``` + +The `text_length` metric is mandatory and should reflect the length of the selected text. The `selected_text` field in `event_details` is optional and can be used if you also want to capture the actual text that was selected by the user. diff --git a/user-events/thumbs-up-down.mdx b/user-events/thumbs-up-down.mdx new file mode 100644 index 000000000..ddd25a8aa --- /dev/null +++ b/user-events/thumbs-up-down.mdx @@ -0,0 +1,56 @@ +--- +title: Thumbs Up/Down +--- + +Thumbs up/down events are used to capture user feedback on specific messages or interactions with your chatbot or LLM application, along with optional textual feedback. + +You can use these user-provided inputs in combination with the automatic sentiment analysis provided by LangWatch to gauge how satisfied your users are with the generated responses, and use this information to get insights, debug, iterate and improve your product. + +To use the `thumbs_up_down` event, it's important that you have an explicit `trace_id` defined on your side when doing the integration. Read more about it on [concepts](../concepts). + +## REST API Specification + +### Endpoint + +`POST /api/track_event` + +### Headers + +- `X-Auth-Token`: Your LangWatch API key. + +### Request Body + +```javascript +{ + "trace_id": "id of the message the user gave the feedback on", + "event_type": "thumbs_up_down", + "metrics": { + "vote": 1 // Use 1 for thumbs up, 0 for neutral or undo feedback, and -1 for thumbs down + }, + "event_details": { + "feedback": "Optional user feedback text" + }, + "timestamp": 1617981376000 // Unix timestamp in milliseconds +} +``` + +### Example + +```bash +curl -X POST "https://app.langwatch.ai/api/track_event" \\ + -H "X-Auth-Token: your_api_key" \\ + -H "Content-Type: application/json" \\ + -d '{ + "trace_id": "trace_Yy0XWu6BOwwnrkLtQh9Ji", + "event_type": "thumbs_up_down", + "metrics": { + "vote": 1 + }, + "event_details": { + "feedback": "This response was helpful!" + }, + "timestamp": 1617981376000 + }' +``` + +The `vote` metric is mandatory and must be `1`, `0` or `-1`, as described above. The `feedback` field in `event_details` is optional and can be used to provide additional context or comments from the user. diff --git a/user-events/waited-to-finish.mdx b/user-events/waited-to-finish.mdx new file mode 100644 index 000000000..171aafca8 --- /dev/null +++ b/user-events/waited-to-finish.mdx @@ -0,0 +1,58 @@ +--- +title: Waited To Finish Events +--- + +Waited to finish events are used to determine whether users wait for the LLM application to finish generating a response or leave before it's completed; this is useful for capturing user impatience with the response generation.
+ +Since the user can simply close the window, to be able to track this we need to send two requests: the first with `finished` set to `0`, to indicate that the output has started, and another with `finished` set to `1` when the output finishes rendering on the client side. If `"finished": 1` is never received, we assume the user didn't let the AI finish. + +## REST API Specification + +### Endpoint + +`POST /api/track_event` + +### Headers + +- `X-Auth-Token`: Your LangWatch API key. + +### Request Body + +```javascript +{ + "trace_id": "id of the message the user is waiting for", + "event_type": "waited_to_finish", + "metrics": { + "finished": 0 // Send 0 on the first request, then 1 after the message finishes rendering + }, + "timestamp": 1617981376000 // Unix timestamp in milliseconds +} +``` + +### Example + +```bash +curl -X POST "https://app.langwatch.ai/api/track_event" \\ + -H "X-Auth-Token: your_api_key" \\ + -H "Content-Type: application/json" \\ + -d '{ + "trace_id": "trace_Yy0XWu6BOwwnrkLtQh9Ji", + "event_type": "waited_to_finish", + "metrics": { + "finished": 0 + }, + "timestamp": 1617981376000 + }' + +curl -X POST "https://app.langwatch.ai/api/track_event" \\ + -H "X-Auth-Token: your_api_key" \\ + -H "Content-Type: application/json" \\ + -d '{ + "trace_id": "trace_Yy0XWu6BOwwnrkLtQh9Ji", + "event_type": "waited_to_finish", + "metrics": { + "finished": 1 + }, + "timestamp": 1617981378000 + }' +```
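For client-side integrations, the same two-request flow can be expressed with plain `fetch` calls against the `POST /api/track_event` endpoint documented above. The sketch below is a minimal illustration, assuming a runtime where `fetch` is available (browsers or Node.js 18+); the `trackWaitedToFinish` helper name is a hypothetical example, not part of the LangWatch SDK.

```typescript
// Hypothetical helper: posts a waited_to_finish event to the track_event endpoint.
// Not part of the LangWatch SDK; shown only to illustrate the two-request flow.
async function trackWaitedToFinish(
  apiKey: string,
  traceId: string,
  finished: 0 | 1
): Promise<void> {
  await fetch("https://app.langwatch.ai/api/track_event", {
    method: "POST",
    headers: {
      "X-Auth-Token": apiKey, // your LangWatch API key
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      trace_id: traceId,
      event_type: "waited_to_finish",
      metrics: { finished },
      timestamp: Date.now(), // Unix timestamp in milliseconds
    }),
  });
}

// Usage: send 0 as soon as the response starts rendering...
// await trackWaitedToFinish("your_api_key", "trace_Yy0XWu6BOwwnrkLtQh9Ji", 0);
// ...and 1 once it has fully finished on the client.
// await trackWaitedToFinish("your_api_key", "trace_Yy0XWu6BOwwnrkLtQh9Ji", 1);
```

If the second call never fires because the user closed the window, only `"finished": 0` is received and the message is counted as not waited for.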