From 9a7fda01c2fd46b2538b78d5b588c5ae0c33053d Mon Sep 17 00:00:00 2001
From: Kevin Mas Ruiz <kevin.mas@hey.com>
Date: Fri, 24 Oct 2025 15:00:51 +0200
Subject: [PATCH 1/3] chore: add accuracy tests for filtering and improve
 prompt descriptions

---
 src/tools/mongodb/read/aggregate.ts |  22 ++-
 tests/accuracy/aggregate.test.ts    | 255 +++++++++++++++++++++++++---
 tests/accuracy/sdk/matcher.ts       |  46 +++++
 3 files changed, 293 insertions(+), 30 deletions(-)
diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts
index a5ff12381..d557d9d5f 100644
--- a/src/tools/mongodb/read/aggregate.ts
+++ b/src/tools/mongodb/read/aggregate.ts
@@ -47,7 +47,7 @@ const VectorSearchStage = z.object({
             filter: zEJSON()
                 .optional()
                 .describe(
-                    "MQL filter that can only use pre-filter fields from the index definition. Note to LLM: If unsure, use the `collection-indexes` tool to learn which fields can be used for pre-filtering."
+                    "MQL filter that can only use filter fields from the index definition. Note to LLM: If unsure, use the `collection-indexes` tool to learn which fields can be used for filtering."
                 ),
             embeddingParameters: zSupportedEmbeddingParameters
                 .optional()
@@ -59,11 +59,21 @@ const VectorSearchStage = z.object({
 });
 
 export const AggregateArgs = {
-    pipeline: z
-        .array(z.union([AnyStage, VectorSearchStage]))
-        .describe(
-            "An array of aggregation stages to execute. $vectorSearch can only appear as the first stage of the aggregation pipeline or as the first stage of a $unionWith subpipeline. When using $vectorSearch, unless the user explicitly asks for the embeddings, $unset any embedding field to avoid reaching context limits."
-        ),
+    pipeline: z.array(z.union([AnyStage, VectorSearchStage])).describe(
+        `An array of aggregation stages to execute.  
+\`$vectorSearch\` **MUST** be the first stage of the pipeline, or the first stage of a \`$unionWith\` subpipeline.
+### Usage Rules for \`$vectorSearch\`
+- **Unset embeddings:**  
+  Unless the user explicitly requests the embeddings, add an \`$unset\` stage **at the end of the pipeline** to remove the embedding field and avoid context limits. **The $unset stage in this situation is mandatory**.
+- **Prefiltering:**
+If the user requests additional filtering, include filters in \`$vectorSearch.filter\` only for pre-filter fields in the vector index.
+    NEVER include fields in $vectorSearch.filter that are not part of the vector index.
+- **Post-filtering:**
+    For all remaining filters, add a $match stage after $vectorSearch.
+### Note to LLM
+- If unsure which fields are filterable, use the collection-indexes tool to determine valid prefilter fields.
+- If no requested filters are valid prefilters, omit the filter key from $vectorSearch.`
+    ),
     responseBytesLimit: z.number().optional().default(ONE_MB).describe(`\
 The maximum number of bytes to return in the response. This value is capped by the server's configured maxBytesPerQuery and cannot be exceeded. \
 Note to LLM: If the entire aggregation result is required, use the "export" tool instead of increasing this limit.\
diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts
index 85340a331..36bd7f9c1 100644
--- a/tests/accuracy/aggregate.test.ts
+++ b/tests/accuracy/aggregate.test.ts
@@ -2,6 +2,24 @@ import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js";
 import { Matcher } from "./sdk/matcher.js";
 import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
 
+function doesUnset(field: string): Matcher {
+    return Matcher.anyOf(
+        // { $unset: "<field>" } || { $unset: ["<field>"] }
+        Matcher.value({ $unset: Matcher.arrayOrSingle(Matcher.value(field)) }),
+        // { $unset: { "<field>": "" } }
+        Matcher.value({ $unset: { [field]: "" } })
+    );
+}
+
+const embeddingParameters = {
+    model: "voyage-3-large",
+    outputDimension: Matcher.anyOf(
+        Matcher.undefined,
+        Matcher.number((n) => n === 1024)
+    ),
+    outputDType: Matcher.anyOf(Matcher.undefined, Matcher.value("float")),
+};
+
 describeAccuracyTests([
     {
         prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them",
@@ -48,16 +66,71 @@ describeAccuracyTests([
                                 index: "titles",
                                 path: "title_embeddings",
                                 queryVector: "hammer of justice",
-                                embeddingParameters: {
-                                    model: "voyage-3-large",
-                                    outputDimension: Matcher.anyOf(
-                                        Matcher.undefined,
-                                        Matcher.number((n) => n === 1024)
-                                    ),
+                                embeddingParameters,
+                                filter: Matcher.emptyObjectOrUndefined,
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                            },
+                        },
+                        doesUnset("title_embeddings"),
+                    ],
+                    responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                },
+            },
+        ],
+        mockedTools: {
+            "collection-indexes": (): CallToolResult => {
+                return {
+                    content: [
+                        {
+                            type: "text",
+                            text: JSON.stringify({
+                                name: "titles",
+                                type: "vectorSearch",
+                                status: "READY",
+                                queryable: true,
+                                latestDefinition: {
+                                    type: "vector",
+                                    path: "title_embeddings",
+                                    numDimensions: 1024,
+                                    quantization: "none",
+                                    similarity: "euclidean",
                                 },
+                            }),
+                        },
+                    ],
+                };
+            },
+        },
+    },
+    {
+        prompt: "Run a vectorSearch query on musicfy.songs on path 'title_embeddings' using the index 'titles' with the model voyage-3-large to find all 'hammer of justice' songs. Keep the embedding field, do not remove it.",
+        expectedToolCalls: [
+            {
+                toolName: "collection-indexes",
+                parameters: {
+                    database: "musicfy",
+                    collection: "songs",
+                },
+                optional: true,
+            },
+            {
+                toolName: "aggregate",
+                parameters: {
+                    database: "musicfy",
+                    collection: "songs",
+                    pipeline: [
+                        {
+                            $vectorSearch: {
+                                exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
+                                index: "titles",
+                                path: "title_embeddings",
+                                queryVector: "hammer of justice",
+                                embeddingParameters,
                                 filter: Matcher.emptyObjectOrUndefined,
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                             },
                         },
+                        Matcher.not(doesUnset("title_embeddings")),
                     ],
                     responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                 },
@@ -107,21 +180,16 @@ describeAccuracyTests([
                     pipeline: [
                         {
                             $vectorSearch: {
-                                exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(true)),
+                                exact: true,
                                 index: "titles",
                                 path: "title_embeddings",
                                 queryVector: "hammer of justice",
-                                limit: 10,
-                                embeddingParameters: {
-                                    model: "voyage-3-large",
-                                    outputDimension: Matcher.anyOf(
-                                        Matcher.undefined,
-                                        Matcher.number((n) => n === 1024)
-                                    ),
-                                },
+                                embeddingParameters,
                                 filter: Matcher.emptyObjectOrUndefined,
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                             },
                         },
+                        doesUnset("title_embeddings"),
                     ],
                     responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                 },
@@ -153,7 +221,7 @@ describeAccuracyTests([
         },
     },
     {
-        prompt: "Run an approximate vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fy' movies.",
+        prompt: "Run an approximate vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies.",
         expectedToolCalls: [
             {
                 toolName: "collection-indexes",
@@ -173,17 +241,13 @@ describeAccuracyTests([
                                 exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
                                 index: "my-index",
                                 path: "plot_embeddings",
-                                queryVector: "sci-fy",
-                                embeddingParameters: {
-                                    model: "voyage-3-large",
-                                    outputDimension: Matcher.anyOf(
-                                        Matcher.undefined,
-                                        Matcher.number((n) => n === 1024)
-                                    ),
-                                },
+                                queryVector: "sci-fi",
+                                embeddingParameters,
                                 filter: Matcher.emptyObjectOrUndefined,
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                             },
                         },
+                        doesUnset("plot_embeddings"),
                     ],
                     responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
                 },
@@ -214,4 +278,147 @@ describeAccuracyTests([
             },
         },
     },
+    {
+        prompt: "(Pre-filter) Run a vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies. I only want movies with the `released` after 1993 (included) and are published in catalan.",
+        expectedToolCalls: [
+            {
+                toolName: "collection-indexes",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                },
+            },
+            {
+                toolName: "aggregate",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    pipeline: [
+                        {
+                            $vectorSearch: {
+                                exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
+                                index: "my-index",
+                                path: "plot_embeddings",
+                                queryVector: "sci-fi",
+                                numCandidates: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                                embeddingParameters,
+                                filter: {
+                                    released: { $gte: 1993 },
+                                    language: Matcher.caseInsensitiveString("catalan"),
+                                },
+                            },
+                        },
+                        doesUnset("plot_embeddings"),
+                    ],
+                    responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                },
+            },
+        ],
+        mockedTools: {
+            "collection-indexes": (): CallToolResult => {
+                return {
+                    content: [
+                        {
+                            type: "text",
+                            text: JSON.stringify({
+                                name: "my-index",
+                                type: "vectorSearch",
+                                status: "READY",
+                                queryable: true,
+                                latestDefinition: {
+                                    fields: [
+                                        {
+                                            type: "vector",
+                                            path: "plot_embeddings",
+                                            numDimensions: 1024,
+                                            quantization: "none",
+                                            similarity: "euclidean",
+                                        },
+                                        {
+                                            type: "filter",
+                                            path: "language",
+                                        },
+                                        {
+                                            type: "filter",
+                                            path: "released",
+                                        },
+                                    ],
+                                },
+                            }),
+                        },
+                    ],
+                };
+            },
+        },
+    },
+    {
+        prompt: "(No-prefilter) Run a vectorSearch query on mflix.movies on path 'plot_embeddings' with the model voyage-3-large to find all 'sci-fi' movies. I only want movies with `released` after 1993 (included) and are published in catalan.",
+        expectedToolCalls: [
+            {
+                toolName: "collection-indexes",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                },
+            },
+            {
+                toolName: "aggregate",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    pipeline: [
+                        {
+                            $vectorSearch: {
+                                exact: Matcher.anyOf(Matcher.undefined, Matcher.boolean(false)),
+                                index: "my-index",
+                                path: "plot_embeddings",
+                                queryVector: "sci-fi",
+                                numCandidates: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                                limit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                                embeddingParameters,
+                                filter: Matcher.emptyObjectOrUndefined,
+                            },
+                        },
+                        {
+                            $match: {
+                                released: { $gte: 1993 },
+                                language: Matcher.caseInsensitiveString("catalan"),
+                            },
+                        },
+                        doesUnset("plot_embeddings"),
+                    ],
+                    responseBytesLimit: Matcher.anyOf(Matcher.number(), Matcher.undefined),
+                },
+            },
+        ],
+        mockedTools: {
+            "collection-indexes": (): CallToolResult => {
+                return {
+                    content: [
+                        {
+                            type: "text",
+                            text: JSON.stringify({
+                                name: "my-index",
+                                type: "vectorSearch",
+                                status: "READY",
+                                queryable: true,
+                                latestDefinition: {
+                                    fields: [
+                                        {
+                                            type: "vector",
+                                            path: "plot_embeddings",
+                                            numDimensions: 1024,
+                                            quantization: "none",
+                                            similarity: "euclidean",
+                                        },
+                                    ],
+                                },
+                            }),
+                        },
+                    ],
+                };
+            },
+        },
+    },
 ]);
diff --git a/tests/accuracy/sdk/matcher.ts b/tests/accuracy/sdk/matcher.ts
index 535b03a57..d25836814 100644
--- a/tests/accuracy/sdk/matcher.ts
+++ b/tests/accuracy/sdk/matcher.ts
@@ -32,6 +32,18 @@ export abstract class Matcher {
         return new StringMatcher();
     }
 
+    public static caseInsensitiveString(text: string): Matcher {
+        return new CaseInsensitiveStringMatcher(text);
+    }
+
+    public static not(matcher: Matcher): Matcher {
+        return new NotMatcher(matcher);
+    }
+
+    public static arrayOrSingle(matcher: Matcher): Matcher {
+        return new ArrayOrSingleValueMatching(matcher);
+    }
+
     public static value(expected: unknown): Matcher {
         if (typeof expected === "object" && expected !== null && MATCHER_SYMBOL in expected) {
             return expected as Matcher;
@@ -61,6 +73,20 @@ class AnyValueMatcher extends Matcher {
     }
 }
 
+class ArrayOrSingleValueMatching extends Matcher {
+    constructor(private matcher: Matcher) {
+        super();
+    }
+
+    public match(other: unknown): number {
+        if (Array.isArray(other)) {
+            return other.length === 1 && this.matcher.match(other[0]) === 1 ? 1 : 0;
+        }
+
+        return this.matcher.match(other);
+    }
+}
+
 class NumberMatcher extends Matcher {
     constructor(private additionalFilter: (value: number) => boolean = () => true) {
         super();
@@ -76,6 +102,16 @@ class UndefinedMatcher extends Matcher {
     }
 }
 
+class NotMatcher extends Matcher {
+    constructor(private matcher: Matcher) {
+        super();
+    }
+
+    public match(actual: unknown): number {
+        return this.matcher.match(actual) === 1 ? 0 : 1;
+    }
+}
+
 class CompositeMatcher extends Matcher {
     constructor(private matchers: Matcher[]) {
         super();
@@ -112,6 +148,16 @@ class StringMatcher extends Matcher {
     }
 }
 
+class CaseInsensitiveStringMatcher extends Matcher {
+    constructor(private expected: string) {
+        super();
+    }
+
+    public match(actual: unknown): number {
+        return typeof actual === "string" && this.expected.toLocaleLowerCase() === actual.toLocaleLowerCase() ? 1 : 0;
+    }
+}
+
 class ValueMatcher extends Matcher {
     constructor(private expected: unknown) {
         super();

From d42ec2f7a046a468893452c63c9cb291e940924e Mon Sep 17 00:00:00 2001
From: Kevin Mas Ruiz <kevin.mas@hey.com>
Date: Fri, 24 Oct 2025 15:14:45 +0200
Subject: [PATCH 2/3] chore: fix metadata test

---
 .../tools/mongodb/read/aggregate.test.ts          | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/integration/tools/mongodb/read/aggregate.test.ts b/tests/integration/tools/mongodb/read/aggregate.test.ts
index d71ab4d91..2dcc89e47 100644
--- a/tests/integration/tools/mongodb/read/aggregate.test.ts
+++ b/tests/integration/tools/mongodb/read/aggregate.test.ts
@@ -27,8 +27,19 @@ describeWithMongoDB("aggregate tool", (integration) => {
         ...databaseCollectionParameters,
         {
             name: "pipeline",
-            description:
-                "An array of aggregation stages to execute. $vectorSearch can only appear as the first stage of the aggregation pipeline or as the first stage of a $unionWith subpipeline. When using $vectorSearch, unless the user explicitly asks for the embeddings, $unset any embedding field to avoid reaching context limits.",
+            description: `An array of aggregation stages to execute.  
+\`$vectorSearch\` **MUST** be the first stage of the pipeline, or the first stage of a \`$unionWith\` subpipeline.
+### Usage Rules for \`$vectorSearch\`
+- **Unset embeddings:**  
+  Unless the user explicitly requests the embeddings, add an \`$unset\` stage **at the end of the pipeline** to remove the embedding field and avoid context limits. **The $unset stage in this situation is mandatory**.
+- **Prefiltering:**
+If the user requests additional filtering, include filters in \`$vectorSearch.filter\` only for pre-filter fields in the vector index.
+    NEVER include fields in $vectorSearch.filter that are not part of the vector index.
+- **Post-filtering:**
+    For all remaining filters, add a $match stage after $vectorSearch.
+### Note to LLM
+- If unsure which fields are filterable, use the collection-indexes tool to determine valid prefilter fields.
+- If no requested filters are valid prefilters, omit the filter key from $vectorSearch.`,
             type: "array",
             required: true,
         },

From 132727b1cc235c8dd5ae466c4d103678c9dddf8d Mon Sep 17 00:00:00 2001
From: Kevin Mas Ruiz <kevin.mas@hey.com>
Date: Fri, 24 Oct 2025 16:36:16 +0200
Subject: [PATCH 3/3] chore: ensure that extra parameters are discarded

Voyage will reject all requests with extra parameters
---
 src/common/search/embeddingsProvider.ts       | 29 +++++++++++++++++--
 src/tools/mongodb/read/aggregate.ts           |  2 +-
 .../tools/mongodb/read/aggregate.test.ts      |  2 +-
 .../vectorSearchEmbeddingsManager.test.ts     |  2 +-
 4 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/common/search/embeddingsProvider.ts b/src/common/search/embeddingsProvider.ts
index efc93e436..f81537312 100644
--- a/src/common/search/embeddingsProvider.ts
+++ b/src/common/search/embeddingsProvider.ts
@@ -27,14 +27,33 @@ export const zVoyageModels = z
     .enum(["voyage-3-large", "voyage-3.5", "voyage-3.5-lite", "voyage-code-3"])
     .default("voyage-3-large");
 
+// Zod does not understand JS boxed numbers (like Int32) as integer literals,
+// so we preprocess them to unwrap the primitive value before validation.
+function unboxNumber(v: unknown): number {
+    if (v && typeof v === "object" && typeof v.valueOf === "function") {
+        const n = Number(v.valueOf());
+        if (!Number.isNaN(n)) return n;
+    }
+    return v as number;
+}
+
 export const zVoyageEmbeddingParameters = z.object({
     outputDimension: z
-        .union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)])
+        .preprocess(
+            unboxNumber,
+            z.union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)])
+        )
         .optional()
         .default(1024),
-    outputDType: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"),
+    outputDtype: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"),
 });
 
+const zVoyageAPIParameters = zVoyageEmbeddingParameters
+    .extend({
+        inputType: z.enum(["query", "document"]),
+    })
+    .strip();
+
 type VoyageModels = z.infer<typeof zVoyageModels>;
 type VoyageEmbeddingParameters = z.infer<typeof zVoyageEmbeddingParameters> & EmbeddingParameters;
 
@@ -62,11 +81,15 @@ class VoyageEmbeddingsProvider implements EmbeddingsProvider<VoyageModels, Voyag
         content: EmbeddingsInput[],
         parameters: VoyageEmbeddingParameters
     ): Promise<Embeddings[]> {
+        // This ensures that if we receive any random parameter from the outside (agent or us)
+        // it's stripped before sending it to Voyage, as Voyage will reject the request on
+        // a single unknown parameter.
+        const voyage = zVoyageAPIParameters.parse(parameters);
         const model = this.voyage.textEmbeddingModel(modelId);
         const { embeddings } = await embedMany({
             model,
             values: content,
-            providerOptions: { voyage: parameters },
+            providerOptions: { voyage },
         });
 
         return embeddings;
diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts
index d557d9d5f..c2ee5af3f 100644
--- a/src/tools/mongodb/read/aggregate.ts
+++ b/src/tools/mongodb/read/aggregate.ts
@@ -65,7 +65,7 @@ export const AggregateArgs = {
 ### Usage Rules for \`$vectorSearch\`
 - **Unset embeddings:**  
   Unless the user explicitly requests the embeddings, add an \`$unset\` stage **at the end of the pipeline** to remove the embedding field and avoid context limits. **The $unset stage in this situation is mandatory**.
-- **Prefiltering:**
+- **Pre-filtering:**
 If the user requests additional filtering, include filters in \`$vectorSearch.filter\` only for pre-filter fields in the vector index.
     NEVER include fields in $vectorSearch.filter that are not part of the vector index.
 - **Post-filtering:**
diff --git a/tests/integration/tools/mongodb/read/aggregate.test.ts b/tests/integration/tools/mongodb/read/aggregate.test.ts
index 2dcc89e47..4c5963a63 100644
--- a/tests/integration/tools/mongodb/read/aggregate.test.ts
+++ b/tests/integration/tools/mongodb/read/aggregate.test.ts
@@ -32,7 +32,7 @@ describeWithMongoDB("aggregate tool", (integration) => {
 ### Usage Rules for \`$vectorSearch\`
 - **Unset embeddings:**  
   Unless the user explicitly requests the embeddings, add an \`$unset\` stage **at the end of the pipeline** to remove the embedding field and avoid context limits. **The $unset stage in this situation is mandatory**.
-- **Prefiltering:**
+- **Pre-filtering:**
 If the user requests additional filtering, include filters in \`$vectorSearch.filter\` only for pre-filter fields in the vector index.
     NEVER include fields in $vectorSearch.filter that are not part of the vector index.
 - **Post-filtering:**
diff --git a/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts b/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts
index fe5e23c61..24b921e72 100644
--- a/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts
+++ b/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts
@@ -396,7 +396,7 @@ describe("VectorSearchEmbeddingsManager", () => {
             collection: "mycoll",
             path: "embedding_field",
             rawValues: ["oops"],
-            embeddingParameters: { model: "voyage-3-large", outputDimension: 1024, outputDType: "float" } as const,
+            embeddingParameters: { model: "voyage-3-large", outputDimension: 1024, outputDtype: "float" } as const,
             inputType: "query" as const,
         };