From e77cf2ef2055dc2831fb10508833468c828ba2b1 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 27 Oct 2025 20:36:17 +0100 Subject: [PATCH 01/11] chore: add validation for vector stage pre-filter --- src/common/logger.ts | 1 + .../collectFieldsFromVectorSearchFilter.ts | 22 + src/tools/mongodb/read/aggregate.ts | 70 ++ .../tools/mongodb/read/aggregate.test.ts | 616 +++++++++++------- ...ollectFieldsFromVectorSearchFilter.test.ts | 171 +++++ 5 files changed, 633 insertions(+), 247 deletions(-) create mode 100644 src/helpers/collectFieldsFromVectorSearchFilter.ts create mode 100644 tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts diff --git a/src/common/logger.ts b/src/common/logger.ts index 100191f91..ec2343e3b 100644 --- a/src/common/logger.ts +++ b/src/common/logger.ts @@ -49,6 +49,7 @@ export const LogId = { toolUpdateFailure: mongoLogId(1_005_001), resourceUpdateFailure: mongoLogId(1_005_002), updateToolMetadata: mongoLogId(1_005_003), + toolValidationError: mongoLogId(1_005_004), streamableHttpTransportStarted: mongoLogId(1_006_001), streamableHttpTransportSessionCloseFailure: mongoLogId(1_006_002), diff --git a/src/helpers/collectFieldsFromVectorSearchFilter.ts b/src/helpers/collectFieldsFromVectorSearchFilter.ts new file mode 100644 index 000000000..2862d4b69 --- /dev/null +++ b/src/helpers/collectFieldsFromVectorSearchFilter.ts @@ -0,0 +1,22 @@ +// Based on - +// https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/#mongodb-vector-search-pre-filter +const ALLOWED_LOGICAL_OPERATORS = ["$not", "$nor", "$and", "$or"]; + +export function collectFieldsFromVectorSearchFilter(filter: unknown): string[] { + if (!filter || typeof filter !== "object" || !Object.keys(filter).length) { + return []; + } + + const collectedFields = Object.entries(filter).reduce((collectedFields, [maybeField, fieldMQL]) => { + if (ALLOWED_LOGICAL_OPERATORS.includes(maybeField) && Array.isArray(fieldMQL)) { + return fieldMQL.flatMap((mql) => 
collectFieldsFromVectorSearchFilter(mql)); + } + + if (!ALLOWED_LOGICAL_OPERATORS.includes(maybeField)) { + collectedFields.push(maybeField); + } + return collectedFields; + }, []); + + return Array.from(new Set(collectedFields)); +} diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts index c2ee5af3f..fe26784f4 100644 --- a/src/tools/mongodb/read/aggregate.ts +++ b/src/tools/mongodb/read/aggregate.ts @@ -14,6 +14,7 @@ import { AGG_COUNT_MAX_TIME_MS_CAP, ONE_MB, CURSOR_LIMITS_TO_LLM_TEXT } from ".. import { zEJSON } from "../../args.js"; import { LogId } from "../../../common/logger.js"; import { zSupportedEmbeddingParameters } from "../../../common/search/embeddingsProvider.js"; +import { collectFieldsFromVectorSearchFilter } from "../../../helpers/collectFieldsFromVectorSearchFilter.js"; const AnyStage = zEJSON(); const VectorSearchStage = z.object({ @@ -97,6 +98,7 @@ export class AggregateTool extends MongoDBToolBase { try { const provider = await this.ensureConnected(); await this.assertOnlyUsesPermittedStages(pipeline); + await this.assertVectorSearchFilterFieldsAreIndexed(database, collection, pipeline); // Check if aggregate operation uses an index if enabled if (this.config.indexCheck) { @@ -202,6 +204,74 @@ export class AggregateTool extends MongoDBToolBase { } } + private async assertVectorSearchFilterFieldsAreIndexed( + database: string, + collection: string, + pipeline: Record[] + ): Promise { + if (!(await this.session.isSearchSupported())) { + return; + } + + const searchIndexesWithFilterFields = await this.searchIndexesWithFilterFields(database, collection); + for (const stage of pipeline) { + if ("$vectorSearch" in stage) { + const { $vectorSearch: vectorSearchStage } = stage as z.infer; + const allowedFilterFields = searchIndexesWithFilterFields[vectorSearchStage.index]; + if (!allowedFilterFields) { + this.session.logger.warning({ + id: LogId.toolValidationError, + context: "aggregate tool", + message: `Could not 
assert if filter fields are indexed - No filter fields found for index ${vectorSearchStage.index}`, + }); + return; + } + + const filterFieldsInStage = collectFieldsFromVectorSearchFilter(vectorSearchStage.filter); + const filterFieldsNotIndexed = filterFieldsInStage.filter( + (field) => !allowedFilterFields.includes(field) + ); + if (filterFieldsNotIndexed.length) { + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + `Vector search stage contains filter on fields are not indexed by index ${vectorSearchStage.index} - ${filterFieldsNotIndexed.join(", ")}` + ); + } + } + } + } + + private async searchIndexesWithFilterFields( + database: string, + collection: string + ): Promise> { + const searchIndexes = (await this.session.serviceProvider.getSearchIndexes(database, collection)) as Array<{ + name: string; + latestDefinition: { + fields: Array< + | { + type: "vector"; + } + | { + type: "filter"; + path: string; + } + >; + }; + }>; + + return searchIndexes.reduce>((indexFieldMap, searchIndex) => { + const filterFields = searchIndex.latestDefinition.fields + .map((field) => { + return field.type === "filter" ? 
field.path : undefined; + }) + .filter((filterField) => filterField !== undefined); + + indexFieldMap[searchIndex.name] = filterFields; + return indexFieldMap; + }, {}); + } + private async countAggregationResultDocuments({ provider, database, diff --git a/tests/integration/tools/mongodb/read/aggregate.test.ts b/tests/integration/tools/mongodb/read/aggregate.test.ts index 5c923b571..843e066bb 100644 --- a/tests/integration/tools/mongodb/read/aggregate.test.ts +++ b/tests/integration/tools/mongodb/read/aggregate.test.ts @@ -401,283 +401,405 @@ import { DOCUMENT_EMBEDDINGS } from "./vyai/embeddings.js"; describeWithMongoDB( "aggregate tool with atlas search enabled", (integration) => { - beforeEach(async () => { + beforeEach(async ({ skip }) => { + skip(!process.env.TEST_MDB_MCP_VOYAGE_API_KEY); await integration.mongoClient().db(integration.randomDbName()).collection("databases").drop(); }); for (const [dataType, embedding] of Object.entries(DOCUMENT_EMBEDDINGS)) { for (const similarity of ["euclidean", "cosine", "dotProduct"]) { - describe.skipIf(!process.env.TEST_MDB_MCP_VOYAGE_API_KEY)( - `querying with dataType ${dataType} and similarity ${similarity}`, - () => { - it(`should be able to return elements from within a vector search query with data type ${dataType}`, async () => { - await waitUntilSearchIsReady(integration.mongoClient()); - - const collection = integration - .mongoClient() - .db(integration.randomDbName()) - .collection("databases"); - await collection.insertOne({ name: "mongodb", description_embedding: embedding }); - - await createVectorSearchIndexAndWait( - integration.mongoClient(), - integration.randomDbName(), - "databases", - [ + describe(`querying with dataType ${dataType} and similarity ${similarity}`, () => { + it(`should be able to return elements from within a vector search query with data type ${dataType}`, async () => { + await waitUntilSearchIsReady(integration.mongoClient()); + + const collection = integration + .mongoClient() + 
.db(integration.randomDbName()) + .collection("databases"); + await collection.insertOne({ name: "mongodb", description_embedding: embedding }); + + await createVectorSearchIndexAndWait( + integration.mongoClient(), + integration.randomDbName(), + "databases", + [ + { + type: "vector", + path: "description_embedding", + numDimensions: 256, + similarity, + quantization: "none", + }, + ] + ); + + // now query the index + await integration.connectMcpClient(); + const response = await integration.mcpClient().callTool({ + name: "aggregate", + arguments: { + database: integration.randomDbName(), + collection: "databases", + pipeline: [ { - type: "vector", - path: "description_embedding", - numDimensions: 256, - similarity, - quantization: "none", - }, - ] - ); - - // now query the index - await integration.connectMcpClient(); - const response = await integration.mcpClient().callTool({ - name: "aggregate", - arguments: { - database: integration.randomDbName(), - collection: "databases", - pipeline: [ - { - $vectorSearch: { - index: "default", - path: "description_embedding", - queryVector: embedding, - numCandidates: 10, - limit: 10, - embeddingParameters: { - model: "voyage-3-large", - outputDimension: 256, - outputDType: dataType, - }, + $vectorSearch: { + index: "default", + path: "description_embedding", + queryVector: embedding, + numCandidates: 10, + limit: 10, + embeddingParameters: { + model: "voyage-3-large", + outputDimension: 256, + outputDType: dataType, }, }, - { - $project: { - description_embedding: 0, - }, + }, + { + $project: { + description_embedding: 0, }, - ], - }, - }); - - const responseContent = getResponseContent(response); - expect(responseContent).toContain( - "The aggregation resulted in 1 documents. Returning 1 documents." 
- ); - const untrustedDocs = getDocsFromUntrustedContent<{ name: string }>(responseContent); - expect(untrustedDocs).toHaveLength(1); - expect(untrustedDocs[0]?.name).toBe("mongodb"); + }, + ], + }, + }); + + const responseContent = getResponseContent(response); + expect(responseContent).toContain( + "The aggregation resulted in 1 documents. Returning 1 documents." + ); + const untrustedDocs = getDocsFromUntrustedContent<{ name: string }>(responseContent); + expect(untrustedDocs).toHaveLength(1); + expect(untrustedDocs[0]?.name).toBe("mongodb"); + }); + + it("should be able to return elements from within a vector search query using binary encoding", async () => { + await waitUntilSearchIsReady(integration.mongoClient()); + + const collection = integration + .mongoClient() + .db(integration.randomDbName()) + .collection("databases"); + await collection.insertOne({ + name: "mongodb", + description_embedding: BSON.Binary.fromFloat32Array(new Float32Array(embedding)), }); - it("should be able to return elements from within a vector search query using binary encoding", async () => { - await waitUntilSearchIsReady(integration.mongoClient()); - - const collection = integration - .mongoClient() - .db(integration.randomDbName()) - .collection("databases"); - await collection.insertOne({ - name: "mongodb", - description_embedding: BSON.Binary.fromFloat32Array(new Float32Array(embedding)), - }); - - await createVectorSearchIndexAndWait( - integration.mongoClient(), - integration.randomDbName(), - "databases", - [ + await createVectorSearchIndexAndWait( + integration.mongoClient(), + integration.randomDbName(), + "databases", + [ + { + type: "vector", + path: "description_embedding", + numDimensions: 256, + similarity, + quantization: "none", + }, + ] + ); + + // now query the index + await integration.connectMcpClient(); + const response = await integration.mcpClient().callTool({ + name: "aggregate", + arguments: { + database: integration.randomDbName(), + collection: 
"databases", + pipeline: [ { - type: "vector", - path: "description_embedding", - numDimensions: 256, - similarity, - quantization: "none", - }, - ] - ); - - // now query the index - await integration.connectMcpClient(); - const response = await integration.mcpClient().callTool({ - name: "aggregate", - arguments: { - database: integration.randomDbName(), - collection: "databases", - pipeline: [ - { - $vectorSearch: { - index: "default", - path: "description_embedding", - queryVector: embedding, - numCandidates: 10, - limit: 10, - embeddingParameters: { - model: "voyage-3-large", - outputDimension: 256, - outputDType: dataType, - }, + $vectorSearch: { + index: "default", + path: "description_embedding", + queryVector: embedding, + numCandidates: 10, + limit: 10, + embeddingParameters: { + model: "voyage-3-large", + outputDimension: 256, + outputDType: dataType, }, }, - { - $project: { - description_embedding: 0, - }, + }, + { + $project: { + description_embedding: 0, }, - ], - }, - }); - - const responseContent = getResponseContent(response); - expect(responseContent).toContain( - "The aggregation resulted in 1 documents. Returning 1 documents." 
- ); - const untrustedDocs = getDocsFromUntrustedContent<{ name: string }>(responseContent); - expect(untrustedDocs).toHaveLength(1); - expect(untrustedDocs[0]?.name).toBe("mongodb"); + }, + ], + }, }); - it("should be able too return elements from within a vector search query using scalar quantization", async () => { - await waitUntilSearchIsReady(integration.mongoClient()); - - const collection = integration - .mongoClient() - .db(integration.randomDbName()) - .collection("databases"); - await collection.insertOne({ - name: "mongodb", - description_embedding: BSON.Binary.fromFloat32Array(new Float32Array(embedding)), - }); - - await createVectorSearchIndexAndWait( - integration.mongoClient(), - integration.randomDbName(), - "databases", - [ + const responseContent = getResponseContent(response); + expect(responseContent).toContain( + "The aggregation resulted in 1 documents. Returning 1 documents." + ); + const untrustedDocs = getDocsFromUntrustedContent<{ name: string }>(responseContent); + expect(untrustedDocs).toHaveLength(1); + expect(untrustedDocs[0]?.name).toBe("mongodb"); + }); + + it("should be able too return elements from within a vector search query using scalar quantization", async () => { + await waitUntilSearchIsReady(integration.mongoClient()); + + const collection = integration + .mongoClient() + .db(integration.randomDbName()) + .collection("databases"); + await collection.insertOne({ + name: "mongodb", + description_embedding: BSON.Binary.fromFloat32Array(new Float32Array(embedding)), + }); + + await createVectorSearchIndexAndWait( + integration.mongoClient(), + integration.randomDbName(), + "databases", + [ + { + type: "vector", + path: "description_embedding", + numDimensions: 256, + similarity, + quantization: "scalar", + }, + ] + ); + + // now query the index + await integration.connectMcpClient(); + const response = await integration.mcpClient().callTool({ + name: "aggregate", + arguments: { + database: integration.randomDbName(), + 
collection: "databases", + pipeline: [ { - type: "vector", - path: "description_embedding", - numDimensions: 256, - similarity, - quantization: "scalar", - }, - ] - ); - - // now query the index - await integration.connectMcpClient(); - const response = await integration.mcpClient().callTool({ - name: "aggregate", - arguments: { - database: integration.randomDbName(), - collection: "databases", - pipeline: [ - { - $vectorSearch: { - index: "default", - path: "description_embedding", - queryVector: embedding, - numCandidates: 10, - limit: 10, - embeddingParameters: { - model: "voyage-3-large", - outputDimension: 256, - outputDType: dataType, - }, + $vectorSearch: { + index: "default", + path: "description_embedding", + queryVector: embedding, + numCandidates: 10, + limit: 10, + embeddingParameters: { + model: "voyage-3-large", + outputDimension: 256, + outputDType: dataType, }, }, - { - $project: { - description_embedding: 0, - }, + }, + { + $project: { + description_embedding: 0, }, - ], - }, - }); - - const responseContent = getResponseContent(response); - expect(responseContent).toContain( - "The aggregation resulted in 1 documents. Returning 1 documents." - ); - const untrustedDocs = getDocsFromUntrustedContent<{ name: string }>(responseContent); - expect(untrustedDocs).toHaveLength(1); - expect(untrustedDocs[0]?.name).toBe("mongodb"); + }, + ], + }, + }); + + const responseContent = getResponseContent(response); + expect(responseContent).toContain( + "The aggregation resulted in 1 documents. Returning 1 documents." 
+ ); + const untrustedDocs = getDocsFromUntrustedContent<{ name: string }>(responseContent); + expect(untrustedDocs).toHaveLength(1); + expect(untrustedDocs[0]?.name).toBe("mongodb"); + }); + + it("should be able too return elements from within a vector search query using binary quantization", async () => { + await waitUntilSearchIsReady(integration.mongoClient()); + + const collection = integration + .mongoClient() + .db(integration.randomDbName()) + .collection("databases"); + await collection.insertOne({ + name: "mongodb", + description_embedding: BSON.Binary.fromFloat32Array(new Float32Array(embedding)), }); - it("should be able too return elements from within a vector search query using binary quantization", async () => { - await waitUntilSearchIsReady(integration.mongoClient()); - - const collection = integration - .mongoClient() - .db(integration.randomDbName()) - .collection("databases"); - await collection.insertOne({ - name: "mongodb", - description_embedding: BSON.Binary.fromFloat32Array(new Float32Array(embedding)), - }); - - await createVectorSearchIndexAndWait( - integration.mongoClient(), - integration.randomDbName(), - "databases", - [ + await createVectorSearchIndexAndWait( + integration.mongoClient(), + integration.randomDbName(), + "databases", + [ + { + type: "vector", + path: "description_embedding", + numDimensions: 256, + similarity, + quantization: "binary", + }, + ] + ); + + // now query the index + await integration.connectMcpClient(); + const response = await integration.mcpClient().callTool({ + name: "aggregate", + arguments: { + database: integration.randomDbName(), + collection: "databases", + pipeline: [ { - type: "vector", - path: "description_embedding", - numDimensions: 256, - similarity, - quantization: "binary", - }, - ] - ); - - // now query the index - await integration.connectMcpClient(); - const response = await integration.mcpClient().callTool({ - name: "aggregate", - arguments: { - database: integration.randomDbName(), - 
collection: "databases", - pipeline: [ - { - $vectorSearch: { - index: "default", - path: "description_embedding", - queryVector: embedding, - numCandidates: 10, - limit: 10, - embeddingParameters: { - model: "voyage-3-large", - outputDimension: 256, - outputDType: dataType, - }, + $vectorSearch: { + index: "default", + path: "description_embedding", + queryVector: embedding, + numCandidates: 10, + limit: 10, + embeddingParameters: { + model: "voyage-3-large", + outputDimension: 256, + outputDType: dataType, }, }, - { - $project: { - description_embedding: 0, - }, + }, + { + $project: { + description_embedding: 0, }, - ], - }, - }); - - const responseContent = getResponseContent(response); - expect(responseContent).toContain( - "The aggregation resulted in 1 documents. Returning 1 documents." - ); - const untrustedDocs = getDocsFromUntrustedContent<{ name: string }>(responseContent); - expect(untrustedDocs).toHaveLength(1); - expect(untrustedDocs[0]?.name).toBe("mongodb"); + }, + ], + }, }); - } - ); + + const responseContent = getResponseContent(response); + expect(responseContent).toContain( + "The aggregation resulted in 1 documents. Returning 1 documents." 
+ ); + const untrustedDocs = getDocsFromUntrustedContent<{ name: string }>(responseContent); + expect(untrustedDocs).toHaveLength(1); + expect(untrustedDocs[0]?.name).toBe("mongodb"); + }); + }); } } + + describe("when querying with a pre-filter", () => { + it("should fail the validation if the vector search index does not index any pre-filter fields", async () => { + await waitUntilSearchIsReady(integration.mongoClient()); + + const collection = integration.mongoClient().db(integration.randomDbName()).collection("databases"); + await collection.insertOne({ name: "mongodb", description_embedding: DOCUMENT_EMBEDDINGS.float }); + + await createVectorSearchIndexAndWait( + integration.mongoClient(), + integration.randomDbName(), + "databases", + [ + { + type: "vector", + path: "description_embedding", + numDimensions: 256, + similarity: "euclidean", + quantization: "none", + }, + ] + ); + + // now query the index + await integration.connectMcpClient(); + const response = await integration.mcpClient().callTool({ + name: "aggregate", + arguments: { + database: integration.randomDbName(), + collection: "databases", + pipeline: [ + { + $vectorSearch: { + index: "default", + path: "description_embedding", + queryVector: DOCUMENT_EMBEDDINGS.float, + numCandidates: 10, + limit: 10, + embeddingParameters: { + model: "voyage-3-large", + outputDimension: 256, + outputDType: "float", + }, + filter: { name: 10 }, + }, + }, + { + $project: { + description_embedding: 0, + }, + }, + ], + }, + }); + + expect(response.isError).toBe(true); + expect(JSON.stringify(response.content)).toContain( + "Error running aggregate: Vector search stage contains filter on fields are not indexed by index default - name" + ); + }); + + it("should fail the validation if the pre-filter are not indexed as part of vector search index", async () => { + await waitUntilSearchIsReady(integration.mongoClient()); + + const collection = 
integration.mongoClient().db(integration.randomDbName()).collection("databases"); + await collection.insertOne({ name: "mongodb", description_embedding: DOCUMENT_EMBEDDINGS.float }); + + await createVectorSearchIndexAndWait( + integration.mongoClient(), + integration.randomDbName(), + "databases", + [ + { + type: "vector", + path: "description_embedding", + numDimensions: 256, + similarity: "euclidean", + quantization: "none", + }, + { + type: "filter", + path: "year", + }, + ] + ); + + // now query the index + await integration.connectMcpClient(); + const response = await integration.mcpClient().callTool({ + name: "aggregate", + arguments: { + database: integration.randomDbName(), + collection: "databases", + pipeline: [ + { + $vectorSearch: { + index: "default", + path: "description_embedding", + queryVector: DOCUMENT_EMBEDDINGS.float, + numCandidates: 10, + limit: 10, + embeddingParameters: { + model: "voyage-3-large", + outputDimension: 256, + outputDType: "float", + }, + filter: { name: 10 }, + }, + }, + { + $project: { + description_embedding: 0, + }, + }, + ], + }, + }); + + expect(response.isError).toBe(true); + expect(JSON.stringify(response.content)).toContain( + "Error running aggregate: Vector search stage contains filter on fields are not indexed by index default - name" + ); + }); + }); }, { getUserConfig: () => ({ diff --git a/tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts b/tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts new file mode 100644 index 000000000..d4ce7f728 --- /dev/null +++ b/tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts @@ -0,0 +1,171 @@ +import { describe, expect, it } from "vitest"; +import { collectFieldsFromVectorSearchFilter } from "../../../src/helpers/collectFieldsFromVectorSearchFilter.js"; + +describe("#collectFieldsFromVectorSearchFilter", () => { + it("should return empty list if filter is not an object or an empty object", () => { + 
expect(collectFieldsFromVectorSearchFilter(undefined)).toEqual([]); + expect(collectFieldsFromVectorSearchFilter(null)).toEqual([]); + expect(collectFieldsFromVectorSearchFilter(false)).toEqual([]); + expect(collectFieldsFromVectorSearchFilter(true)).toEqual([]); + expect(collectFieldsFromVectorSearchFilter(1)).toEqual([]); + expect(collectFieldsFromVectorSearchFilter(0)).toEqual([]); + expect(collectFieldsFromVectorSearchFilter("random")).toEqual([]); + expect(collectFieldsFromVectorSearchFilter({})).toEqual([]); + expect(collectFieldsFromVectorSearchFilter([])).toEqual([]); + expect(collectFieldsFromVectorSearchFilter(() => {})).toEqual([]); + }); + + it("should return fields from MQL that does not contain logical operators", () => { + expect( + collectFieldsFromVectorSearchFilter({ + field1: "MongoDB", + field2: { $eq: 1994 }, + field3: { $ne: "Horror" }, + field4: { $gt: 10 }, + field5: { $gt3: 10 }, + field6: { $lt: 10 }, + field7: { $lte: 10 }, + field8: { $in: [true, false] }, + field9: { $nin: [true, false] }, + field10: { $not: { $eq: 1994 } }, + }) + ).toEqual([ + "field1", + "field2", + "field3", + "field4", + "field5", + "field6", + "field7", + "field8", + "field9", + "field10", + ]); + }); + + it("should return fields from MQL built just with $and", () => { + expect( + collectFieldsFromVectorSearchFilter({ + $and: [ + { field1: "MongoDB" }, + { field2: { $eq: 1994 } }, + { field3: { $ne: "Horror" } }, + { field4: { $gt: 10 } }, + { field5: { $gt3: 10 } }, + { field6: { $lt: 10 } }, + { field7: { $lte: 10 } }, + { field8: { $in: [true, false] } }, + { field9: { $nin: [true, false] } }, + { field10: { $not: { $eq: 1994 } } }, + ], + }) + ).toEqual([ + "field1", + "field2", + "field3", + "field4", + "field5", + "field6", + "field7", + "field8", + "field9", + "field10", + ]); + }); + + it("should return fields from MQL built just with $or", () => { + expect( + collectFieldsFromVectorSearchFilter({ + $or: [ + { field1: "MongoDB" }, + { field2: { $eq: 1994 } 
}, + { field3: { $ne: "Horror" } }, + { field4: { $gt: 10 } }, + { field5: { $gt3: 10 } }, + { field6: { $lt: 10 } }, + { field7: { $lte: 10 } }, + { field8: { $in: [true, false] } }, + { field9: { $nin: [true, false] } }, + { field10: { $not: { $eq: 1994 } } }, + ], + }) + ).toEqual([ + "field1", + "field2", + "field3", + "field4", + "field5", + "field6", + "field7", + "field8", + "field9", + "field10", + ]); + }); + + it("should return fields from MQL built with nested $and / $or", () => { + expect( + collectFieldsFromVectorSearchFilter({ + $or: [ + { field1: "MongoDB" }, + { field2: { $eq: 1994 } }, + { field3: { $ne: "Horror" } }, + { field4: { $gt: 10 } }, + { field5: { $gt3: 10 } }, + { field6: { $lt: 10 } }, + { + $and: [ + { field7: { $lte: 10 } }, + { field8: { $in: [true, false] } }, + { field9: { $nin: [true, false] } }, + { field10: { $not: { $eq: 1994 } } }, + ], + }, + ], + }) + ).toEqual([ + "field1", + "field2", + "field3", + "field4", + "field5", + "field6", + "field7", + "field8", + "field9", + "field10", + ]); + + expect( + collectFieldsFromVectorSearchFilter({ + $and: [ + { field1: "MongoDB" }, + { field2: { $eq: 1994 } }, + { field3: { $ne: "Horror" } }, + { field4: { $gt: 10 } }, + { field5: { $gt3: 10 } }, + { field6: { $lt: 10 } }, + { + $or: [ + { field7: { $lte: 10 } }, + { field8: { $in: [true, false] } }, + { field9: { $nin: [true, false] } }, + { field10: { $not: { $eq: 1994 } } }, + ], + }, + ], + }) + ).toEqual([ + "field1", + "field2", + "field3", + "field4", + "field5", + "field6", + "field7", + "field8", + "field9", + "field10", + ]); + }); +}); From 66f69ab5a2eaca77b941b77ae3748b14b44dae0a Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 27 Oct 2025 20:41:09 +0100 Subject: [PATCH 02/11] Copy fixes Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/tools/mongodb/read/aggregate.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/mongodb/read/aggregate.ts 
b/src/tools/mongodb/read/aggregate.ts index fe26784f4..bef3c78e9 100644 --- a/src/tools/mongodb/read/aggregate.ts +++ b/src/tools/mongodb/read/aggregate.ts @@ -234,7 +234,7 @@ export class AggregateTool extends MongoDBToolBase { if (filterFieldsNotIndexed.length) { throw new MongoDBError( ErrorCodes.AtlasVectorSearchInvalidQuery, - `Vector search stage contains filter on fields are not indexed by index ${vectorSearchStage.index} - ${filterFieldsNotIndexed.join(", ")}` + `Vector search stage contains filter on fields that are not indexed by index ${vectorSearchStage.index} - ${filterFieldsNotIndexed.join(", ")}` ); } } From 697a729a1543305f5d381c1eeae41a95cff9703c Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 27 Oct 2025 20:41:29 +0100 Subject: [PATCH 03/11] Update tests/integration/tools/mongodb/read/aggregate.test.ts Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/integration/tools/mongodb/read/aggregate.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/tools/mongodb/read/aggregate.test.ts b/tests/integration/tools/mongodb/read/aggregate.test.ts index 843e066bb..88b96406a 100644 --- a/tests/integration/tools/mongodb/read/aggregate.test.ts +++ b/tests/integration/tools/mongodb/read/aggregate.test.ts @@ -733,7 +733,7 @@ describeWithMongoDB( expect(response.isError).toBe(true); expect(JSON.stringify(response.content)).toContain( - "Error running aggregate: Vector search stage contains filter on fields are not indexed by index default - name" + "Error running aggregate: Vector search stage contains filter on fields that are not indexed by index default - name" ); }); From 998e6aad9a731900aae7c27d0671a90400319caa Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 27 Oct 2025 20:41:35 +0100 Subject: [PATCH 04/11] Update tests/integration/tools/mongodb/read/aggregate.test.ts Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- 
tests/integration/tools/mongodb/read/aggregate.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/tools/mongodb/read/aggregate.test.ts b/tests/integration/tools/mongodb/read/aggregate.test.ts index 88b96406a..6876e189d 100644 --- a/tests/integration/tools/mongodb/read/aggregate.test.ts +++ b/tests/integration/tools/mongodb/read/aggregate.test.ts @@ -796,7 +796,7 @@ describeWithMongoDB( expect(response.isError).toBe(true); expect(JSON.stringify(response.content)).toContain( - "Error running aggregate: Vector search stage contains filter on fields are not indexed by index default - name" + "Error running aggregate: Vector search stage contains filter on fields that are not indexed by index default - name" ); }); }); From 92d5840cf0d94cb6f02d4c179e212c85844ccf63 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 27 Oct 2025 21:26:43 +0100 Subject: [PATCH 05/11] chore: another test case --- .../tools/mongodb/read/aggregate.test.ts | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/tests/integration/tools/mongodb/read/aggregate.test.ts b/tests/integration/tools/mongodb/read/aggregate.test.ts index 6876e189d..e0e75b9a4 100644 --- a/tests/integration/tools/mongodb/read/aggregate.test.ts +++ b/tests/integration/tools/mongodb/read/aggregate.test.ts @@ -799,6 +799,69 @@ describeWithMongoDB( "Error running aggregate: Vector search stage contains filter on fields that are not indexed by index default - name" ); }); + + it("should succeed the validation if the pre-filter are also indexed as part of vector search index", async () => { + await waitUntilSearchIsReady(integration.mongoClient()); + + const collection = integration.mongoClient().db(integration.randomDbName()).collection("databases"); + await collection.insertOne({ name: "mongodb", description_embedding: DOCUMENT_EMBEDDINGS.float }); + + await createVectorSearchIndexAndWait( + integration.mongoClient(), + integration.randomDbName(), + "databases", + [ 
+ { + type: "vector", + path: "description_embedding", + numDimensions: 256, + similarity: "euclidean", + quantization: "none", + }, + { + type: "filter", + path: "name", + }, + ] + ); + + // now query the index + await integration.connectMcpClient(); + const response = await integration.mcpClient().callTool({ + name: "aggregate", + arguments: { + database: integration.randomDbName(), + collection: "databases", + pipeline: [ + { + $vectorSearch: { + index: "default", + path: "description_embedding", + queryVector: DOCUMENT_EMBEDDINGS.float, + numCandidates: 10, + limit: 10, + embeddingParameters: { + model: "voyage-3-large", + outputDimension: 256, + outputDType: "float", + }, + filter: { name: 10 }, + }, + }, + { + $project: { + description_embedding: 0, + }, + }, + ], + }, + }); + + expect(!!response.isError).toBe(false); + expect(JSON.stringify(response.content)).toContain( + "The aggregation resulted in 0 documents. Returning 0 documents." + ); + }); }); }, { From e266c5876137d37616c5212b408c089b5497f5f2 Mon Sep 17 00:00:00 2001 From: gagik Date: Fri, 31 Oct 2025 16:50:58 +0100 Subject: [PATCH 06/11] chore: move helper into its own independent method --- src/common/search/embeddingsProvider.ts | 47 +------ .../search/vectorSearchEmbeddingsManager.ts | 3 +- .../collectFieldsFromVectorSearchFilter.ts | 68 +++++++++ src/tools/mongodb/mongodbSchemas.ts | 86 ++++++++++++ src/tools/mongodb/read/aggregate.ts | 129 ++---------------- 5 files changed, 175 insertions(+), 158 deletions(-) create mode 100644 src/tools/mongodb/mongodbSchemas.ts diff --git a/src/common/search/embeddingsProvider.ts b/src/common/search/embeddingsProvider.ts index b87906ef2..96b3ea61f 100644 --- a/src/common/search/embeddingsProvider.ts +++ b/src/common/search/embeddingsProvider.ts @@ -4,13 +4,15 @@ import { embedMany } from "ai"; import type { UserConfig } from "../config.js"; import assert from "assert"; import { createFetch } from "@mongodb-js/devtools-proxy-support"; -import { z } from 
"zod"; +import { + type EmbeddingParameters, + type VoyageEmbeddingParameters, + type VoyageModels, + zVoyageAPIParameters, +} from "../../tools/mongodb/mongodbSchemas.js"; type EmbeddingsInput = string; type Embeddings = number[] | unknown[]; -export type EmbeddingParameters = { - inputType: "query" | "document"; -}; export interface EmbeddingsProvider< SupportedModels extends string, @@ -23,40 +25,6 @@ export interface EmbeddingsProvider< ): Promise; } -export const zVoyageModels = z - .enum(["voyage-3-large", "voyage-3.5", "voyage-3.5-lite", "voyage-code-3"]) - .default("voyage-3-large"); - -// Zod does not undestand JS boxed numbers (like Int32) as integer literals, -// so we preprocess them to unwrap them so Zod understands them. -function unboxNumber(v: unknown): number { - if (v && typeof v === "object" && typeof v.valueOf === "function") { - const n = Number(v.valueOf()); - if (!Number.isNaN(n)) return n; - } - return v as number; -} - -export const zVoyageEmbeddingParameters = z.object({ - outputDimension: z - .preprocess( - unboxNumber, - z.union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)]) - ) - .optional() - .default(1024), - outputDtype: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"), -}); - -const zVoyageAPIParameters = zVoyageEmbeddingParameters - .extend({ - inputType: z.enum(["query", "document"]), - }) - .strip(); - -type VoyageModels = z.infer; -type VoyageEmbeddingParameters = z.infer & EmbeddingParameters; - class VoyageEmbeddingsProvider implements EmbeddingsProvider { private readonly voyage: VoyageProvider; @@ -105,6 +73,3 @@ export function getEmbeddingsProvider( return undefined; } - -export const zSupportedEmbeddingParameters = zVoyageEmbeddingParameters.extend({ model: zVoyageModels }); -export type SupportedEmbeddingParameters = z.infer; diff --git a/src/common/search/vectorSearchEmbeddingsManager.ts b/src/common/search/vectorSearchEmbeddingsManager.ts index 
e570f064b..dfcd4e28e 100644 --- a/src/common/search/vectorSearchEmbeddingsManager.ts +++ b/src/common/search/vectorSearchEmbeddingsManager.ts @@ -5,9 +5,10 @@ import type { ConnectionManager } from "../connectionManager.js"; import z from "zod"; import { ErrorCodes, MongoDBError } from "../errors.js"; import { getEmbeddingsProvider } from "./embeddingsProvider.js"; -import type { EmbeddingParameters, SupportedEmbeddingParameters } from "./embeddingsProvider.js"; +import type { EmbeddingParameters } from "../../tools/mongodb/mongodbSchemas.js"; import { formatUntrustedData } from "../../tools/tool.js"; import type { Similarity } from "../schemas.js"; +import type { SupportedEmbeddingParameters } from "../../tools/mongodb/mongodbSchemas.js"; export const quantizationEnum = z.enum(["none", "scalar", "binary"]); export type Quantization = z.infer; diff --git a/src/helpers/collectFieldsFromVectorSearchFilter.ts b/src/helpers/collectFieldsFromVectorSearchFilter.ts index 2862d4b69..a7db59125 100644 --- a/src/helpers/collectFieldsFromVectorSearchFilter.ts +++ b/src/helpers/collectFieldsFromVectorSearchFilter.ts @@ -1,7 +1,75 @@ // Based on - + +import type z from "zod"; +import { ErrorCodes, MongoDBError } from "../common/errors.js"; +import type { VectorSearchStage } from "../tools/mongodb/mongodbSchemas.js"; +import { type CompositeLogger, LogId } from "../common/logger.js"; + // https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/#mongodb-vector-search-pre-filter const ALLOWED_LOGICAL_OPERATORS = ["$not", "$nor", "$and", "$or"]; +export type VectorSearchIndex = { + name: string; + latestDefinition: { + fields: Array< + | { + type: "vector"; + } + | { + type: "filter"; + path: string; + } + >; + }; +}; + +export function assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger, +}: { + searchIndexes: VectorSearchIndex[]; + pipeline: Record[]; + logger: CompositeLogger; +}): void { + const searchIndexesWithFilterFields = 
searchIndexes.reduce>( + (indexFieldMap, searchIndex) => { + const filterFields = searchIndex.latestDefinition.fields + .map((field) => { + return field.type === "filter" ? field.path : undefined; + }) + .filter((filterField) => filterField !== undefined); + + indexFieldMap[searchIndex.name] = filterFields; + return indexFieldMap; + }, + {} + ); + for (const stage of pipeline) { + if ("$vectorSearch" in stage) { + const { $vectorSearch: vectorSearchStage } = stage as z.infer; + const allowedFilterFields = searchIndexesWithFilterFields[vectorSearchStage.index]; + if (!allowedFilterFields) { + logger.warning({ + id: LogId.toolValidationError, + context: "aggregate tool", + message: `Could not assert if filter fields are indexed - No filter fields found for index ${vectorSearchStage.index}`, + }); + return; + } + + const filterFieldsInStage = collectFieldsFromVectorSearchFilter(vectorSearchStage.filter); + const filterFieldsNotIndexed = filterFieldsInStage.filter((field) => !allowedFilterFields.includes(field)); + if (filterFieldsNotIndexed.length) { + throw new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + `Vector search stage contains filter on fields that are not indexed by index ${vectorSearchStage.index} - ${filterFieldsNotIndexed.join(", ")}` + ); + } + } + } +} + export function collectFieldsFromVectorSearchFilter(filter: unknown): string[] { if (!filter || typeof filter !== "object" || !Object.keys(filter).length) { return []; diff --git a/src/tools/mongodb/mongodbSchemas.ts b/src/tools/mongodb/mongodbSchemas.ts new file mode 100644 index 000000000..cfae16b23 --- /dev/null +++ b/src/tools/mongodb/mongodbSchemas.ts @@ -0,0 +1,86 @@ +import z from "zod"; +import { zEJSON } from "../args.js"; + +export const zVoyageModels = z + .enum(["voyage-3-large", "voyage-3.5", "voyage-3.5-lite", "voyage-code-3"]) + .default("voyage-3-large"); + +// Zod does not understand JS boxed numbers (like Int32) as integer literals, +// so we preprocess them to unwrap
them so Zod understands them. +function unboxNumber(v: unknown): number { + if (v && typeof v === "object" && typeof v.valueOf === "function") { + const n = Number(v.valueOf()); + if (!Number.isNaN(n)) return n; + } + return v as number; +} + +export const zVoyageEmbeddingParameters = z.object({ + outputDimension: z + .preprocess( + unboxNumber, + z.union([z.literal(256), z.literal(512), z.literal(1024), z.literal(2048), z.literal(4096)]) + ) + .optional() + .default(1024), + outputDtype: z.enum(["float", "int8", "uint8", "binary", "ubinary"]).optional().default("float"), +}); + +export const zVoyageAPIParameters = zVoyageEmbeddingParameters + .extend({ + inputType: z.enum(["query", "document"]), + }) + .strip(); + +export type VoyageModels = z.infer; +export type VoyageEmbeddingParameters = z.infer & EmbeddingParameters; + +export type EmbeddingParameters = { + inputType: "query" | "document"; +}; + +export const zSupportedEmbeddingParameters = zVoyageEmbeddingParameters.extend({ model: zVoyageModels }); +export type SupportedEmbeddingParameters = z.infer; + +export const AnyVectorSearchStage = zEJSON(); +export const VectorSearchStage = z.object({ + $vectorSearch: z + .object({ + exact: z + .boolean() + .optional() + .default(false) + .describe( + "When true, uses an ENN algorithm, otherwise uses ANN. Using ENN is not compatible with numCandidates, in that case, numCandidates must be left empty." + ), + index: z.string().describe("Name of the index, as retrieved from the `collection-indexes` tool."), + path: z + .string() + .describe( + "Field, in dot notation, where to search. There must be a vector search index for that field. Note to LLM: When unsure, use the 'collection-indexes' tool to validate that the field is indexed with a vector search index." + ), + queryVector: z + .union([z.string(), z.array(z.number())]) + .describe( + "The content to search for. 
The embeddingParameters field is mandatory if the queryVector is a string, in that case, the tool generates the embedding automatically using the provided configuration." + ), + numCandidates: z + .number() + .int() + .positive() + .optional() + .describe("Number of candidates for the ANN algorithm. Mandatory when exact is false."), + limit: z.number().int().positive().optional().default(10), + filter: zEJSON() + .optional() + .describe( + "MQL filter that can only use filter fields from the index definition. Note to LLM: If unsure, use the `collection-indexes` tool to learn which fields can be used for filtering." + ), + embeddingParameters: zSupportedEmbeddingParameters + .optional() + .describe( + "The embedding model and its parameters to use to generate embeddings before searching. It is mandatory if queryVector is a string value. Note to LLM: If unsure, ask the user before providing one." + ), + }) + .passthrough(), +}); diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts index a53d8b9fd..5e027e13a 100644 --- a/src/tools/mongodb/read/aggregate.ts +++ b/src/tools/mongodb/read/aggregate.ts @@ -11,56 +11,15 @@ import { ErrorCodes, MongoDBError } from "../../../common/errors.js"; import { collectCursorUntilMaxBytesLimit } from "../../../helpers/collectCursorUntilMaxBytes.js"; import { operationWithFallback } from "../../../helpers/operationWithFallback.js"; import { AGG_COUNT_MAX_TIME_MS_CAP, ONE_MB, CURSOR_LIMITS_TO_LLM_TEXT } from "../../../helpers/constants.js"; -import { zEJSON } from "../../args.js"; import { LogId } from "../../../common/logger.js"; -import { zSupportedEmbeddingParameters } from "../../../common/search/embeddingsProvider.js"; -import { collectFieldsFromVectorSearchFilter } from "../../../helpers/collectFieldsFromVectorSearchFilter.js"; - -const AnyStage = zEJSON(); -const VectorSearchStage = z.object({ - $vectorSearch: z - .object({ - exact: z - .boolean() - .optional() - .default(false) - .describe( - 
"When true, uses an ENN algorithm, otherwise uses ANN. Using ENN is not compatible with numCandidates, in that case, numCandidates must be left empty." - ), - index: z.string().describe("Name of the index, as retrieved from the `collection-indexes` tool."), - path: z - .string() - .describe( - "Field, in dot notation, where to search. There must be a vector search index for that field. Note to LLM: When unsure, use the 'collection-indexes' tool to validate that the field is indexed with a vector search index." - ), - queryVector: z - .union([z.string(), z.array(z.number())]) - .describe( - "The content to search for. The embeddingParameters field is mandatory if the queryVector is a string, in that case, the tool generates the embedding automatically using the provided configuration." - ), - numCandidates: z - .number() - .int() - .positive() - .optional() - .describe("Number of candidates for the ANN algorithm. Mandatory when exact is false."), - limit: z.number().int().positive().optional().default(10), - filter: zEJSON() - .optional() - .describe( - "MQL filter that can only use filter fields from the index definition. Note to LLM: If unsure, use the `collection-indexes` tool to learn which fields can be used for filtering." - ), - embeddingParameters: zSupportedEmbeddingParameters - .optional() - .describe( - "The embedding model and its parameters to use to generate embeddings before searching. It is mandatory if queryVector is a string value. Note to LLM: If unsure, ask the user before providing one." 
- ), - }) - .passthrough(), -}); +import { AnyVectorSearchStage, VectorSearchStage } from "../mongodbSchemas.js"; +import { + assertVectorSearchFilterFieldsAreIndexed, + type VectorSearchIndex, +} from "../../../helpers/collectFieldsFromVectorSearchFilter.js"; export const AggregateArgs = { - pipeline: z.array(z.union([AnyStage, VectorSearchStage])).describe( + pipeline: z.array(z.union([AnyVectorSearchStage, VectorSearchStage])).describe( `An array of aggregation stages to execute. \`$vectorSearch\` **MUST** be the first stage of the pipeline, or the first stage of a \`$unionWith\` subpipeline. ### Usage Rules for \`$vectorSearch\` @@ -98,7 +57,13 @@ export class AggregateTool extends MongoDBToolBase { try { const provider = await this.ensureConnected(); await this.assertOnlyUsesPermittedStages(pipeline); - await this.assertVectorSearchFilterFieldsAreIndexed(database, collection, pipeline); + if (await this.session.isSearchSupported()) { + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes: (await provider.getSearchIndexes(database, collection)) as VectorSearchIndex[], + pipeline, + logger: this.session.logger, + }); + } // Check if aggregate operation uses an index if enabled if (this.config.indexCheck) { @@ -220,74 +185,6 @@ export class AggregateTool extends MongoDBToolBase { } } - private async assertVectorSearchFilterFieldsAreIndexed( - database: string, - collection: string, - pipeline: Record[] - ): Promise { - if (!(await this.session.isSearchSupported())) { - return; - } - - const searchIndexesWithFilterFields = await this.searchIndexesWithFilterFields(database, collection); - for (const stage of pipeline) { - if ("$vectorSearch" in stage) { - const { $vectorSearch: vectorSearchStage } = stage as z.infer; - const allowedFilterFields = searchIndexesWithFilterFields[vectorSearchStage.index]; - if (!allowedFilterFields) { - this.session.logger.warning({ - id: LogId.toolValidationError, - context: "aggregate tool", - message: `Could not assert if 
filter fields are indexed - No filter fields found for index ${vectorSearchStage.index}`, - }); - return; - } - - const filterFieldsInStage = collectFieldsFromVectorSearchFilter(vectorSearchStage.filter); - const filterFieldsNotIndexed = filterFieldsInStage.filter( - (field) => !allowedFilterFields.includes(field) - ); - if (filterFieldsNotIndexed.length) { - throw new MongoDBError( - ErrorCodes.AtlasVectorSearchInvalidQuery, - `Vector search stage contains filter on fields that are not indexed by index ${vectorSearchStage.index} - ${filterFieldsNotIndexed.join(", ")}` - ); - } - } - } - } - - private async searchIndexesWithFilterFields( - database: string, - collection: string - ): Promise> { - const searchIndexes = (await this.session.serviceProvider.getSearchIndexes(database, collection)) as Array<{ - name: string; - latestDefinition: { - fields: Array< - | { - type: "vector"; - } - | { - type: "filter"; - path: string; - } - >; - }; - }>; - - return searchIndexes.reduce>((indexFieldMap, searchIndex) => { - const filterFields = searchIndex.latestDefinition.fields - .map((field) => { - return field.type === "filter" ? 
field.path : undefined; - }) - .filter((filterField) => filterField !== undefined); - - indexFieldMap[searchIndex.name] = filterFields; - return indexFieldMap; - }, {}); - } - private async countAggregationResultDocuments({ provider, database, From a1eccb97a58eed4aee42308b9a0d90eab94119ff Mon Sep 17 00:00:00 2001 From: gagik Date: Fri, 31 Oct 2025 16:53:13 +0100 Subject: [PATCH 07/11] chore: rename file --- ...rchFilter.ts => assertVectorSearchFilterFieldsAreIndexed.ts} | 0 src/tools/mongodb/read/aggregate.ts | 2 +- tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename src/helpers/{collectFieldsFromVectorSearchFilter.ts => assertVectorSearchFilterFieldsAreIndexed.ts} (100%) diff --git a/src/helpers/collectFieldsFromVectorSearchFilter.ts b/src/helpers/assertVectorSearchFilterFieldsAreIndexed.ts similarity index 100% rename from src/helpers/collectFieldsFromVectorSearchFilter.ts rename to src/helpers/assertVectorSearchFilterFieldsAreIndexed.ts diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts index 5e027e13a..d6a624cf7 100644 --- a/src/tools/mongodb/read/aggregate.ts +++ b/src/tools/mongodb/read/aggregate.ts @@ -16,7 +16,7 @@ import { AnyVectorSearchStage, VectorSearchStage } from "../mongodbSchemas.js"; import { assertVectorSearchFilterFieldsAreIndexed, type VectorSearchIndex, -} from "../../../helpers/collectFieldsFromVectorSearchFilter.js"; +} from "../../../helpers/assertVectorSearchFilterFieldsAreIndexed.js"; export const AggregateArgs = { pipeline: z.array(z.union([AnyVectorSearchStage, VectorSearchStage])).describe( diff --git a/tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts b/tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts index d4ce7f728..52a8cc005 100644 --- a/tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts +++ b/tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts @@ -1,5 +1,5 @@ import { 
describe, expect, it } from "vitest"; -import { collectFieldsFromVectorSearchFilter } from "../../../src/helpers/collectFieldsFromVectorSearchFilter.js"; +import { collectFieldsFromVectorSearchFilter } from "../../../src/helpers/assertVectorSearchFilterFieldsAreIndexed.js"; describe("#collectFieldsFromVectorSearchFilter", () => { it("should return empty list if filter is not an object or an empty object", () => { From 15780faccb7d03d809ba751d5765c9cad510f5d8 Mon Sep 17 00:00:00 2001 From: gagik Date: Fri, 31 Oct 2025 16:53:26 +0100 Subject: [PATCH 08/11] chore: rename test --- ...r.test.ts => assertVectorSearchFilterFieldsAreIndexed.test.ts} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/unit/helpers/{collectFieldsFromVectorSearchFilter.test.ts => assertVectorSearchFilterFieldsAreIndexed.test.ts} (100%) diff --git a/tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts b/tests/unit/helpers/assertVectorSearchFilterFieldsAreIndexed.test.ts similarity index 100% rename from tests/unit/helpers/collectFieldsFromVectorSearchFilter.test.ts rename to tests/unit/helpers/assertVectorSearchFilterFieldsAreIndexed.test.ts From ac095d4f8756ad0040863de286d0c94c167b7088 Mon Sep 17 00:00:00 2001 From: gagik Date: Fri, 31 Oct 2025 16:57:16 +0100 Subject: [PATCH 09/11] chore: fix build --- src/tools/mongodb/create/insertMany.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index 86aec3203..fcde13164 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -4,7 +4,7 @@ import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { type ToolArgs, type OperationType, formatUntrustedData } from "../../tool.js"; import { zEJSON } from "../../args.js"; import { type Document } from "bson"; -import { zSupportedEmbeddingParameters } from "../../../common/search/embeddingsProvider.js"; +import { 
zSupportedEmbeddingParameters } from "../mongodbSchemas.js"; import { ErrorCodes, MongoDBError } from "../../../common/errors.js"; const zSupportedEmbeddingParametersWithInput = zSupportedEmbeddingParameters.extend({ From 69c7fb7ac31919d535fef5d5233a2d0b10fc872a Mon Sep 17 00:00:00 2001 From: gagik Date: Fri, 31 Oct 2025 17:03:19 +0100 Subject: [PATCH 10/11] chore: fix style --- tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts b/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts index 9b00e2e38..2bf05146c 100644 --- a/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts +++ b/tests/unit/common/search/vectorSearchEmbeddingsManager.test.ts @@ -14,10 +14,10 @@ import type { InsertOneResult } from "mongodb"; import type { DropDatabaseResult } from "@mongosh/service-provider-node-driver/lib/node-driver-service-provider.js"; import EventEmitter from "events"; import { - type EmbeddingParameters, type EmbeddingsProvider, type getEmbeddingsProvider, } from "../../../../src/common/search/embeddingsProvider.js"; +import type { EmbeddingParameters } from "../../../../src/tools/mongodb/mongodbSchemas.js"; type MockedServiceProvider = NodeDriverServiceProvider & { getSearchIndexes: MockedFunction; From dfbb803fa483ed3697c051d92d4ff85f90294f2a Mon Sep 17 00:00:00 2001 From: gagik Date: Fri, 31 Oct 2025 17:25:30 +0100 Subject: [PATCH 11/11] chore: add more tests to assert --- ...VectorSearchFilterFieldsAreIndexed.test.ts | 419 +++++++++++++++++- 1 file changed, 417 insertions(+), 2 deletions(-) diff --git a/tests/unit/helpers/assertVectorSearchFilterFieldsAreIndexed.test.ts b/tests/unit/helpers/assertVectorSearchFilterFieldsAreIndexed.test.ts index 52a8cc005..a437a916d 100644 --- a/tests/unit/helpers/assertVectorSearchFilterFieldsAreIndexed.test.ts +++ 
b/tests/unit/helpers/assertVectorSearchFilterFieldsAreIndexed.test.ts @@ -1,5 +1,11 @@ -import { describe, expect, it } from "vitest"; -import { collectFieldsFromVectorSearchFilter } from "../../../src/helpers/assertVectorSearchFilterFieldsAreIndexed.js"; +import { describe, expect, it, vi } from "vitest"; +import { + assertVectorSearchFilterFieldsAreIndexed, + collectFieldsFromVectorSearchFilter, + type VectorSearchIndex, +} from "../../../src/helpers/assertVectorSearchFilterFieldsAreIndexed.js"; +import { ErrorCodes, MongoDBError } from "../../../src/common/errors.js"; +import { type CompositeLogger, LogId } from "../../../src/common/logger.js"; describe("#collectFieldsFromVectorSearchFilter", () => { it("should return empty list if filter is not an object or an empty object", () => { @@ -169,3 +175,412 @@ describe("#collectFieldsFromVectorSearchFilter", () => { ]); }); }); + +describe("#assertVectorSearchFilterFieldsAreIndexed", () => { + const mockLogger = { + debug: vi.fn(), + info: vi.fn(), + warning: vi.fn(), + error: vi.fn(), + } as unknown as CompositeLogger; + + const createMockSearchIndexes = (indexName: string, filterFields: string[]): VectorSearchIndex[] => [ + { + name: indexName, + latestDefinition: { + fields: [ + { type: "vector" as const }, + ...filterFields.map((field) => ({ + type: "filter" as const, + path: field, + })), + ], + }, + }, + ]; + + it("should not throw when all filter fields are indexed", () => { + const searchIndexes = createMockSearchIndexes("myIndex", ["field1", "field2", "field3"]); + const pipeline = [ + { + $vectorSearch: { + index: "myIndex", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + filter: { + field1: "value", + field2: { $eq: 10 }, + }, + }, + }, + ]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).not.toThrow(); + }); + + it("should not throw when filter is empty", () => { + const searchIndexes = 
createMockSearchIndexes("myIndex", ["field1"]); + const pipeline = [ + { + $vectorSearch: { + index: "myIndex", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + filter: {}, + }, + }, + ]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).not.toThrow(); + }); + + it("should not throw when filter is not provided", () => { + const searchIndexes = createMockSearchIndexes("myIndex", ["field1"]); + const pipeline = [ + { + $vectorSearch: { + index: "myIndex", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + }, + }, + ]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).not.toThrow(); + }); + + it("should not throw when pipeline has no $vectorSearch stage", () => { + const searchIndexes = createMockSearchIndexes("myIndex", ["field1"]); + const pipeline = [{ $match: { status: "active" } }, { $limit: 10 }]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).not.toThrow(); + }); + + it("should throw MongoDBError when filter field is not indexed", () => { + const searchIndexes = createMockSearchIndexes("myIndex", ["field1", "field2"]); + const pipeline = [ + { + $vectorSearch: { + index: "myIndex", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + filter: { + field1: "value", + field3: { $eq: 10 }, // field3 is not indexed + }, + }, + }, + ]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).toThrow(MongoDBError); + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).toThrow( + new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + "Vector search stage contains filter on fields that are not indexed by index myIndex - 
field3" + ) + ); + }); + + it("should throw MongoDBError with all unindexed fields listed", () => { + const searchIndexes = createMockSearchIndexes("myIndex", ["field1"]); + const pipeline = [ + { + $vectorSearch: { + index: "myIndex", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + filter: { + field1: "value", + field2: { $eq: 10 }, + field3: { $gt: 5 }, + }, + }, + }, + ]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).toThrow( + new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + "Vector search stage contains filter on fields that are not indexed by index myIndex - field2, field3" + ) + ); + }); + + it("should handle nested $and and $or operators", () => { + const searchIndexes = createMockSearchIndexes("myIndex", ["field1", "field2", "field3"]); + const pipeline = [ + { + $vectorSearch: { + index: "myIndex", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + filter: { + $or: [ + { field1: "value" }, + { + $and: [{ field2: { $eq: 10 } }, { field3: { $gt: 5 } }], + }, + ], + }, + }, + }, + ]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).not.toThrow(); + }); + + it("should throw when nested filter contains unindexed field", () => { + const searchIndexes = createMockSearchIndexes("myIndex", ["field1", "field2"]); + const pipeline = [ + { + $vectorSearch: { + index: "myIndex", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + filter: { + $or: [ + { field1: "value" }, + { + $and: [{ field2: { $eq: 10 } }, { field4: { $gt: 5 } }], // field4 not indexed + }, + ], + }, + }, + }, + ]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).toThrow( + new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + "Vector search stage contains 
filter on fields that are not indexed by index myIndex - field4" + ) + ); + }); + + it("should log warning when index is not found in searchIndexes", () => { + const searchIndexes = createMockSearchIndexes("myIndex", ["field1"]); + const pipeline = [ + { + $vectorSearch: { + index: "nonExistentIndex", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + filter: { + field1: "value", + }, + }, + }, + ]; + + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }); + + // eslint-disable-next-line @typescript-eslint/unbound-method + expect(mockLogger.warning).toHaveBeenCalledWith({ + id: LogId.toolValidationError, + context: "aggregate tool", + message: + "Could not assert if filter fields are indexed - No filter fields found for index nonExistentIndex", + }); + }); + + it("should handle multiple $vectorSearch stages in pipeline", () => { + const searchIndexes = [ + ...createMockSearchIndexes("index1", ["field1", "field2"]), + ...createMockSearchIndexes("index2", ["field3", "field4"]), + ]; + const pipeline = [ + { + $vectorSearch: { + index: "index1", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + filter: { + field1: "value", + }, + }, + }, + { $limit: 5 }, + { + $vectorSearch: { + index: "index2", + path: "embedding2", + queryVector: [4, 5, 6], + numCandidates: 50, + limit: 5, + filter: { + field3: "value2", + }, + }, + }, + ]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).not.toThrow(); + }); + + it("should throw on second $vectorSearch stage if it has unindexed field", () => { + const searchIndexes = [ + ...createMockSearchIndexes("index1", ["field1", "field2"]), + ...createMockSearchIndexes("index2", ["field3"]), + ]; + const pipeline = [ + { + $vectorSearch: { + index: "index1", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + filter: { + field1: 
"value", + }, + }, + }, + { + $vectorSearch: { + index: "index2", + path: "embedding2", + queryVector: [4, 5, 6], + numCandidates: 50, + limit: 5, + filter: { + field4: "value2", // field4 not indexed in index2 + }, + }, + }, + ]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).toThrow( + new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + "Vector search stage contains filter on fields that are not indexed by index index2 - field4" + ) + ); + }); + + it("should handle search index with no filter fields", () => { + const searchIndexes: VectorSearchIndex[] = [ + { + name: "myIndex", + latestDefinition: { + fields: [{ type: "vector" }], + }, + }, + ]; + const pipeline = [ + { + $vectorSearch: { + index: "myIndex", + path: "embedding", + queryVector: [1, 2, 3], + numCandidates: 100, + limit: 10, + filter: { + field1: "value", + }, + }, + }, + ]; + + expect(() => + assertVectorSearchFilterFieldsAreIndexed({ + searchIndexes, + pipeline, + logger: mockLogger, + }) + ).toThrow( + new MongoDBError( + ErrorCodes.AtlasVectorSearchInvalidQuery, + "Vector search stage contains filter on fields that are not indexed by index myIndex - field1" + ) + ); + }); +});