From 39aa3acb9a014f8a9f80fd540c3efcd8747d2140 Mon Sep 17 00:00:00 2001 From: Kevin Mas Ruiz Date: Mon, 22 Sep 2025 12:21:53 +0200 Subject: [PATCH 1/5] fix(schema): Use sample instead of find for schema sampling. --- src/tools/mongodb/metadata/collectionSchema.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/tools/mongodb/metadata/collectionSchema.ts b/src/tools/mongodb/metadata/collectionSchema.ts index fa6ea3c0d..2ba9d09a0 100644 --- a/src/tools/mongodb/metadata/collectionSchema.ts +++ b/src/tools/mongodb/metadata/collectionSchema.ts @@ -13,7 +13,9 @@ export class CollectionSchemaTool extends MongoDBToolBase { protected async execute({ database, collection }: ToolArgs): Promise { const provider = await this.ensureConnected(); - const documents = await provider.find(database, collection, {}, { limit: 5 }).toArray(); + const documents = await provider.aggregate(database, collection, [ + { $sample: { size: 50 } }, + ]).toArray(); const schema = await getSimplifiedSchema(documents); const fieldsCount = Object.entries(schema).length; From 8206acc5c1371a1bb43d8abe67f57f01f01316b0 Mon Sep 17 00:00:00 2001 From: Kevin Mas Ruiz Date: Mon, 22 Sep 2025 13:25:25 +0200 Subject: [PATCH 2/5] chore: use memory limits and support custom sample size --- .../mongodb/metadata/collectionSchema.ts | 31 ++++++++++++++----- .../mongodb/metadata/collectionSchema.test.ts | 16 +++++++++- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/tools/mongodb/metadata/collectionSchema.ts b/src/tools/mongodb/metadata/collectionSchema.ts index 2ba9d09a0..26a9be0be 100644 --- a/src/tools/mongodb/metadata/collectionSchema.ts +++ b/src/tools/mongodb/metadata/collectionSchema.ts @@ -1,21 +1,35 @@ import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; -import type { ToolArgs, OperationType } from "../../tool.js"; +import type { ToolArgs, OperationType, ToolExecutionContext } from "../../tool.js"; import { formatUntrustedData } from "../../tool.js"; import { getSimplifiedSchema } from "mongodb-schema"; +import z from "zod"; +import { ONE_MB } from "../../../helpers/constants.js"; +import { collectCursorUntilMaxBytesLimit } from "../../../helpers/collectCursorUntilMaxBytes.js"; export class CollectionSchemaTool extends MongoDBToolBase { public name = "collection-schema"; protected description = "Describe the schema for a collection"; - protected argsShape = DbOperationArgs; + protected argsShape = { + ...DbOperationArgs, + sampleSize: z.number().optional().default(50).describe("Number of documents to sample for schema inference"), + responseBytesLimit: z.number().optional().default(ONE_MB).describe(`The maximum number of bytes to return in the response. This value is capped by the server’s configured maxBytesPerQuery and cannot be exceeded.`), + }; public operationType: OperationType = "metadata"; - protected async execute({ database, collection }: ToolArgs): Promise { + protected async execute( + { database, collection, sampleSize, responseBytesLimit }: ToolArgs, + { signal }: ToolExecutionContext + ): Promise { const provider = await this.ensureConnected(); - const documents = await provider.aggregate(database, collection, [ - { $sample: { size: 50 } }, - ]).toArray(); + const cursor = provider.aggregate(database, collection, [{ $sample: { size: Math.min(sampleSize, this.config.maxDocumentsPerQuery) } }]); + const { cappedBy, documents } = await collectCursorUntilMaxBytesLimit({ + cursor, + configuredMaxBytesPerQuery: this.config.maxBytesPerQuery, + toolResponseBytesLimit: responseBytesLimit, + abortSignal: signal, + }); const schema = await getSimplifiedSchema(documents); const fieldsCount = Object.entries(schema).length; @@ -30,9 +44,12 @@ export class CollectionSchemaTool extends MongoDBToolBase { }; } + const header = `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`; + const cappedWarning = cappedBy !== undefined ? `\nThe schema was inferred from a subset of documents due to the response size limit. (${cappedBy})` : ""; + return { content: formatUntrustedData( - `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`, + `${header}${cappedWarning}`, JSON.stringify(schema) ), }; diff --git a/tests/integration/tools/mongodb/metadata/collectionSchema.test.ts b/tests/integration/tools/mongodb/metadata/collectionSchema.test.ts index 4130da1f8..0f2a97a0c 100644 --- a/tests/integration/tools/mongodb/metadata/collectionSchema.test.ts +++ b/tests/integration/tools/mongodb/metadata/collectionSchema.test.ts @@ -19,7 +19,21 @@ describeWithMongoDB("collectionSchema tool", (integration) => { integration, "collection-schema", "Describe the schema for a collection", - databaseCollectionParameters + [ + ...databaseCollectionParameters, + { + name: "sampleSize", + type: "number", + description: "Number of documents to sample for schema inference", + required: false, + }, + { + name: "responseBytesLimit", + type: "number", + description: `The maximum number of bytes to return in the response. This value is capped by the server’s configured maxBytesPerQuery and cannot be exceeded.`, + required: false, + } + ] ); validateThrowsForInvalidArguments(integration, "collection-schema", databaseCollectionInvalidArgs); From 6b81bd3b27a8ce6fc9e03837f04a54534a5dc20e Mon Sep 17 00:00:00 2001 From: Kevin Mas Ruiz Date: Mon, 22 Sep 2025 13:29:07 +0200 Subject: [PATCH 3/5] chore: fix build --- src/tools/mongodb/metadata/collectionSchema.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/mongodb/metadata/collectionSchema.ts b/src/tools/mongodb/metadata/collectionSchema.ts index 26a9be0be..a01cc0694 100644 --- a/src/tools/mongodb/metadata/collectionSchema.ts +++ b/src/tools/mongodb/metadata/collectionSchema.ts @@ -19,7 +19,7 @@ export class CollectionSchemaTool extends MongoDBToolBase { public operationType: OperationType = "metadata"; protected async execute( - { database, collection, sampleSize, responseBytesLimit }: ToolArgs, + { database, collection, sampleSize, responseBytesLimit }: ToolArgs, { signal }: ToolExecutionContext ): Promise { const provider = await this.ensureConnected(); From e0324c21d2d68c346b6443740a8bc610046a6bc7 Mon Sep 17 00:00:00 2001 From: Kevin Mas Ruiz Date: Mon, 22 Sep 2025 13:43:58 +0200 Subject: [PATCH 4/5] chore: use custom isObjectEmpty that is O(1) instead of O(N). Important for large schemas --- src/helpers/isObjectEmpty.ts | 15 ++++++++++++++ .../mongodb/metadata/collectionSchema.ts | 4 ++-- .../integration/common/isObjectEmpty.test.ts | 20 +++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 src/helpers/isObjectEmpty.ts create mode 100644 tests/integration/common/isObjectEmpty.test.ts diff --git a/src/helpers/isObjectEmpty.ts b/src/helpers/isObjectEmpty.ts new file mode 100644 index 000000000..7584c2f51 --- /dev/null +++ b/src/helpers/isObjectEmpty.ts @@ -0,0 +1,15 @@ +type EmptyObject = { [x: string]: never } | null | undefined; + +export function isObjectEmpty(value: object | null | undefined): value is EmptyObject { + if (!value) { + return true; + } + + for (const prop in value) { + if (Object.prototype.hasOwnProperty.call(value, prop)) { + return false; + } + } + + return true; +} diff --git a/src/tools/mongodb/metadata/collectionSchema.ts b/src/tools/mongodb/metadata/collectionSchema.ts index a01cc0694..0af61f783 100644 --- a/src/tools/mongodb/metadata/collectionSchema.ts +++ b/src/tools/mongodb/metadata/collectionSchema.ts @@ -6,6 +6,7 @@ import { getSimplifiedSchema } from "mongodb-schema"; import z from "zod"; import { ONE_MB } from "../../../helpers/constants.js"; import { collectCursorUntilMaxBytesLimit } from "../../../helpers/collectCursorUntilMaxBytes.js"; +import { isObjectEmpty } from "../../../helpers/isObjectEmpty.js"; export class CollectionSchemaTool extends MongoDBToolBase { public name = "collection-schema"; @@ -32,8 +33,7 @@ export class CollectionSchemaTool extends MongoDBToolBase { }); const schema = await getSimplifiedSchema(documents); - const fieldsCount = Object.entries(schema).length; - if (fieldsCount === 0) { + if (isObjectEmpty(schema)) { return { content: [ { diff --git a/tests/integration/common/isObjectEmpty.test.ts b/tests/integration/common/isObjectEmpty.test.ts new file mode 100644 index 000000000..35827034b --- /dev/null +++ b/tests/integration/common/isObjectEmpty.test.ts @@ -0,0 +1,20 @@ +import { isObjectEmpty } from "../../../src/helpers/isObjectEmpty.js"; +import { describe, expect, it } from "vitest"; + +describe("isObjectEmpty", () => { + it("returns true for null", () => { + expect(isObjectEmpty(null)).toBe(true); + }); + + it("returns true for undefined", () => { + expect(isObjectEmpty(undefined)).toBe(true); + }); + + it("returns true for empty object", () => { + expect(isObjectEmpty({})).toBe(true); + }); + + it("returns false for object with properties", () => { + expect(isObjectEmpty({ a: 1 })).toBe(false); + }); +}); From 8f31898169c1f93c5f78415199ce6baa391de362 Mon Sep 17 00:00:00 2001 From: Kevin Mas Ruiz Date: Mon, 22 Sep 2025 13:52:56 +0200 Subject: [PATCH 5/5] chore: Use a hardcoded constant for the maximum upper bounds instead of config.maxDocumentsPerQuery --- .../mongodb/metadata/collectionSchema.ts | 25 +++++++++---- .../integration/common/isObjectEmpty.test.ts | 2 +- .../mongodb/metadata/collectionSchema.test.ts | 35 ++++++++----------- 3 files changed, 34 insertions(+), 28 deletions(-) diff --git a/src/tools/mongodb/metadata/collectionSchema.ts b/src/tools/mongodb/metadata/collectionSchema.ts index 0af61f783..f03e9b9d1 100644 --- a/src/tools/mongodb/metadata/collectionSchema.ts +++ b/src/tools/mongodb/metadata/collectionSchema.ts @@ -8,13 +8,21 @@ import { ONE_MB } from "../../../helpers/constants.js"; import { collectCursorUntilMaxBytesLimit } from "../../../helpers/collectCursorUntilMaxBytes.js"; import { isObjectEmpty } from "../../../helpers/isObjectEmpty.js"; +const MAXIMUM_SAMPLE_SIZE_HARD_LIMIT = 50_000; + export class CollectionSchemaTool extends MongoDBToolBase { public name = "collection-schema"; protected description = "Describe the schema for a collection"; protected argsShape = { ...DbOperationArgs, sampleSize: z.number().optional().default(50).describe("Number of documents to sample for schema inference"), - responseBytesLimit: z.number().optional().default(ONE_MB).describe(`The maximum number of bytes to return in the response. This value is capped by the server’s configured maxBytesPerQuery and cannot be exceeded.`), + responseBytesLimit: z + .number() + .optional() + .default(ONE_MB) + .describe( + `The maximum number of bytes to return in the response. This value is capped by the server’s configured maxBytesPerQuery and cannot be exceeded.` + ), }; public operationType: OperationType = "metadata"; @@ -24,7 +32,9 @@ export class CollectionSchemaTool extends MongoDBToolBase { { signal }: ToolExecutionContext ): Promise { const provider = await this.ensureConnected(); - const cursor = provider.aggregate(database, collection, [{ $sample: { size: Math.min(sampleSize, this.config.maxDocumentsPerQuery) } }]); + const cursor = provider.aggregate(database, collection, [ + { $sample: { size: Math.min(sampleSize, MAXIMUM_SAMPLE_SIZE_HARD_LIMIT) } }, + ]); const { cappedBy, documents } = await collectCursorUntilMaxBytesLimit({ cursor, configuredMaxBytesPerQuery: this.config.maxBytesPerQuery, @@ -44,14 +54,15 @@ export class CollectionSchemaTool extends MongoDBToolBase { }; } + const fieldsCount = Object.keys(schema).length; const header = `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`; - const cappedWarning = cappedBy !== undefined ? `\nThe schema was inferred from a subset of documents due to the response size limit. (${cappedBy})` : ""; + const cappedWarning = + cappedBy !== undefined + ? `\nThe schema was inferred from a subset of documents due to the response size limit. (${cappedBy})` + : ""; return { - content: formatUntrustedData( - `${header}${cappedWarning}`, - JSON.stringify(schema) - ), + content: formatUntrustedData(`${header}${cappedWarning}`, JSON.stringify(schema)), }; } } diff --git a/tests/integration/common/isObjectEmpty.test.ts b/tests/integration/common/isObjectEmpty.test.ts index 35827034b..5c1b80571 100644 --- a/tests/integration/common/isObjectEmpty.test.ts +++ b/tests/integration/common/isObjectEmpty.test.ts @@ -5,7 +5,7 @@ describe("isObjectEmpty", () => { it("returns true for null", () => { expect(isObjectEmpty(null)).toBe(true); }); - + it("returns true for undefined", () => { expect(isObjectEmpty(undefined)).toBe(true); }); diff --git a/tests/integration/tools/mongodb/metadata/collectionSchema.test.ts b/tests/integration/tools/mongodb/metadata/collectionSchema.test.ts index 0f2a97a0c..47f117b28 100644 --- a/tests/integration/tools/mongodb/metadata/collectionSchema.test.ts +++ b/tests/integration/tools/mongodb/metadata/collectionSchema.test.ts @@ -15,26 +15,21 @@ import type { SimplifiedSchema } from "mongodb-schema"; import { describe, expect, it } from "vitest"; describeWithMongoDB("collectionSchema tool", (integration) => { - validateToolMetadata( - integration, - "collection-schema", - "Describe the schema for a collection", - [ - ...databaseCollectionParameters, - { - name: "sampleSize", - type: "number", - description: "Number of documents to sample for schema inference", - required: false, - }, - { - name: "responseBytesLimit", - type: "number", - description: `The maximum number of bytes to return in the response. This value is capped by the server’s configured maxBytesPerQuery and cannot be exceeded.`, - required: false, - } - ] - ); + validateToolMetadata(integration, "collection-schema", "Describe the schema for a collection", [ + ...databaseCollectionParameters, + { + name: "sampleSize", + type: "number", + description: "Number of documents to sample for schema inference", + required: false, + }, + { + name: "responseBytesLimit", + type: "number", + description: `The maximum number of bytes to return in the response. This value is capped by the server’s configured maxBytesPerQuery and cannot be exceeded.`, + required: false, + }, + ]); validateThrowsForInvalidArguments(integration, "collection-schema", databaseCollectionInvalidArgs);