From 45abd9fc5130eea30b1c8dc2f288decf2269e23e Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Sat, 28 Jun 2025 16:14:03 +0200 Subject: [PATCH 01/47] chore: LangChain based accuracy tests --- package-lock.json | 346 +++++++++++++++++- tests/accuracy/list-databases.test.ts | 26 ++ tests/accuracy/sdk/accuracy-scorers.ts | 125 +++++++ tests/accuracy/sdk/describe-accuracy-tests.ts | 51 +++ tests/accuracy/sdk/models.ts | 62 ++++ tests/accuracy/sdk/test-tools.ts | 153 ++++++++ tests/accuracy/sdk/tool-calling-agent.ts | 36 ++ 7 files changed, 795 insertions(+), 4 deletions(-) create mode 100644 tests/accuracy/list-databases.test.ts create mode 100644 tests/accuracy/sdk/accuracy-scorers.ts create mode 100644 tests/accuracy/sdk/describe-accuracy-tests.ts create mode 100644 tests/accuracy/sdk/models.ts create mode 100644 tests/accuracy/sdk/test-tools.ts create mode 100644 tests/accuracy/sdk/tool-calling-agent.ts diff --git a/package-lock.json b/package-lock.json index 29132ba3..6e67a1f9 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1267,6 +1267,13 @@ "dev": true, "license": "MIT" }, + "node_modules/@cfworker/json-schema": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/@cfworker/json-schema/-/json-schema-4.1.1.tgz", + "integrity": "sha512-gAmrUZSGtKc3AiBL71iNWxDsyUC5uMaKKGdvzYsBoTW/xi42JQHl7eKV2OYzCUqvc+D2RCcf7EXY2iCyFIk6og==", + "dev": true, + "license": "MIT" + }, "node_modules/@cspotcode/source-map-support": { "version": "0.8.1", "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", @@ -2043,6 +2050,16 @@ "dev": true, "license": "MIT" }, + "node_modules/@google/generative-ai": { + "version": "0.24.1", + "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.24.1.tgz", + "integrity": "sha512-MqO+MLfM6kjxcKoy0p1wRzG3b4ZZXtPI+z2IE26UogS2Cm/XHO+7gGRBh6gcJsOiIVoH93UwKvW4HdgiOZCy9Q==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=18.0.0" + } + }, 
"node_modules/@hapi/boom": { "version": "10.0.1", "resolved": "https://registry.npmjs.org/@hapi/boom/-/boom-10.0.1.tgz", @@ -2929,6 +2946,152 @@ "jsep": "^0.4.0||^1.0.0" } }, + "node_modules/@langchain/core": { + "version": "0.3.61", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.3.61.tgz", + "integrity": "sha512-4O7fw5SXNSE+uBnathLQrhm3t+7dZGagt/5kt37A+pXw0AkudxEBvveg73sSnpBd9SIz3/Vc7F4k8rCKXGbEDA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@cfworker/json-schema": "^4.0.2", + "ansi-styles": "^5.0.0", + "camelcase": "6", + "decamelize": "1.2.0", + "js-tiktoken": "^1.0.12", + "langsmith": "^0.3.33", + "mustache": "^4.2.0", + "p-queue": "^6.6.2", + "p-retry": "4", + "uuid": "^10.0.0", + "zod": "^3.25.32", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/core/node_modules/ansi-styles": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", + "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/@langchain/core/node_modules/camelcase": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", + "integrity": "sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@langchain/core/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "dev": true, + "funding": [ + 
"https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "license": "MIT", + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@langchain/google-genai": { + "version": "0.2.14", + "resolved": "https://registry.npmjs.org/@langchain/google-genai/-/google-genai-0.2.14.tgz", + "integrity": "sha512-gKe/T2LNh8wSSMJOaFmYd8cwQnDSXKtVtC6a7CFoq5nWuh0bKzhItM/7bue1aMN8mlKfB2G1HCwxhaZoSpS/DA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@google/generative-ai": "^0.24.0", + "uuid": "^11.1.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.3.58 <0.4.0" + } + }, + "node_modules/@langchain/ollama": { + "version": "0.2.3", + "resolved": "https://registry.npmjs.org/@langchain/ollama/-/ollama-0.2.3.tgz", + "integrity": "sha512-1Obe45jgQspqLMBVlayQbGdywFmri8DgmGRdzNu0li56cG5RReYlRCFVDZBRMMvF9JhsP5eXRyfyivtKfITHWQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "ollama": "^0.5.12", + "uuid": "^10.0.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.3.58 <0.4.0" + } + }, + "node_modules/@langchain/ollama/node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "dev": true, + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "license": "MIT", + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/@langchain/openai": { + "version": "0.5.16", + "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.5.16.tgz", + "integrity": "sha512-TqzPE3PM0bMkQi53qs8vCFkwaEp3VgwGw+s1e8Nas5ICCZZtc2XqcDPz4hf2gpo1k7/AZd6HuPlAsDy6wye9Qw==", + "dev": true, + "license": "MIT", + "dependencies": { + "js-tiktoken": "^1.0.12", + "openai": "^5.3.0", + "zod": "^3.25.32" + }, + "engines": { + "node": ">=18" + }, + 
"peerDependencies": { + "@langchain/core": ">=0.3.58 <0.4.0" + } + }, + "node_modules/@langchain/textsplitters": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/@langchain/textsplitters/-/textsplitters-0.1.0.tgz", + "integrity": "sha512-djI4uw9rlkAb5iMhtLED+xJebDdAG935AdP4eRTB02R7OB/act55Bj9wsskhZsvuyQRpO4O1wQOp85s6T6GWmw==", + "dev": true, + "license": "MIT", + "dependencies": { + "js-tiktoken": "^1.0.12" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@langchain/core": ">=0.2.21 <0.4.0" + } + }, "node_modules/@modelcontextprotocol/inspector": { "version": "0.16.0", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/inspector/-/inspector-0.16.0.tgz", @@ -5424,6 +5587,19 @@ "node": ">=18.0.0" } }, + "node_modules/@smithy/middleware-retry/node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "license": "MIT", + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@smithy/middleware-serde": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/@smithy/middleware-serde/-/middleware-serde-4.0.3.tgz", @@ -6016,6 +6192,13 @@ "undici-types": "~7.8.0" } }, + "node_modules/@types/retry": { + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.0.tgz", + "integrity": "sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/simple-oauth2": { "version": "5.0.7", "resolved": "https://registry.npmjs.org/@types/simple-oauth2/-/simple-oauth2-5.0.7.tgz", @@ -6045,6 +6228,13 @@ "license": "MIT", "optional": true }, + "node_modules/@types/uuid": { + "version": "10.0.0", + "resolved": 
"https://registry.npmjs.org/@types/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/webidl-conversions": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.3.tgz", @@ -7767,6 +7957,16 @@ "node": ">=12" } }, + "node_modules/console-table-printer": { + "version": "2.14.6", + "resolved": "https://registry.npmjs.org/console-table-printer/-/console-table-printer-2.14.6.tgz", + "integrity": "sha512-MCBl5HNVaFuuHW6FGbL/4fB7N/ormCy+tQ+sxTrF6QtSbSNETvPuOVbkJBhzDgYhvjWGrTma4eYJa37ZuoQsPw==", + "dev": true, + "license": "MIT", + "dependencies": { + "simple-wcswidth": "^1.0.1" + } + }, "node_modules/content-disposition": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.0.0.tgz", @@ -7935,6 +8135,16 @@ } } }, + "node_modules/decamelize": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", + "integrity": "sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/decko": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/decko/-/decko-1.2.0.tgz", @@ -11730,6 +11940,16 @@ "node": ">=0.10.0" } }, + "node_modules/js-tiktoken": { + "version": "1.0.20", + "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.20.tgz", + "integrity": "sha512-Xlaqhhs8VfCd6Sh7a1cFkZHQbYTLCwVJJWiHVxBYzLPxW0XsoxBy1hitmjkdIjD3Aon5BXLHFwU5O8WUx6HH+A==", + "dev": true, + "license": "MIT", + "dependencies": { + "base64-js": "^1.5.1" + } + }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -12653,6 +12873,16 @@ "integrity": 
"sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "license": "MIT" }, + "node_modules/mustache": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/mustache/-/mustache-4.2.0.tgz", + "integrity": "sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==", + "dev": true, + "license": "MIT", + "bin": { + "mustache": "bin/mustache" + } + }, "node_modules/nan": { "version": "2.22.2", "resolved": "https://registry.npmjs.org/nan/-/nan-2.22.2.tgz", @@ -13011,6 +13241,16 @@ "node": "^10.13.0 || >=12.0.0" } }, + "node_modules/ollama": { + "version": "0.5.16", + "resolved": "https://registry.npmjs.org/ollama/-/ollama-0.5.16.tgz", + "integrity": "sha512-OEbxxOIUZtdZgOaTPAULo051F5y+Z1vosxEYOoABPnQKeW7i4O8tJNlxCB+xioyoorVqgjkdj+TA1f1Hy2ug/w==", + "dev": true, + "license": "MIT", + "dependencies": { + "whatwg-fetch": "^3.6.20" + } + }, "node_modules/on-finished": { "version": "2.4.1", "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz", @@ -13065,6 +13305,28 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/openai": { + "version": "5.8.2", + "resolved": "https://registry.npmjs.org/openai/-/openai-5.8.2.tgz", + "integrity": "sha512-8C+nzoHYgyYOXhHGN6r0fcb4SznuEn1R7YZMvlqDbnCuE0FM2mm3T1HiYW6WIcMS/F1Of2up/cSPjLPaWt0X9Q==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, "node_modules/openapi-fetch": { "version": "0.14.0", "resolved": "https://registry.npmjs.org/openapi-fetch/-/openapi-fetch-0.14.0.tgz", @@ -13266,6 +13528,16 @@ "dev": true, "license": "MIT" }, + "node_modules/p-finally": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", + "integrity": 
"sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/p-limit": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", @@ -13298,6 +13570,57 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/p-queue": { + "version": "6.6.2", + "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-6.6.2.tgz", + "integrity": "sha512-RwFpb72c/BhQLEXIZ5K2e+AhgNVmIejGlTgiB9MzZ0e93GRvqZ7uSi0dvRF7/XIXDeNkra2fNHBxTyPDGySpjQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "eventemitter3": "^4.0.4", + "p-timeout": "^3.2.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-queue/node_modules/eventemitter3": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", + "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==", + "dev": true, + "license": "MIT" + }, + "node_modules/p-retry": { + "version": "4.6.2", + "resolved": "https://registry.npmjs.org/p-retry/-/p-retry-4.6.2.tgz", + "integrity": "sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/retry": "0.12.0", + "retry": "^0.13.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/p-timeout": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-3.2.0.tgz", + "integrity": "sha512-rhIwUycgwwKcP9yTOOFK/AKsAopjjCakVqLHePO3CC6Mir1Z99xT+R63jZxAT5lFZLa2inS5h+ZS2GvR99/FBg==", + "dev": true, + "license": "MIT", + "dependencies": { + "p-finally": "^1.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/p-try": { "version": "2.2.0", "resolved": 
"https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", @@ -14842,6 +15165,13 @@ "joi": "^17.6.4" } }, + "node_modules/simple-wcswidth": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/simple-wcswidth/-/simple-wcswidth-1.1.2.tgz", + "integrity": "sha512-j7piyCjAeTDSjzTSQ7DokZtMNwNlEAyxqSZeCS+CXH7fJ4jx3FuJ/mTW3mE+6JLs4VJBbcll0Kjn+KXI5t21Iw==", + "dev": true, + "license": "MIT" + }, "node_modules/simple-websocket": { "version": "9.1.0", "resolved": "https://registry.npmjs.org/simple-websocket/-/simple-websocket-9.1.0.tgz", @@ -16163,16 +16493,17 @@ } }, "node_modules/uuid": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", - "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-11.1.0.tgz", + "integrity": "sha512-0/A9rDy9P7cJ+8w1c9WD9V//9Wj15Ce2MPz8Ri6032usz+NfePxx5AcN3bN+r6ZL6jEo066/yNYB3tn4pQEx+A==", + "dev": true, "funding": [ "https://github.com/sponsors/broofa", "https://github.com/sponsors/ctavan" ], "license": "MIT", "bin": { - "uuid": "dist/bin/uuid" + "uuid": "dist/esm/bin/uuid" } }, "node_modules/v8-compile-cache-lib": { @@ -16234,6 +16565,13 @@ "node": ">=12" } }, + "node_modules/whatwg-fetch": { + "version": "3.6.20", + "resolved": "https://registry.npmjs.org/whatwg-fetch/-/whatwg-fetch-3.6.20.tgz", + "integrity": "sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==", + "dev": true, + "license": "MIT" + }, "node_modules/whatwg-url": { "version": "14.2.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts new file mode 100644 index 00000000..ae3f6c7d --- /dev/null +++ b/tests/accuracy/list-databases.test.ts @@ -0,0 +1,26 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { 
getAvailableModels } from "./sdk/models.js"; + +describeAccuracyTests("list-databases", getAvailableModels(), [ + { + prompt: "Assume that you're already connected. How many collections are there in sample_mflix database", + mockedTools: { + "list-collections": function listCollections() { + return { + content: [ + { + type: "text", + text: "Name: coll1", + }, + ], + }; + }, + }, + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { database: "sample_mflix" }, + }, + ], + }, +]); diff --git a/tests/accuracy/sdk/accuracy-scorers.ts b/tests/accuracy/sdk/accuracy-scorers.ts new file mode 100644 index 00000000..bf92eead --- /dev/null +++ b/tests/accuracy/sdk/accuracy-scorers.ts @@ -0,0 +1,125 @@ +export type ToolCall = { + toolCallId: string; + toolName: string; + parameters: unknown; +}; +export type ExpectedToolCall = Omit; + +export function toolCallingAccuracyScorer(expectedToolCalls: ExpectedToolCall[], actualToolCalls: ToolCall[]): number { + if (actualToolCalls.length < expectedToolCalls.length) { + return 0; + } + + const possibleScore = actualToolCalls.length > expectedToolCalls.length ? 
0.75 : 1; + const checkedToolCallIds = new Set(); + for (const expectedToolCall of expectedToolCalls) { + const matchingActualToolCall = actualToolCalls.find( + (actualToolCall) => + actualToolCall.toolName === expectedToolCall.toolName && + !checkedToolCallIds.has(actualToolCall.toolCallId) + ); + + if (!matchingActualToolCall) { + return 0; + } + + checkedToolCallIds.add(matchingActualToolCall.toolCallId); + } + + return possibleScore; +} + +export function parameterMatchingAccuracyScorer( + expectedToolCalls: ExpectedToolCall[], + actualToolCalls: ToolCall[] +): number { + if (expectedToolCalls.length === 0) { + return 1; + } + + const toolCallScores: number[] = []; + const checkedToolCallIds = new Set(); + + for (const expectedToolCall of expectedToolCalls) { + const matchingActualToolCall = actualToolCalls.find( + (actualToolCall) => + actualToolCall.toolName === expectedToolCall.toolName && + !checkedToolCallIds.has(actualToolCall.toolCallId) + ); + + if (!matchingActualToolCall) { + toolCallScores.push(0); + continue; + } + + checkedToolCallIds.add(matchingActualToolCall.toolCallId); + const score = compareParams(expectedToolCall.parameters, matchingActualToolCall.parameters); + toolCallScores.push(score); + } + + const totalScore = toolCallScores.reduce((sum, score) => sum + score, 0); + return totalScore / toolCallScores.length; +} + +/** + * Recursively compares expected and actual parameters and returns a score. + * - 1: Perfect match. + * - 0.75: All expected parameters are present and match, but there are extra actual parameters. + * - 0: Missing parameters or mismatched values. + */ +function compareParams(expected: unknown, actual: unknown): number { + if (expected === null || expected === undefined) { + return actual === null || actual === undefined ? 
1 : 0; + } + if (actual === null || actual === undefined) { + return 0; + } + + if (Array.isArray(expected)) { + if (!Array.isArray(actual) || actual.length < expected.length) { + return 0; + } + let minScore = 1; + for (let i = 0; i < expected.length; i++) { + minScore = Math.min(minScore, compareParams(expected[i], actual[i])); + } + if (minScore === 0) { + return 0; + } + if (actual.length > expected.length) { + minScore = Math.min(minScore, 0.75); + } + return minScore; + } + + if (typeof expected === "object") { + if (typeof actual !== "object" || Array.isArray(actual)) { + return 0; + } + const expectedKeys = Object.keys(expected as Record); + const actualKeys = Object.keys(actual as Record); + + let minScore = 1; + for (const key of expectedKeys) { + if (!Object.prototype.hasOwnProperty.call(actual, key)) { + return 0; + } + minScore = Math.min( + minScore, + compareParams((expected as Record)[key], (actual as Record)[key]) + ); + } + + if (minScore === 0) { + return 0; + } + + if (actualKeys.length > expectedKeys.length) { + minScore = Math.min(minScore, 0.75); + } + return minScore; + } + + // eslint-disable-next-line eqeqeq + return expected == actual ? 
1 : 0; +} diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts new file mode 100644 index 00000000..0ec4bb64 --- /dev/null +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -0,0 +1,51 @@ +import { AgentExecutor } from "langchain/agents"; +import { Tool } from "@modelcontextprotocol/sdk/types.js"; +import { discoverMongoDBTools, TestTools, ToolResultGenerators } from "./test-tools.js"; +import { TestableModels } from "./models.js"; +import { getToolCallingAgent } from "./tool-calling-agent.js"; +import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js"; + +interface AccuracyTestConfig { + prompt: string; + expectedToolCalls: ExpectedToolCall[]; + mockedTools: ToolResultGenerators; +} + +export function describeAccuracyTests( + suiteName: string, + models: TestableModels, + accuracyTestConfigs: AccuracyTestConfig[] +) { + const eachModel = describe.each(models); + const eachTest = it.each(accuracyTestConfigs); + + eachModel(`$modelName - ${suiteName}`, function (model) { + let mcpTools: Tool[]; + let testTools: TestTools; + let agent: AgentExecutor; + + beforeAll(async () => { + mcpTools = await discoverMongoDBTools(); + }); + + beforeEach(() => { + testTools = new TestTools(mcpTools); + const transformToolResult = model.transformToolResult.bind(model); + agent = getToolCallingAgent(model, testTools.langChainTools(transformToolResult)); + }); + + eachTest("$prompt", async function (testConfig) { + testTools.mockTools(testConfig.mockedTools); + const conversation = await agent.invoke({ input: testConfig.prompt }); + console.log("conversation", conversation); + const toolCalls = testTools.getToolCalls(); + console.log("?????? toolCalls", toolCalls); + console.log("???? 
expected", testConfig.expectedToolCalls); + const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); + const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); + + expect(toolCallingAccuracy).not.toEqual(0); + expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); + }); + }); +} diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts new file mode 100644 index 00000000..d370633f --- /dev/null +++ b/tests/accuracy/sdk/models.ts @@ -0,0 +1,62 @@ +import { BaseChatModel } from "@langchain/core/language_models/chat_models"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { ChatOllama } from "@langchain/ollama"; +import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; + +type ToolResultForOllama = string; +export type AcceptableToolResponse = CallToolResult | ToolResultForOllama; + +export interface Model { + isAvailable(): boolean; + getLangChainModel(): M; + transformToolResult(callToolResult: CallToolResult): T; +} + +export class GeminiModel implements Model { + constructor(readonly modelName: string) {} + + isAvailable(): boolean { + return !!process.env.MDB_GEMINI_API_KEY; + } + + getLangChainModel(): ChatGoogleGenerativeAI { + return new ChatGoogleGenerativeAI({ + model: this.modelName, + apiKey: process.env.MDB_GEMINI_API_KEY, + }); + } + + transformToolResult(callToolResult: CallToolResult) { + return callToolResult; + } +} + +export class OllamaModel implements Model { + constructor(readonly modelName: string) {} + + isAvailable(): boolean { + return !!process.env.MDB_GEMINI_API_KEY; + } + + getLangChainModel(): ChatOllama { + return new ChatOllama({ + model: this.modelName, + }); + } + + transformToolResult(callToolResult: CallToolResult): ToolResultForOllama { + return JSON.stringify(callToolResult); + } +} + +const ALL_TESTABLE_MODELS = [ + // new GeminiModel("gemini-1.5-flash"), + // new 
GeminiModel("gemini-2.0-flash"), + new OllamaModel("qwen3:latest"), +]; + +export type TestableModels = ReturnType; + +export function getAvailableModels() { + return ALL_TESTABLE_MODELS.filter((model) => model.isAvailable()); +} diff --git a/tests/accuracy/sdk/test-tools.ts b/tests/accuracy/sdk/test-tools.ts new file mode 100644 index 00000000..82719454 --- /dev/null +++ b/tests/accuracy/sdk/test-tools.ts @@ -0,0 +1,153 @@ +import { jest } from "@jest/globals"; +import { v4 as uuid } from "uuid"; +import { DynamicTool, tool as langChainTool } from "@langchain/core/tools"; +import { Client } from "@modelcontextprotocol/sdk/client/index.js"; +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { CallToolResult, Tool } from "@modelcontextprotocol/sdk/types.js"; + +import { InMemoryTransport } from "../../integration/inMemoryTransport.js"; +import { defaultTestConfig } from "../../integration/helpers.js"; +import { Session } from "../../../src/session.js"; +import { Telemetry } from "../../../src/telemetry/telemetry.js"; +import { Server } from "../../../src/server.js"; +import { AcceptableToolResponse } from "./models.js"; +import { ToolCall } from "./accuracy-scorers.js"; + +type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult; +type MockedToolResultGeneratorFn = jest.MockedFunction; +type MockedTools = Record; +export type ToolResultGenerators = Record; +export type LangChainTool = DynamicTool; +export type ToolResultTransformer = (toolResult: CallToolResult) => T; + +export class TestTools { + private mockedTools: MockedTools = {}; + private recordedToolCalls: ToolCall[] = []; + + constructor(private readonly mcpTools: Tool[]) { + for (const mcpTool of mcpTools) { + this.mockedTools[mcpTool.name] = jest.fn().mockReturnValue({ + content: [ + { + type: "text", + text: `Mock implementation for tool - ${mcpTool.name} not present`, + }, + ], + isError: true, + }); + } + } + + getToolCalls() { + return 
this.recordedToolCalls; + } + + mockTools(toolResultGenerators: ToolResultGenerators) { + for (const toolName in toolResultGenerators) { + const toolResultGeneratorFn = toolResultGenerators[toolName]; + if (!this.mockedTools[toolName]) { + throw new Error(`Attempted to mock unrecognized tool - ${toolName}`); + } + + if (!toolResultGeneratorFn) { + // Are you happy TS? + continue; + } + this.mockedTools[toolName] = jest.fn(toolResultGeneratorFn); + } + } + + langChainTools( + transformToolResult: ToolResultTransformer + ): LangChainTool[] { + return this.mcpTools.map((mcpTool) => { + return langChainTool((...args) => { + console.log("????? args", args); + const [parameters, { runName, runId }] = args; + const toolCallId = typeof runId !== "undefined" ? `${runId}` : uuid(); + return this.langChainToolResultGenerator(`${runName}`, parameters, toolCallId, transformToolResult); + }, mcpTool); + }); + } + + private langChainToolResultGenerator( + tool: string, + parameters: unknown, + toolCallId: string, + transformToolResult: ToolResultTransformer + ): T { + this.recordedToolCalls.push({ + toolCallId: toolCallId, + toolName: tool, + parameters, + }); + const mockedToolResultGenerator = this.mockedTools[tool]; + if (!mockedToolResultGenerator) { + // log as well + return transformToolResult({ + content: [ + { + type: "text", + text: `Could not resolve tool generator for ${tool}`, + }, + ], + isError: true, + }); + } + + return transformToolResult(mockedToolResultGenerator(parameters)); + } +} + +export async function discoverMongoDBTools(): Promise { + let mcpClient: Client | undefined; + let mcpServer: Server | undefined; + try { + const serverTransport = new InMemoryTransport(); + const clientTransport = new InMemoryTransport(); + + await serverTransport.start(); + await clientTransport.start(); + + void serverTransport.output.pipeTo(clientTransport.input); + void clientTransport.output.pipeTo(serverTransport.input); + + const session = new Session({ + apiBaseUrl: 
defaultTestConfig.apiBaseUrl, + }); + + const telemetry = Telemetry.create(session, defaultTestConfig); + + mcpClient = new Client( + { + name: "tool-discovery-client", + version: "0.0.0", + }, + { + capabilities: {}, + } + ); + + mcpServer = new Server({ + session, + userConfig: defaultTestConfig, + telemetry, + mcpServer: new McpServer({ + name: "test-server", + version: "5.2.3", + }), + }); + + await mcpServer.connect(serverTransport); + await mcpClient.connect(clientTransport); + + return (await mcpClient.listTools()).tools; + } catch (error: unknown) { + console.error("Unexpected error occured", error); + return []; + } finally { + await mcpClient?.close(); + await mcpServer?.session?.close(); + await mcpServer?.close(); + } +} diff --git a/tests/accuracy/sdk/tool-calling-agent.ts b/tests/accuracy/sdk/tool-calling-agent.ts new file mode 100644 index 00000000..b9adedf5 --- /dev/null +++ b/tests/accuracy/sdk/tool-calling-agent.ts @@ -0,0 +1,36 @@ +import { ChatPromptTemplate } from "@langchain/core/prompts"; +import { createToolCallingAgent, AgentExecutor } from "langchain/agents"; + +import { LangChainTool } from "./test-tools.js"; +import { AcceptableToolResponse, Model } from "./models.js"; +import { BaseChatModel } from "@langchain/core/language_models/chat_models"; + +const prompt = ChatPromptTemplate.fromMessages([ + [ + "system", + [ + 'The keywords "MUST", "MUST NOT", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119 (https://www.ietf.org/rfc/rfc2119.txt)', + "You are an expect AI assistant with access to a set of tools for MongoDB database operations.", + "You MUST use the most relevant tool to answer the user's request", + "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", + "If a task requires multiple steps, you MUST call the necessary tools in sequence", + 'If you do not know the answer or the 
request cannot be fulfilled, you MUST reply with "I don\'t know"', + ].join("\n"), + ], + ["human", "{input}"], + ["placeholder", "{agent_scratchpad}"], +]); + +export function getToolCallingAgent( + model: Model, + tools: LangChainTool[] +) { + const llm = model.getLangChainModel(); + const agent = createToolCallingAgent({ + llm, + tools, + prompt, + }); + const agentExecutor = new AgentExecutor({ agent, tools }); + return agentExecutor; +} From af67d6c377c1cccb4882bbd6a4bd17ed865477d5 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 30 Jun 2025 11:06:38 +0200 Subject: [PATCH 02/47] chore: use vercel AI SDK instead of langchain LangChain's ToolCalling agent was not providing a structured tool call response and different model providers were providing entirely different tool calls for the same tool definition which was too turbulent for us to have any accuracy baseline at all. Vercel's AI SDK pushes us forward on that problem and the tool call responses so far have always been well structured. This commit replaces LangChain based implementation with Vercel's AI SDK based implementation. 
--- package-lock.json | 594 ++++++++---------- tests/accuracy/sdk/agent.ts | 38 ++ tests/accuracy/sdk/describe-accuracy-tests.ts | 14 +- tests/accuracy/sdk/models.ts | 46 +- tests/accuracy/sdk/test-tools.ts | 106 ++-- tests/accuracy/sdk/tool-calling-agent.ts | 36 -- 6 files changed, 381 insertions(+), 453 deletions(-) create mode 100644 tests/accuracy/sdk/agent.ts delete mode 100644 tests/accuracy/sdk/tool-calling-agent.ts diff --git a/package-lock.json b/package-lock.json index 6e67a1f9..10b8977c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -60,6 +60,83 @@ "node": ">=20.10.0" } }, + "@himanshusinghs/ai-sdk-google": { + "extraneous": true + }, + "node_modules/@ai-sdk/provider": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.3.tgz", + "integrity": "sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/provider-utils": { + "version": "2.2.8", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.8.tgz", + "integrity": "sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, + "node_modules/@ai-sdk/react": { + "version": "1.2.12", + "resolved": "https://registry.npmjs.org/@ai-sdk/react/-/react-1.2.12.tgz", + "integrity": "sha512-jK1IZZ22evPZoQW3vlkZ7wvjYGYF+tRBKXtrcolduIkQ/m/sOAVcVeVDUDvh1T91xCnWCdUGCPZg2avZ90mv3g==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider-utils": "2.2.8", + "@ai-sdk/ui-utils": "1.2.11", + "swr": "^2.2.5", + "throttleit": "2.1.0" + }, + 
"engines": { + "node": ">=18" + }, + "peerDependencies": { + "react": "^18 || ^19 || ^19.0.0-rc", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, + "node_modules/@ai-sdk/ui-utils": { + "version": "1.2.11", + "resolved": "https://registry.npmjs.org/@ai-sdk/ui-utils/-/ui-utils-1.2.11.tgz", + "integrity": "sha512-3zcwCc8ezzFlwp3ZD15wAPjf2Au4s3vAbKsXQVyhxODHcmu0iyPO2Eua6D/vicq/AUm/BAo60r97O6HU+EI0+w==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8", + "zod-to-json-schema": "^3.24.1" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@ampproject/remapping": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz", @@ -1267,13 +1344,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@cfworker/json-schema": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/@cfworker/json-schema/-/json-schema-4.1.1.tgz", - "integrity": "sha512-gAmrUZSGtKc3AiBL71iNWxDsyUC5uMaKKGdvzYsBoTW/xi42JQHl7eKV2OYzCUqvc+D2RCcf7EXY2iCyFIk6og==", - "dev": true, - "license": "MIT" - }, "node_modules/@cspotcode/source-map-support": { "version": "0.8.1", "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", @@ -2050,16 +2120,6 @@ "dev": true, "license": "MIT" }, - "node_modules/@google/generative-ai": { - "version": "0.24.1", - "resolved": "https://registry.npmjs.org/@google/generative-ai/-/generative-ai-0.24.1.tgz", - "integrity": "sha512-MqO+MLfM6kjxcKoy0p1wRzG3b4ZZXtPI+z2IE26UogS2Cm/XHO+7gGRBh6gcJsOiIVoH93UwKvW4HdgiOZCy9Q==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=18.0.0" - } - }, "node_modules/@hapi/boom": { "version": "10.0.1", "resolved": "https://registry.npmjs.org/@hapi/boom/-/boom-10.0.1.tgz", @@ -2107,6 +2167,54 @@ "@hapi/hoek": "^11.0.2" } }, + 
"node_modules/@himanshusinghs/google": { + "version": "1.2.11", + "resolved": "https://registry.npmjs.org/@himanshusinghs/google/-/google-1.2.11.tgz", + "integrity": "sha512-SKTFxwN9PpUHVrppFod8sF1jqys5azzsgcBVrSbc7VaazmVEnBxHQlv5/yfeZFjD3ly5Mw+AJdFfC0bxwdWBNg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.2", + "@ai-sdk/provider-utils": "2.2.6" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@himanshusinghs/google/node_modules/@ai-sdk/provider": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.2.tgz", + "integrity": "sha512-ITdgNilJZwLKR7X5TnUr1BsQW6UTX5yFp0h66Nfx8XjBYkWD9W3yugr50GOz3CnE9m/U/Cd5OyEbTMI0rgi6ZQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@himanshusinghs/google/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.6", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.6.tgz", + "integrity": "sha512-sUlZ7Gnq84DCGWMQRIK8XVbkzIBnvPR1diV4v6JwPgpn5armnLI/j+rqn62MpLrU5ZCQZlDKl/Lw6ed3ulYqaA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.2", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@humanfs/core": { "version": "0.19.1", "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz", @@ -2946,152 +3054,6 @@ "jsep": "^0.4.0||^1.0.0" } }, - "node_modules/@langchain/core": { - "version": "0.3.61", - "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.3.61.tgz", - "integrity": "sha512-4O7fw5SXNSE+uBnathLQrhm3t+7dZGagt/5kt37A+pXw0AkudxEBvveg73sSnpBd9SIz3/Vc7F4k8rCKXGbEDA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@cfworker/json-schema": "^4.0.2", - 
"ansi-styles": "^5.0.0", - "camelcase": "6", - "decamelize": "1.2.0", - "js-tiktoken": "^1.0.12", - "langsmith": "^0.3.33", - "mustache": "^4.2.0", - "p-queue": "^6.6.2", - "p-retry": "4", - "uuid": "^10.0.0", - "zod": "^3.25.32", - "zod-to-json-schema": "^3.22.3" - }, - "engines": { - "node": ">=18" - } - }, - "node_modules/@langchain/core/node_modules/ansi-styles": { - "version": "5.2.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", - "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" - } - }, - "node_modules/@langchain/core/node_modules/camelcase": { - "version": "6.3.0", - "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", - "integrity": "sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/@langchain/core/node_modules/uuid": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", - "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", - "dev": true, - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "license": "MIT", - "bin": { - "uuid": "dist/bin/uuid" - } - }, - "node_modules/@langchain/google-genai": { - "version": "0.2.14", - "resolved": "https://registry.npmjs.org/@langchain/google-genai/-/google-genai-0.2.14.tgz", - "integrity": "sha512-gKe/T2LNh8wSSMJOaFmYd8cwQnDSXKtVtC6a7CFoq5nWuh0bKzhItM/7bue1aMN8mlKfB2G1HCwxhaZoSpS/DA==", - "dev": true, - "license": "MIT", - "dependencies": { - "@google/generative-ai": "^0.24.0", - "uuid": "^11.1.0" - }, - 
"engines": { - "node": ">=18" - }, - "peerDependencies": { - "@langchain/core": ">=0.3.58 <0.4.0" - } - }, - "node_modules/@langchain/ollama": { - "version": "0.2.3", - "resolved": "https://registry.npmjs.org/@langchain/ollama/-/ollama-0.2.3.tgz", - "integrity": "sha512-1Obe45jgQspqLMBVlayQbGdywFmri8DgmGRdzNu0li56cG5RReYlRCFVDZBRMMvF9JhsP5eXRyfyivtKfITHWQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "ollama": "^0.5.12", - "uuid": "^10.0.0" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "@langchain/core": ">=0.3.58 <0.4.0" - } - }, - "node_modules/@langchain/ollama/node_modules/uuid": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", - "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", - "dev": true, - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "license": "MIT", - "bin": { - "uuid": "dist/bin/uuid" - } - }, - "node_modules/@langchain/openai": { - "version": "0.5.16", - "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.5.16.tgz", - "integrity": "sha512-TqzPE3PM0bMkQi53qs8vCFkwaEp3VgwGw+s1e8Nas5ICCZZtc2XqcDPz4hf2gpo1k7/AZd6HuPlAsDy6wye9Qw==", - "dev": true, - "license": "MIT", - "dependencies": { - "js-tiktoken": "^1.0.12", - "openai": "^5.3.0", - "zod": "^3.25.32" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "@langchain/core": ">=0.3.58 <0.4.0" - } - }, - "node_modules/@langchain/textsplitters": { - "version": "0.1.0", - "resolved": "https://registry.npmjs.org/@langchain/textsplitters/-/textsplitters-0.1.0.tgz", - "integrity": "sha512-djI4uw9rlkAb5iMhtLED+xJebDdAG935AdP4eRTB02R7OB/act55Bj9wsskhZsvuyQRpO4O1wQOp85s6T6GWmw==", - "dev": true, - "license": "MIT", - "dependencies": { - "js-tiktoken": "^1.0.12" - }, - "engines": { - "node": ">=18" - }, - "peerDependencies": { - "@langchain/core": ">=0.2.21 <0.4.0" - } - }, 
"node_modules/@modelcontextprotocol/inspector": { "version": "0.16.0", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/inspector/-/inspector-0.16.0.tgz", @@ -6082,6 +6044,13 @@ "@babel/types": "^7.20.7" } }, + "node_modules/@types/diff-match-patch": { + "version": "1.0.36", + "resolved": "https://registry.npmjs.org/@types/diff-match-patch/-/diff-match-patch-1.0.36.tgz", + "integrity": "sha512-xFdR6tkm0MWvBfO8xXCSsinYxHcqkQUlcHeSpMC2ukzOb6lwQAfDmW+Qt0AvlGd8HpsS28qKsB+oPeJn9I39jg==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/estree": { "version": "1.0.7", "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz", @@ -6192,13 +6161,6 @@ "undici-types": "~7.8.0" } }, - "node_modules/@types/retry": { - "version": "0.12.0", - "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.0.tgz", - "integrity": "sha512-wWKOClTTiizcZhXnPY4wikVAwmdYHp8q6DmC+EJUzAMsycb7HB32Kh9RN4+0gExjmPmZSAQjgURXIGATPegAvA==", - "dev": true, - "license": "MIT" - }, "node_modules/@types/simple-oauth2": { "version": "5.0.7", "resolved": "https://registry.npmjs.org/@types/simple-oauth2/-/simple-oauth2-5.0.7.tgz", @@ -6228,13 +6190,6 @@ "license": "MIT", "optional": true }, - "node_modules/@types/uuid": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-10.0.0.tgz", - "integrity": "sha512-7gqG38EyHgyP1S+7+xomFtL+ZNHcKv6DwNaCZmJmo1vgMugyF3TCnXVg4t1uk89mLNwnLtnY3TpOpCOyp1/xHQ==", - "dev": true, - "license": "MIT" - }, "node_modules/@types/webidl-conversions": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/@types/webidl-conversions/-/webidl-conversions-7.0.3.tgz", @@ -6850,6 +6805,33 @@ "node": ">= 14" } }, + "node_modules/ai": { + "version": "4.3.16", + "resolved": "https://registry.npmjs.org/ai/-/ai-4.3.16.tgz", + "integrity": "sha512-KUDwlThJ5tr2Vw0A1ZkbDKNME3wzWhuVfAOwIvFUzl1TPVDFAXDFTXio3p+jaKneB+dKNCvFFlolYmmgHttG1g==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + 
"@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8", + "@ai-sdk/react": "1.2.12", + "@ai-sdk/ui-utils": "1.2.11", + "@opentelemetry/api": "1.9.0", + "jsondiffpatch": "0.6.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "react": "^18 || ^19 || ^19.0.0-rc", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "react": { + "optional": true + } + } + }, "node_modules/ajv": { "version": "6.12.6", "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", @@ -7957,16 +7939,6 @@ "node": ">=12" } }, - "node_modules/console-table-printer": { - "version": "2.14.6", - "resolved": "https://registry.npmjs.org/console-table-printer/-/console-table-printer-2.14.6.tgz", - "integrity": "sha512-MCBl5HNVaFuuHW6FGbL/4fB7N/ormCy+tQ+sxTrF6QtSbSNETvPuOVbkJBhzDgYhvjWGrTma4eYJa37ZuoQsPw==", - "dev": true, - "license": "MIT", - "dependencies": { - "simple-wcswidth": "^1.0.1" - } - }, "node_modules/content-disposition": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.0.0.tgz", @@ -8135,16 +8107,6 @@ } } }, - "node_modules/decamelize": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", - "integrity": "sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/decko": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/decko/-/decko-1.2.0.tgz", @@ -8586,6 +8548,16 @@ "node": ">= 0.8" } }, + "node_modules/dequal": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", + "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/destroy": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz", 
@@ -8633,6 +8605,13 @@ "node": ">=0.3.1" } }, + "node_modules/diff-match-patch": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/diff-match-patch/-/diff-match-patch-1.0.5.tgz", + "integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==", + "dev": true, + "license": "Apache-2.0" + }, "node_modules/diff-sequences": { "version": "29.6.3", "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-29.6.3.tgz", @@ -11940,16 +11919,6 @@ "node": ">=0.10.0" } }, - "node_modules/js-tiktoken": { - "version": "1.0.20", - "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.20.tgz", - "integrity": "sha512-Xlaqhhs8VfCd6Sh7a1cFkZHQbYTLCwVJJWiHVxBYzLPxW0XsoxBy1hitmjkdIjD3Aon5BXLHFwU5O8WUx6HH+A==", - "dev": true, - "license": "MIT", - "dependencies": { - "base64-js": "^1.5.1" - } - }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -12023,6 +11992,13 @@ "foreach": "^2.0.4" } }, + "node_modules/json-schema": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz", + "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==", + "dev": true, + "license": "(AFL-2.1 OR BSD-3-Clause)" + }, "node_modules/json-schema-traverse": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", @@ -12050,6 +12026,37 @@ "node": ">=6" } }, + "node_modules/jsondiffpatch": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/jsondiffpatch/-/jsondiffpatch-0.6.0.tgz", + "integrity": "sha512-3QItJOXp2AP1uv7waBkao5nCvhEv+QmJAd38Ybq7wNI74Q+BBmnLn4EDKz6yI9xGAIQoUF87qHt+kc1IVxB4zQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/diff-match-patch": "^1.0.36", + "chalk": "^5.3.0", + "diff-match-patch": "^1.0.5" + }, + "bin": { + "jsondiffpatch": 
"bin/jsondiffpatch.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + } + }, + "node_modules/jsondiffpatch/node_modules/chalk": { + "version": "5.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.4.1.tgz", + "integrity": "sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^12.17.0 || ^14.13 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, "node_modules/jsonpath-plus": { "version": "10.3.0", "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz", @@ -12873,16 +12880,6 @@ "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "license": "MIT" }, - "node_modules/mustache": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/mustache/-/mustache-4.2.0.tgz", - "integrity": "sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==", - "dev": true, - "license": "MIT", - "bin": { - "mustache": "bin/mustache" - } - }, "node_modules/nan": { "version": "2.22.2", "resolved": "https://registry.npmjs.org/nan/-/nan-2.22.2.tgz", @@ -13241,14 +13238,27 @@ "node": "^10.13.0 || >=12.0.0" } }, - "node_modules/ollama": { - "version": "0.5.16", - "resolved": "https://registry.npmjs.org/ollama/-/ollama-0.5.16.tgz", - "integrity": "sha512-OEbxxOIUZtdZgOaTPAULo051F5y+Z1vosxEYOoABPnQKeW7i4O8tJNlxCB+xioyoorVqgjkdj+TA1f1Hy2ug/w==", + "node_modules/ollama-ai-provider": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/ollama-ai-provider/-/ollama-ai-provider-1.2.0.tgz", + "integrity": "sha512-jTNFruwe3O/ruJeppI/quoOUxG7NA6blG3ZyQj3lei4+NnJo7bi3eIRWqlVpRlu/mbzbFXeJSBuYQWF6pzGKww==", "dev": true, - "license": "MIT", + "license": "Apache-2.0", "dependencies": { - "whatwg-fetch": "^3.6.20" + "@ai-sdk/provider": "^1.0.0", + "@ai-sdk/provider-utils": "^2.0.0", + "partial-json": 
"0.1.7" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } } }, "node_modules/on-finished": { @@ -13305,28 +13315,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/openai": { - "version": "5.8.2", - "resolved": "https://registry.npmjs.org/openai/-/openai-5.8.2.tgz", - "integrity": "sha512-8C+nzoHYgyYOXhHGN6r0fcb4SznuEn1R7YZMvlqDbnCuE0FM2mm3T1HiYW6WIcMS/F1Of2up/cSPjLPaWt0X9Q==", - "dev": true, - "license": "Apache-2.0", - "bin": { - "openai": "bin/cli" - }, - "peerDependencies": { - "ws": "^8.18.0", - "zod": "^3.23.8" - }, - "peerDependenciesMeta": { - "ws": { - "optional": true - }, - "zod": { - "optional": true - } - } - }, "node_modules/openapi-fetch": { "version": "0.14.0", "resolved": "https://registry.npmjs.org/openapi-fetch/-/openapi-fetch-0.14.0.tgz", @@ -13528,16 +13516,6 @@ "dev": true, "license": "MIT" }, - "node_modules/p-finally": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", - "integrity": "sha512-LICb2p9CB7FS+0eR1oqWnHhp0FljGLZCWBE9aix0Uye9W8LTQPwMTYVGWQWIw9RdQiDg4+epXQODwIYJtSJaow==", - "dev": true, - "license": "MIT", - "engines": { - "node": ">=4" - } - }, "node_modules/p-limit": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", @@ -13570,57 +13548,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/p-queue": { - "version": "6.6.2", - "resolved": "https://registry.npmjs.org/p-queue/-/p-queue-6.6.2.tgz", - "integrity": "sha512-RwFpb72c/BhQLEXIZ5K2e+AhgNVmIejGlTgiB9MzZ0e93GRvqZ7uSi0dvRF7/XIXDeNkra2fNHBxTyPDGySpjQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "eventemitter3": "^4.0.4", - "p-timeout": "^3.2.0" - }, - "engines": { - "node": ">=8" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" - } - }, - "node_modules/p-queue/node_modules/eventemitter3": { - "version": 
"4.0.7", - "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", - "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==", - "dev": true, - "license": "MIT" - }, - "node_modules/p-retry": { - "version": "4.6.2", - "resolved": "https://registry.npmjs.org/p-retry/-/p-retry-4.6.2.tgz", - "integrity": "sha512-312Id396EbJdvRONlngUx0NydfrIQ5lsYu0znKVUzVvArzEIt08V1qhtyESbGVd1FGX7UKtiFp5uwKZdM8wIuQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/retry": "0.12.0", - "retry": "^0.13.1" - }, - "engines": { - "node": ">=8" - } - }, - "node_modules/p-timeout": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/p-timeout/-/p-timeout-3.2.0.tgz", - "integrity": "sha512-rhIwUycgwwKcP9yTOOFK/AKsAopjjCakVqLHePO3CC6Mir1Z99xT+R63jZxAT5lFZLa2inS5h+ZS2GvR99/FBg==", - "dev": true, - "license": "MIT", - "dependencies": { - "p-finally": "^1.0.0" - }, - "engines": { - "node": ">=8" - } - }, "node_modules/p-try": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.2.0.tgz", @@ -13711,6 +13638,13 @@ "node": ">= 0.8" } }, + "node_modules/partial-json": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/partial-json/-/partial-json-0.1.7.tgz", + "integrity": "sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==", + "dev": true, + "license": "MIT" + }, "node_modules/path-browserify": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-1.0.1.tgz", @@ -14725,6 +14659,13 @@ "loose-envify": "^1.1.0" } }, + "node_modules/secure-json-parse": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz", + "integrity": "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==", + "dev": true, + "license": "BSD-3-Clause" + }, "node_modules/seek-bzip": { "version": 
"1.0.6", "resolved": "https://registry.npmjs.org/seek-bzip/-/seek-bzip-1.0.6.tgz", @@ -15165,13 +15106,6 @@ "joi": "^17.6.4" } }, - "node_modules/simple-wcswidth": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/simple-wcswidth/-/simple-wcswidth-1.1.2.tgz", - "integrity": "sha512-j7piyCjAeTDSjzTSQ7DokZtMNwNlEAyxqSZeCS+CXH7fJ4jx3FuJ/mTW3mE+6JLs4VJBbcll0Kjn+KXI5t21Iw==", - "dev": true, - "license": "MIT" - }, "node_modules/simple-websocket": { "version": "9.1.0", "resolved": "https://registry.npmjs.org/simple-websocket/-/simple-websocket-9.1.0.tgz", @@ -15681,6 +15615,20 @@ "node": ">= 6" } }, + "node_modules/swr": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/swr/-/swr-2.3.3.tgz", + "integrity": "sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A==", + "dev": true, + "license": "MIT", + "dependencies": { + "dequal": "^2.0.3", + "use-sync-external-store": "^1.4.0" + }, + "peerDependencies": { + "react": "^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, "node_modules/synckit": { "version": "0.11.8", "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.8.tgz", @@ -15901,6 +15849,19 @@ "node": "*" } }, + "node_modules/throttleit": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/throttleit/-/throttleit-2.1.0.tgz", + "integrity": "sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/through": { "version": "2.3.8", "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", @@ -16565,13 +16526,6 @@ "node": ">=12" } }, - "node_modules/whatwg-fetch": { - "version": "3.6.20", - "resolved": "https://registry.npmjs.org/whatwg-fetch/-/whatwg-fetch-3.6.20.tgz", - "integrity": 
"sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg==", - "dev": true, - "license": "MIT" - }, "node_modules/whatwg-url": { "version": "14.2.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts new file mode 100644 index 00000000..905cfff9 --- /dev/null +++ b/tests/accuracy/sdk/agent.ts @@ -0,0 +1,38 @@ +import { generateText, Tool, Schema, LanguageModelV1 } from "ai"; +import { Model } from "./models.js"; + +const systemPrompt = [ + 'The keywords "MUST", "MUST NOT", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119', + "You are an expert AI assistant with access to a set of tools for MongoDB database operations.", + "You MUST use the most relevant tool to answer the user's request", + "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", + "If a task requires multiple steps, you MUST call the necessary tools in sequence", + 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', + "You SHOULD assume that you are already connected to a MongoDB connection", +].join("\n"); + +export interface Agent { + prompt(prompt: string, model: M, tools: T): Promise; +} + +export function getVercelToolCallingAgent(): Agent< + Model, + Record>>, + { text: string; messages: unknown[] } +> { + return { + async prompt(prompt: string, model: Model, tools: Record>>) { + const result = await generateText({ + model: model.getModel(), + system: systemPrompt, + prompt, + tools, + maxSteps: 100, + }); + return { + text: result.text, + messages: result.response.messages, + }; + }, + }; +} diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 0ec4bb64..97496f6e 100644 --- 
a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,14 +1,13 @@ -import { AgentExecutor } from "langchain/agents"; import { Tool } from "@modelcontextprotocol/sdk/types.js"; -import { discoverMongoDBTools, TestTools, ToolResultGenerators } from "./test-tools.js"; +import { discoverMongoDBTools, TestTools, MockedTools } from "./test-tools.js"; import { TestableModels } from "./models.js"; -import { getToolCallingAgent } from "./tool-calling-agent.js"; import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js"; +import { Agent, getVercelToolCallingAgent } from "./agent.js"; interface AccuracyTestConfig { prompt: string; expectedToolCalls: ExpectedToolCall[]; - mockedTools: ToolResultGenerators; + mockedTools: MockedTools; } export function describeAccuracyTests( @@ -22,7 +21,7 @@ export function describeAccuracyTests( eachModel(`$modelName - ${suiteName}`, function (model) { let mcpTools: Tool[]; let testTools: TestTools; - let agent: AgentExecutor; + let agent: Agent; beforeAll(async () => { mcpTools = await discoverMongoDBTools(); @@ -30,13 +29,12 @@ export function describeAccuracyTests( beforeEach(() => { testTools = new TestTools(mcpTools); - const transformToolResult = model.transformToolResult.bind(model); - agent = getToolCallingAgent(model, testTools.langChainTools(transformToolResult)); + agent = getVercelToolCallingAgent(); }); eachTest("$prompt", async function (testConfig) { testTools.mockTools(testConfig.mockedTools); - const conversation = await agent.invoke({ input: testConfig.prompt }); + const conversation = await agent.prompt(testConfig.prompt, model, testTools.vercelAiTools()); console.log("conversation", conversation); const toolCalls = testTools.getToolCalls(); console.log("?????? 
toolCalls", toolCalls); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index d370633f..832aad30 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -1,58 +1,42 @@ -import { BaseChatModel } from "@langchain/core/language_models/chat_models"; -import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; -import { ChatOllama } from "@langchain/ollama"; -import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; +import { LanguageModelV1 } from "ai"; +import { createGoogleGenerativeAI } from "@himanshusinghs/google"; +import { ollama } from "ollama-ai-provider"; -type ToolResultForOllama = string; -export type AcceptableToolResponse = CallToolResult | ToolResultForOllama; - -export interface Model { +export interface Model

{ isAvailable(): boolean; - getLangChainModel(): M; - transformToolResult(callToolResult: CallToolResult): T; + getModel(): P; } -export class GeminiModel implements Model { +export class GeminiModel implements Model { constructor(readonly modelName: string) {} isAvailable(): boolean { return !!process.env.MDB_GEMINI_API_KEY; } - getLangChainModel(): ChatGoogleGenerativeAI { - return new ChatGoogleGenerativeAI({ - model: this.modelName, + getModel() { + return createGoogleGenerativeAI({ apiKey: process.env.MDB_GEMINI_API_KEY, - }); - } - - transformToolResult(callToolResult: CallToolResult) { - return callToolResult; + })(this.modelName); } } -export class OllamaModel implements Model { +export class OllamaModel implements Model { constructor(readonly modelName: string) {} isAvailable(): boolean { - return !!process.env.MDB_GEMINI_API_KEY; - } - - getLangChainModel(): ChatOllama { - return new ChatOllama({ - model: this.modelName, - }); + return true; } - transformToolResult(callToolResult: CallToolResult): ToolResultForOllama { - return JSON.stringify(callToolResult); + getModel() { + return ollama(this.modelName); } } const ALL_TESTABLE_MODELS = [ - // new GeminiModel("gemini-1.5-flash"), + new GeminiModel("gemini-1.5-flash"), // new GeminiModel("gemini-2.0-flash"), - new OllamaModel("qwen3:latest"), + // new OllamaModel("qwen3:latest"), ]; export type TestableModels = ReturnType; diff --git a/tests/accuracy/sdk/test-tools.ts b/tests/accuracy/sdk/test-tools.ts index 82719454..cb728a36 100644 --- a/tests/accuracy/sdk/test-tools.ts +++ b/tests/accuracy/sdk/test-tools.ts @@ -1,6 +1,6 @@ -import { jest } from "@jest/globals"; +import { JSONSchema7 } from "json-schema"; import { v4 as uuid } from "uuid"; -import { DynamicTool, tool as langChainTool } from "@langchain/core/tools"; +import { Tool as VercelTool, Schema, tool as createVercelTool, jsonSchema } from "ai"; import { Client } from "@modelcontextprotocol/sdk/client/index.js"; import { McpServer } from 
"@modelcontextprotocol/sdk/server/mcp.js"; import { CallToolResult, Tool } from "@modelcontextprotocol/sdk/types.js"; @@ -10,15 +10,22 @@ import { defaultTestConfig } from "../../integration/helpers.js"; import { Session } from "../../../src/session.js"; import { Telemetry } from "../../../src/telemetry/telemetry.js"; import { Server } from "../../../src/server.js"; -import { AcceptableToolResponse } from "./models.js"; import { ToolCall } from "./accuracy-scorers.js"; type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult; -type MockedToolResultGeneratorFn = jest.MockedFunction; -type MockedTools = Record; -export type ToolResultGenerators = Record; -export type LangChainTool = DynamicTool; -export type ToolResultTransformer = (toolResult: CallToolResult) => T; +export type MockedTools = Record; + +function getDefaultToolResultGeneratorFn(): ToolResultGeneratorFn { + return () => ({ + content: [ + { + type: "text", + text: `Mock implementation for tool not present`, + }, + ], + isError: true, + }); +} export class TestTools { private mockedTools: MockedTools = {}; @@ -26,15 +33,7 @@ export class TestTools { constructor(private readonly mcpTools: Tool[]) { for (const mcpTool of mcpTools) { - this.mockedTools[mcpTool.name] = jest.fn().mockReturnValue({ - content: [ - { - type: "text", - text: `Mock implementation for tool - ${mcpTool.name} not present`, - }, - ], - isError: true, - }); + this.mockedTools[mcpTool.name] = getDefaultToolResultGeneratorFn(); } } @@ -42,9 +41,9 @@ export class TestTools { return this.recordedToolCalls; } - mockTools(toolResultGenerators: ToolResultGenerators) { - for (const toolName in toolResultGenerators) { - const toolResultGeneratorFn = toolResultGenerators[toolName]; + mockTools(mockedTools: MockedTools) { + for (const toolName in mockedTools) { + const toolResultGeneratorFn = mockedTools[toolName]; if (!this.mockedTools[toolName]) { throw new Error(`Attempted to mock unrecognized tool - ${toolName}`); } @@ -53,49 
+52,40 @@ export class TestTools { // Are you happy TS? continue; } - this.mockedTools[toolName] = jest.fn(toolResultGeneratorFn); + this.mockedTools[toolName] = toolResultGeneratorFn; } } - langChainTools( - transformToolResult: ToolResultTransformer - ): LangChainTool[] { - return this.mcpTools.map((mcpTool) => { - return langChainTool((...args) => { - console.log("????? args", args); - const [parameters, { runName, runId }] = args; - const toolCallId = typeof runId !== "undefined" ? `${runId}` : uuid(); - return this.langChainToolResultGenerator(`${runName}`, parameters, toolCallId, transformToolResult); - }, mcpTool); - }); - } - - private langChainToolResultGenerator( - tool: string, - parameters: unknown, - toolCallId: string, - transformToolResult: ToolResultTransformer - ): T { - this.recordedToolCalls.push({ - toolCallId: toolCallId, - toolName: tool, - parameters, - }); - const mockedToolResultGenerator = this.mockedTools[tool]; - if (!mockedToolResultGenerator) { - // log as well - return transformToolResult({ - content: [ - { - type: "text", - text: `Could not resolve tool generator for ${tool}`, - }, - ], - isError: true, + vercelAiTools(): Record>> { + const vercelTools: Record>> = {}; + for (const tool of this.mcpTools) { + vercelTools[tool.name] = createVercelTool({ + description: tool.description, + parameters: jsonSchema(tool.inputSchema as JSONSchema7), + // eslint-disable-next-line @typescript-eslint/require-await + execute: async (args: unknown) => { + this.recordedToolCalls.push({ + toolCallId: uuid(), + toolName: tool.name, + parameters: args, + }); + const toolResultGeneratorFn = this.mockedTools[tool.name]; + if (!toolResultGeneratorFn) { + return { + content: [ + { + type: "text", + text: `Could not resolve tool generator for ${tool.name}`, + }, + ], + }; + } + + return toolResultGeneratorFn(args); + }, }); } - - return transformToolResult(mockedToolResultGenerator(parameters)); + return vercelTools; } } diff --git 
a/tests/accuracy/sdk/tool-calling-agent.ts b/tests/accuracy/sdk/tool-calling-agent.ts deleted file mode 100644 index b9adedf5..00000000 --- a/tests/accuracy/sdk/tool-calling-agent.ts +++ /dev/null @@ -1,36 +0,0 @@ -import { ChatPromptTemplate } from "@langchain/core/prompts"; -import { createToolCallingAgent, AgentExecutor } from "langchain/agents"; - -import { LangChainTool } from "./test-tools.js"; -import { AcceptableToolResponse, Model } from "./models.js"; -import { BaseChatModel } from "@langchain/core/language_models/chat_models"; - -const prompt = ChatPromptTemplate.fromMessages([ - [ - "system", - [ - 'The keywords "MUST", "MUST NOT", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119 (https://www.ietf.org/rfc/rfc2119.txt)', - "You are an expect AI assistant with access to a set of tools for MongoDB database operations.", - "You MUST use the most relevant tool to answer the user's request", - "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", - "If a task requires multiple steps, you MUST call the necessary tools in sequence", - 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', - ].join("\n"), - ], - ["human", "{input}"], - ["placeholder", "{agent_scratchpad}"], -]); - -export function getToolCallingAgent( - model: Model, - tools: LangChainTool[] -) { - const llm = model.getLangChainModel(); - const agent = createToolCallingAgent({ - llm, - tools, - prompt, - }); - const agentExecutor = new AgentExecutor({ agent, tools }); - return agentExecutor; -} From dffeabf7b3de268d92d8751955af0a03a02d7540 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 30 Jun 2025 13:01:57 +0200 Subject: [PATCH 03/47] chore: integrate capturing accuracy snapshots --- package.json | 3 +- tests/accuracy/sdk/accuracy-snapshot.ts | 54 +++++++++++++++++++ 
tests/accuracy/sdk/describe-accuracy-tests.ts | 40 ++++++++++++-- tests/accuracy/sdk/models.ts | 7 +-- tests/accuracy/sdk/test-tools.ts | 3 -- 5 files changed, 95 insertions(+), 12 deletions(-) create mode 100644 tests/accuracy/sdk/accuracy-snapshot.ts diff --git a/package.json b/package.json index 53d6d2c6..448310a1 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,8 @@ "check:types": "tsc --noEmit --project tsconfig.json", "reformat": "prettier --write .", "generate": "./scripts/generate.sh", - "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage" + "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathIgnorePatterns=/tests/accuracy/", + "test:accuracy": "node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern tests/accuracy" }, "license": "Apache-2.0", "devDependencies": { diff --git a/tests/accuracy/sdk/accuracy-snapshot.ts b/tests/accuracy/sdk/accuracy-snapshot.ts new file mode 100644 index 00000000..1f7867a9 --- /dev/null +++ b/tests/accuracy/sdk/accuracy-snapshot.ts @@ -0,0 +1,54 @@ +import fs from "fs/promises"; +import path from "path"; +import { z } from "zod"; + +export const SNAPSHOT_FILE_PATH = path.resolve(process.cwd(), "accuracy-snapshot.json"); + +export const AccuracySnapshotEntrySchema = z.object({ + datetime: z.string(), + commit: z.string(), + model: z.string(), + suite: z.string(), + test: z.string(), + toolCallingAccuracy: z.number(), + parameterAccuracy: z.number(), +}); + +export type AccuracySnapshotEntry = z.infer; + +export async function readSnapshot(): Promise { + try { + const raw = await fs.readFile(SNAPSHOT_FILE_PATH, "utf8"); + return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw)); + } catch (e: unknown) { + if ((e as { code: string }).code === "ENOENT") { + return []; + } + throw e; + } +} + +function waitFor(ms: number) { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +export async function 
appendAccuracySnapshot(entry: AccuracySnapshotEntry): Promise { + AccuracySnapshotEntrySchema.parse(entry); + + for (let attempt = 0; attempt < 5; attempt++) { + try { + const snapshot = await readSnapshot(); + snapshot.unshift(entry); + const tmp = `${SNAPSHOT_FILE_PATH}~${Date.now()}`; + await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2)); + await fs.rename(tmp, SNAPSHOT_FILE_PATH); + return; + } catch (e) { + if (attempt < 4) { + await waitFor(100 + Math.random() * 200); + } else { + throw e; + } + } + } +} diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 97496f6e..a3ad0668 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -3,6 +3,7 @@ import { discoverMongoDBTools, TestTools, MockedTools } from "./test-tools.js"; import { TestableModels } from "./models.js"; import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js"; import { Agent, getVercelToolCallingAgent } from "./agent.js"; +import { appendAccuracySnapshot } from "./accuracy-snapshot.js"; interface AccuracyTestConfig { prompt: string; @@ -15,6 +16,20 @@ export function describeAccuracyTests( models: TestableModels, accuracyTestConfigs: AccuracyTestConfig[] ) { + const accuracyDatetime = process.env.ACCURACY_DATETIME; + if (!accuracyDatetime) { + throw new Error("ACCURACY_DATETIME environment variable is not set"); + } + const accuracyCommit = process.env.ACCURACY_COMMIT; + if (!accuracyCommit) { + throw new Error("ACCURACY_COMMIT environment variable is not set"); + } + + if (!models.length) { + console.warn(`No models available to test ${suiteName}`); + return; + } + const eachModel = describe.each(models); const eachTest = it.each(accuracyTestConfigs); @@ -35,15 +50,30 @@ export function describeAccuracyTests( eachTest("$prompt", async function (testConfig) { testTools.mockTools(testConfig.mockedTools); const 
conversation = await agent.prompt(testConfig.prompt, model, testTools.vercelAiTools()); - console.log("conversation", conversation); const toolCalls = testTools.getToolCalls(); - console.log("?????? toolCalls", toolCalls); - console.log("???? expected", testConfig.expectedToolCalls); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); + await appendAccuracySnapshot({ + datetime: accuracyDatetime, + commit: accuracyCommit, + model: model.modelName, + suite: suiteName, + test: testConfig.prompt, + toolCallingAccuracy, + parameterAccuracy: parameterMatchingAccuracy, + }); - expect(toolCallingAccuracy).not.toEqual(0); - expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); + try { + expect(toolCallingAccuracy).not.toEqual(0); + expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); + } catch (error) { + console.warn(`Accuracy test failed for ${model.modelName} - ${suiteName} - ${testConfig.prompt}`); + console.warn(`Conversation`, JSON.stringify(conversation, null, 2)); + console.warn(`Tool calls`, JSON.stringify(toolCalls, null, 2)); + console.warn(`Tool calling accuracy`, toolCallingAccuracy); + console.warn(`Parameter matching accuracy`, parameterMatchingAccuracy); + throw error; + } }); }); } diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index 832aad30..27b8e972 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -3,6 +3,7 @@ import { createGoogleGenerativeAI } from "@himanshusinghs/google"; import { ollama } from "ollama-ai-provider"; export interface Model

{ + readonly modelName: string; isAvailable(): boolean; getModel(): P; } @@ -25,7 +26,7 @@ export class OllamaModel implements Model { constructor(readonly modelName: string) {} isAvailable(): boolean { - return true; + return false; } getModel() { @@ -35,8 +36,8 @@ export class OllamaModel implements Model { const ALL_TESTABLE_MODELS = [ new GeminiModel("gemini-1.5-flash"), - // new GeminiModel("gemini-2.0-flash"), - // new OllamaModel("qwen3:latest"), + new GeminiModel("gemini-2.0-flash"), + new OllamaModel("qwen3:latest"), ]; export type TestableModels = ReturnType; diff --git a/tests/accuracy/sdk/test-tools.ts b/tests/accuracy/sdk/test-tools.ts index cb728a36..595a9069 100644 --- a/tests/accuracy/sdk/test-tools.ts +++ b/tests/accuracy/sdk/test-tools.ts @@ -132,9 +132,6 @@ export async function discoverMongoDBTools(): Promise { await mcpClient.connect(clientTransport); return (await mcpClient.listTools()).tools; - } catch (error: unknown) { - console.error("Unexpected error occured", error); - return []; } finally { await mcpClient?.close(); await mcpServer?.session?.close(); From 2e89f7a85469da8e0b6fa144bed5ffb6caacb3ee Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 30 Jun 2025 13:03:50 +0200 Subject: [PATCH 04/47] chore: correct env names --- tests/accuracy/sdk/describe-accuracy-tests.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index a3ad0668..5d500ffa 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -16,13 +16,13 @@ export function describeAccuracyTests( models: TestableModels, accuracyTestConfigs: AccuracyTestConfig[] ) { - const accuracyDatetime = process.env.ACCURACY_DATETIME; + const accuracyDatetime = process.env.MDB_ACCURACY_DATETIME; if (!accuracyDatetime) { - throw new Error("ACCURACY_DATETIME environment variable is not set"); + throw new 
Error("MDB_ACCURACY_DATETIME environment variable is not set"); } - const accuracyCommit = process.env.ACCURACY_COMMIT; + const accuracyCommit = process.env.MDB_ACCURACY_COMMIT; if (!accuracyCommit) { - throw new Error("ACCURACY_COMMIT environment variable is not set"); + throw new Error("MDB_ACCURACY_COMMIT environment variable is not set"); } if (!models.length) { From 2345c273ed0fdc2d90ed70ffd92d74e68f880ca8 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 30 Jun 2025 13:29:32 +0200 Subject: [PATCH 05/47] chore: more consolidated prompt tests --- tests/accuracy/list-databases.test.ts | 28 +++++++++---- tests/accuracy/sdk/agent.ts | 13 +++--- tests/accuracy/sdk/describe-accuracy-tests.ts | 41 ++++++++++--------- tests/accuracy/sdk/models.ts | 4 +- 4 files changed, 48 insertions(+), 38 deletions(-) diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index ae3f6c7d..d26fbc4e 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -1,16 +1,22 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -describeAccuracyTests("list-databases", getAvailableModels(), [ - { - prompt: "Assume that you're already connected. 
How many collections are there in sample_mflix database", +function describeListDatabasesAccuracyTests(prompt: string): AccuracyTestConfig { + return { + systemPrompt: "Assume that you're already connected.", + prompt: prompt, mockedTools: { - "list-collections": function listCollections() { + "list-databases": function listDatabases() { return { content: [ { type: "text", - text: "Name: coll1", + text: "Name: db1", + }, + { + type: "text", + text: "Name: db2", }, ], }; @@ -18,9 +24,15 @@ describeAccuracyTests("list-databases", getAvailableModels(), [ }, expectedToolCalls: [ { - toolName: "list-collections", - parameters: { database: "sample_mflix" }, + toolName: "list-databases", + parameters: {}, }, ], - }, + }; +} + +describeAccuracyTests("list-databases", getAvailableModels(), [ + describeListDatabasesAccuracyTests("How many databases do I have?"), + describeListDatabasesAccuracyTests("List all the databases in my cluster."), + describeListDatabasesAccuracyTests("Is there a sample_mflix database in my cluster?"), ]); diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts index 905cfff9..eb680358 100644 --- a/tests/accuracy/sdk/agent.ts +++ b/tests/accuracy/sdk/agent.ts @@ -8,23 +8,20 @@ const systemPrompt = [ "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", "If a task requires multiple steps, you MUST call the necessary tools in sequence", 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', - "You SHOULD assume that you are already connected to a MongoDB connection", -].join("\n"); +]; export interface Agent { prompt(prompt: string, model: M, tools: T): Promise; } -export function getVercelToolCallingAgent(): Agent< - Model, - Record>>, - { text: string; messages: unknown[] } -> { +export function getVercelToolCallingAgent( + requestedSystemPrompt?: string +): Agent, Record>>, { text: string; messages: unknown[] }> { return { async 
prompt(prompt: string, model: Model, tools: Record>>) { const result = await generateText({ model: model.getModel(), - system: systemPrompt, + system: [...systemPrompt, requestedSystemPrompt].join("\n"), prompt, tools, maxSteps: 100, diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 5d500ffa..bf99d509 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -5,7 +5,8 @@ import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyS import { Agent, getVercelToolCallingAgent } from "./agent.js"; import { appendAccuracySnapshot } from "./accuracy-snapshot.js"; -interface AccuracyTestConfig { +export interface AccuracyTestConfig { + systemPrompt?: string; prompt: string; expectedToolCalls: ExpectedToolCall[]; mockedTools: MockedTools; @@ -17,13 +18,7 @@ export function describeAccuracyTests( accuracyTestConfigs: AccuracyTestConfig[] ) { const accuracyDatetime = process.env.MDB_ACCURACY_DATETIME; - if (!accuracyDatetime) { - throw new Error("MDB_ACCURACY_DATETIME environment variable is not set"); - } const accuracyCommit = process.env.MDB_ACCURACY_COMMIT; - if (!accuracyCommit) { - throw new Error("MDB_ACCURACY_COMMIT environment variable is not set"); - } if (!models.length) { console.warn(`No models available to test ${suiteName}`); @@ -53,25 +48,31 @@ export function describeAccuracyTests( const toolCalls = testTools.getToolCalls(); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); - await appendAccuracySnapshot({ - datetime: accuracyDatetime, - commit: accuracyCommit, - model: model.modelName, - suite: suiteName, - test: testConfig.prompt, - toolCallingAccuracy, - parameterAccuracy: parameterMatchingAccuracy, - }); + if (accuracyDatetime && accuracyCommit) { + await 
appendAccuracySnapshot({ + datetime: accuracyDatetime, + commit: accuracyCommit, + model: model.modelName, + suite: suiteName, + test: testConfig.prompt, + toolCallingAccuracy, + parameterAccuracy: parameterMatchingAccuracy, + }); + } else { + console.info( + `Skipping accuracy snapshot update for ${model.modelName} - ${suiteName} - ${testConfig.prompt}` + ); + } try { expect(toolCallingAccuracy).not.toEqual(0); expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); } catch (error) { console.warn(`Accuracy test failed for ${model.modelName} - ${suiteName} - ${testConfig.prompt}`); - console.warn(`Conversation`, JSON.stringify(conversation, null, 2)); - console.warn(`Tool calls`, JSON.stringify(toolCalls, null, 2)); - console.warn(`Tool calling accuracy`, toolCallingAccuracy); - console.warn(`Parameter matching accuracy`, parameterMatchingAccuracy); + console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); + console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); + console.debug(`Tool calling accuracy`, toolCallingAccuracy); + console.debug(`Parameter matching accuracy`, parameterMatchingAccuracy); throw error; } }); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index 27b8e972..f6f8a879 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -36,8 +36,8 @@ export class OllamaModel implements Model { const ALL_TESTABLE_MODELS = [ new GeminiModel("gemini-1.5-flash"), - new GeminiModel("gemini-2.0-flash"), - new OllamaModel("qwen3:latest"), + // new GeminiModel("gemini-2.0-flash"), + // new OllamaModel("qwen3:latest"), ]; export type TestableModels = ReturnType; From 0cdfe2e15b8c279ac2fe9146802eafd58c08a45d Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 30 Jun 2025 16:37:10 +0200 Subject: [PATCH 06/47] chore: add a few more tests and some more models --- tests/accuracy/list-collections.test.ts | 38 +++++++++++++++++++++++++ tests/accuracy/sdk/models.ts | 6 ++-- 2 files changed, 
41 insertions(+), 3 deletions(-) create mode 100644 tests/accuracy/list-collections.test.ts diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts new file mode 100644 index 00000000..b871a96e --- /dev/null +++ b/tests/accuracy/list-collections.test.ts @@ -0,0 +1,38 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function describeListCollectionsAccuracyTests(prompt: string): AccuracyTestConfig { + return { + systemPrompt: "Assume that you're already connected.", + prompt: prompt, + mockedTools: { + "list-collections": function listCollections() { + return { + content: [ + { + type: "text", + text: "Name: coll1", + }, + { + type: "text", + text: "Name: coll1", + }, + ], + }; + }, + }, + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { database: "db1" }, + }, + ], + }; +} + +describeAccuracyTests("list-collections", getAvailableModels(), [ + describeListCollectionsAccuracyTests("How many collections do I have in database db1?"), + describeListCollectionsAccuracyTests("List all the collections in my MongoDB database db1."), + describeListCollectionsAccuracyTests("Is there a coll1 collection in my MongoDB database db1?"), +]); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index f6f8a879..e3f5ab1f 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -26,7 +26,7 @@ export class OllamaModel implements Model { constructor(readonly modelName: string) {} isAvailable(): boolean { - return false; + return true; } getModel() { @@ -36,8 +36,8 @@ export class OllamaModel implements Model { const ALL_TESTABLE_MODELS = [ new GeminiModel("gemini-1.5-flash"), - // new GeminiModel("gemini-2.0-flash"), - // new OllamaModel("qwen3:latest"), + new GeminiModel("gemini-2.0-flash"), + new OllamaModel("qwen3:1.7b"), ]; 
export type TestableModels = ReturnType; From 6e69fd61b868991f9208478e37e19e6f1233bad2 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 13:01:41 +0200 Subject: [PATCH 07/47] chore: add AzureOpenAI model in the model list --- package-lock.json | 35 +++++++++++++++++++ tests/accuracy/list-collections.test.ts | 2 +- tests/accuracy/list-databases.test.ts | 2 +- tests/accuracy/sdk/describe-accuracy-tests.ts | 6 +++- tests/accuracy/sdk/models.ts | 21 +++++++++-- 5 files changed, 61 insertions(+), 5 deletions(-) diff --git a/package-lock.json b/package-lock.json index 10b8977c..cae14a31 100644 --- a/package-lock.json +++ b/package-lock.json @@ -63,6 +63,41 @@ "@himanshusinghs/ai-sdk-google": { "extraneous": true }, + "node_modules/@ai-sdk/azure": { + "version": "1.3.23", + "resolved": "https://registry.npmjs.org/@ai-sdk/azure/-/azure-1.3.23.tgz", + "integrity": "sha512-vpsaPtU24RBVk/IMM5UylR/N4RtAuL2NZLWc7LJ3tvMTHu6pI46a7w+1qIwR3F6yO9ehWR8qvfLaBefJNFxaVw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/openai": "1.3.22", + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/openai": { + "version": "1.3.22", + "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-1.3.22.tgz", + "integrity": "sha512-QwA+2EkG0QyjVR+7h6FE7iOu2ivNqAVMm9UJZkVxxTk5OIq5fFJDTEI/zICEMuHImTTXR2JjsL6EirJ28Jc4cw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, "node_modules/@ai-sdk/provider": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.3.tgz", diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts index b871a96e..2bc11dea 100644 --- 
a/tests/accuracy/list-collections.test.ts +++ b/tests/accuracy/list-collections.test.ts @@ -4,7 +4,7 @@ import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function describeListCollectionsAccuracyTests(prompt: string): AccuracyTestConfig { return { - systemPrompt: "Assume that you're already connected.", + injectConnectedAssumption: true, prompt: prompt, mockedTools: { "list-collections": function listCollections() { diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index d26fbc4e..cf06303e 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -4,7 +4,7 @@ import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function describeListDatabasesAccuracyTests(prompt: string): AccuracyTestConfig { return { - systemPrompt: "Assume that you're already connected.", + injectConnectedAssumption: true, prompt: prompt, mockedTools: { "list-databases": function listDatabases() { diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index bf99d509..28fa3bd7 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -7,6 +7,7 @@ import { appendAccuracySnapshot } from "./accuracy-snapshot.js"; export interface AccuracyTestConfig { systemPrompt?: string; + injectConnectedAssumption?: boolean; prompt: string; expectedToolCalls: ExpectedToolCall[]; mockedTools: MockedTools; @@ -44,7 +45,10 @@ export function describeAccuracyTests( eachTest("$prompt", async function (testConfig) { testTools.mockTools(testConfig.mockedTools); - const conversation = await agent.prompt(testConfig.prompt, model, testTools.vercelAiTools()); + const promptForModel = testConfig.injectConnectedAssumption + ? 
[testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") + : testConfig.prompt; + const conversation = await agent.prompt(promptForModel, model, testTools.vercelAiTools()); const toolCalls = testTools.getToolCalls(); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index e3f5ab1f..c653c79c 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -1,5 +1,6 @@ import { LanguageModelV1 } from "ai"; import { createGoogleGenerativeAI } from "@himanshusinghs/google"; +import { createAzure } from "@ai-sdk/azure"; import { ollama } from "ollama-ai-provider"; export interface Model

{ @@ -8,6 +9,22 @@ export interface Model

{ getModel(): P; } +export class OpenAIModel implements Model { + constructor(readonly modelName: string) {} + + isAvailable(): boolean { + return !!process.env.MDB_AZURE_OPEN_AI_API_KEY && !!process.env.MDB_AZURE_OPEN_AI_API_URL; + } + + getModel() { + return createAzure({ + baseURL: process.env.MDB_AZURE_OPEN_AI_API_URL, + apiKey: process.env.MDB_AZURE_OPEN_AI_API_KEY, + apiVersion: "2024-12-01-preview", + })(this.modelName); + } +} + export class GeminiModel implements Model { constructor(readonly modelName: string) {} @@ -35,9 +52,9 @@ export class OllamaModel implements Model { } const ALL_TESTABLE_MODELS = [ - new GeminiModel("gemini-1.5-flash"), new GeminiModel("gemini-2.0-flash"), - new OllamaModel("qwen3:1.7b"), + new OpenAIModel("gpt-4o"), + // new OllamaModel("qwen3:1.7b"), ]; export type TestableModels = ReturnType; From ea099c26cdb7cc32782a24dac90118cc61d96c74 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 14:04:21 +0200 Subject: [PATCH 08/47] chore: use ListDatabasesTool response creator for tests --- src/tools/mongodb/metadata/listDatabases.ts | 23 +++++++++++---- tests/accuracy/list-databases.test.ts | 31 ++++++++++----------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/src/tools/mongodb/metadata/listDatabases.ts b/src/tools/mongodb/metadata/listDatabases.ts index fe324f07..1d1ae4d2 100644 --- a/src/tools/mongodb/metadata/listDatabases.ts +++ b/src/tools/mongodb/metadata/listDatabases.ts @@ -3,6 +3,17 @@ import { MongoDBToolBase } from "../mongodbTool.js"; import * as bson from "bson"; import { OperationType } from "../../tool.js"; +export function listDatabasesResponse(databases: { name: string; sizeOnDisk: string }[]): CallToolResult { + return { + content: databases.map((db) => { + return { + text: `Name: ${db.name}, Size: ${db.sizeOnDisk} bytes`, + type: "text", + }; + }), + }; +} + export class ListDatabasesTool extends MongoDBToolBase { protected name = "list-databases"; protected description = "List 
all databases for a MongoDB connection"; @@ -13,13 +24,13 @@ export class ListDatabasesTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const dbs = (await provider.listDatabases("")).databases as { name: string; sizeOnDisk: bson.Long }[]; - return { - content: dbs.map((db) => { + return listDatabasesResponse( + dbs.map((db) => { return { - text: `Name: ${db.name}, Size: ${db.sizeOnDisk.toString()} bytes`, - type: "text", + name: db.name, + sizeOnDisk: db.sizeOnDisk.toString(), }; - }), - }; + }) + ); } } diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index cf06303e..0a89db1d 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -1,25 +1,24 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { listDatabasesResponse } from "../../src/tools/mongodb/metadata/listDatabases.js"; -function describeListDatabasesAccuracyTests(prompt: string): AccuracyTestConfig { +function callsListDatabases(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, mockedTools: { "list-databases": function listDatabases() { - return { - content: [ - { - type: "text", - text: "Name: db1", - }, - { - type: "text", - text: "Name: db2", - }, - ], - }; + return listDatabasesResponse([ + { + name: "db1", + sizeOnDisk: "1024", + }, + { + name: "db2", + sizeOnDisk: "2048", + }, + ]); }, }, expectedToolCalls: [ @@ -32,7 +31,7 @@ function describeListDatabasesAccuracyTests(prompt: string): AccuracyTestConfig } describeAccuracyTests("list-databases", getAvailableModels(), [ - describeListDatabasesAccuracyTests("How many databases do I have?"), - describeListDatabasesAccuracyTests("List all the databases in my cluster."), - describeListDatabasesAccuracyTests("Is there a sample_mflix database in 
my cluster?"), + callsListDatabases("How many databases do I have?"), + callsListDatabases("List all the databases in my cluster."), + callsListDatabases("Is there a sample_mflix database in my cluster?"), ]); From 8ae3d3d39eddda9dc7b779c9f65dbdffd3793d03 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 14:05:51 +0200 Subject: [PATCH 09/47] chore: use ListCollectionsTool response creators in tests --- src/tools/mongodb/metadata/listCollections.ts | 45 ++++++++------ tests/accuracy/list-collections.test.ts | 62 ++++++++++++++----- 2 files changed, 72 insertions(+), 35 deletions(-) diff --git a/src/tools/mongodb/metadata/listCollections.ts b/src/tools/mongodb/metadata/listCollections.ts index 193d0465..f676964f 100644 --- a/src/tools/mongodb/metadata/listCollections.ts +++ b/src/tools/mongodb/metadata/listCollections.ts @@ -2,6 +2,28 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; +export function listCollectionsResponse(database: string, collections: string[]): CallToolResult { + if (collections.length === 0) { + return { + content: [ + { + type: "text", + text: `No collections found for database "${database}". 
To create a collection, use the "create-collection" tool.`, + }, + ], + }; + } + + return { + content: collections.map((collection) => { + return { + text: `Name: "${collection}"`, + type: "text", + }; + }), + }; +} + export class ListCollectionsTool extends MongoDBToolBase { protected name = "list-collections"; protected description = "List all collections for a given database"; @@ -15,24 +37,9 @@ export class ListCollectionsTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const collections = await provider.listCollections(database); - if (collections.length === 0) { - return { - content: [ - { - type: "text", - text: `No collections found for database "${database}". To create a collection, use the "create-collection" tool.`, - }, - ], - }; - } - - return { - content: collections.map((collection) => { - return { - text: `Name: "${collection.name}"`, - type: "text", - }; - }), - }; + return listCollectionsResponse( + database, + collections.map((collection) => `${collection.name}`) + ); } } diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts index 2bc11dea..ac086859 100644 --- a/tests/accuracy/list-collections.test.ts +++ b/tests/accuracy/list-collections.test.ts @@ -1,25 +1,16 @@ import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { listCollectionsResponse } from "../../src/tools/mongodb/metadata/listCollections.js"; +import { listDatabasesResponse } from "../../src/tools/mongodb/metadata/listDatabases.js"; -function describeListCollectionsAccuracyTests(prompt: string): AccuracyTestConfig { +function callsListCollections(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, mockedTools: { "list-collections": function listCollections() { - return { - content: [ - { - type: "text", - text: "Name: 
coll1", - }, - { - type: "text", - text: "Name: coll1", - }, - ], - }; + return listCollectionsResponse("db1", ["coll1", "coll2"]); }, }, expectedToolCalls: [ @@ -31,8 +22,47 @@ function describeListCollectionsAccuracyTests(prompt: string): AccuracyTestConfi }; } +function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "list-collections": function listCollections() { + return listCollectionsResponse("db1", ["coll1", "coll2"]); + }, + "list-databases": function listDatabases() { + return listDatabasesResponse([ + { + name: "db1", + sizeOnDisk: "1024", + }, + { + name: "db2", + sizeOnDisk: "2048", + }, + ]); + }, + }, + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "list-collections", + parameters: { database: "db1" }, + }, + { + toolName: "list-collections", + parameters: { database: "db2" }, + }, + ], + }; +} + describeAccuracyTests("list-collections", getAvailableModels(), [ - describeListCollectionsAccuracyTests("How many collections do I have in database db1?"), - describeListCollectionsAccuracyTests("List all the collections in my MongoDB database db1."), - describeListCollectionsAccuracyTests("Is there a coll1 collection in my MongoDB database db1?"), + callsListCollections("How many collections do I have in database db1?"), + callsListCollections("List all the collections in my MongoDB database db1."), + callsListCollections("Is there a coll1 collection in my MongoDB database db1?"), + callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), ]); From 1f5b24661524749df21aed32bb8f3abc1d171224 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 14:30:50 +0200 Subject: [PATCH 10/47] chore: tests for collection-indexes tool --- src/tools/mongodb/read/collectionIndexes.ts | 74 ++++++++++++++------- tests/accuracy/collection-indexes.test.ts | 42 
++++++++++++ 2 files changed, 93 insertions(+), 23 deletions(-) create mode 100644 tests/accuracy/collection-indexes.test.ts diff --git a/src/tools/mongodb/read/collectionIndexes.ts b/src/tools/mongodb/read/collectionIndexes.ts index cc0a141b..71ade728 100644 --- a/src/tools/mongodb/read/collectionIndexes.ts +++ b/src/tools/mongodb/read/collectionIndexes.ts @@ -2,6 +2,44 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; +export function collectionIndexesResponse({ + database, + collection, + indexes = [], + namespaceNotFound, +}: { + database: string; + collection: string; + indexes?: { name: string; key: string }[]; + namespaceNotFound?: boolean; +}): CallToolResult { + if (namespaceNotFound) { + return { + content: [ + { + text: `The indexes for "${database}.${collection}" cannot be determined because the collection does not exist.`, + type: "text", + }, + ], + }; + } + + return { + content: [ + { + text: `Found ${indexes.length} indexes in the collection "${collection}":`, + type: "text", + }, + ...(indexes.map((indexDefinition) => { + return { + text: `Name "${indexDefinition.name}", definition: ${JSON.stringify(indexDefinition.key)}`, + type: "text", + }; + }) as { text: string; type: "text" }[]), + ], + }; +} + export class CollectionIndexesTool extends MongoDBToolBase { protected name = "collection-indexes"; protected description = "Describe the indexes for a collection"; @@ -11,21 +49,14 @@ export class CollectionIndexesTool extends MongoDBToolBase { protected async execute({ database, collection }: ToolArgs): Promise { const provider = await this.ensureConnected(); const indexes = await provider.getIndexes(database, collection); - - return { - content: [ - { - text: `Found ${indexes.length} indexes in the collection "${collection}":`, - type: "text", - }, - ...(indexes.map((indexDefinition) => { - return { - 
text: `Name "${indexDefinition.name}", definition: ${JSON.stringify(indexDefinition.key)}`, - type: "text", - }; - }) as { text: string; type: "text" }[]), - ], - }; + return collectionIndexesResponse({ + database, + collection, + indexes: indexes.map((index) => ({ + name: `${index.name}`, + key: JSON.stringify(index.key), + })), + }); } protected handleError( @@ -33,14 +64,11 @@ export class CollectionIndexesTool extends MongoDBToolBase { args: ToolArgs ): Promise | CallToolResult { if (error instanceof Error && "codeName" in error && error.codeName === "NamespaceNotFound") { - return { - content: [ - { - text: `The indexes for "${args.database}.${args.collection}" cannot be determined because the collection does not exist.`, - type: "text", - }, - ], - }; + return collectionIndexesResponse({ + database: args.database, + collection: args.collection, + namespaceNotFound: true, + }); } return super.handleError(error, args); diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts new file mode 100644 index 00000000..78b223e3 --- /dev/null +++ b/tests/accuracy/collection-indexes.test.ts @@ -0,0 +1,42 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { collectionIndexesResponse } from "../../src/tools/mongodb/read/collectionIndexes.js"; + +function callsCollectionIndexes(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-indexes": function collectionIndexes() { + return collectionIndexesResponse({ + database: "db1", + collection: "coll1", + indexes: [ + { + name: "year", + key: JSON.stringify({ _id: 1 }), + }, + ], + }); + }, + }, + expectedToolCalls: [ + { + toolName: "collection-indexes", + parameters: { + database: "db1", + collection: "coll1", + }, + }, + ], + }; +} + 
+describeAccuracyTests("collection-indexes", getAvailableModels(), [ + callsCollectionIndexes("How many indexes do I have in 'db1.coll1' namespace?"), + callsCollectionIndexes("List all the indexes in coll1 collection in db1 database"), + callsCollectionIndexes( + `Will this query: ${JSON.stringify({ year: 1994 })} on the namespace 'db1.coll1' be a collection scan?` + ), +]); From 330b9e57b3f96dda283bd3889f463a1b125edcb9 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 16:16:20 +0200 Subject: [PATCH 11/47] modify prompt for list-collections prompt and log tools provided --- tests/accuracy/collection-indexes.test.ts | 2 +- tests/accuracy/sdk/describe-accuracy-tests.ts | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts index 78b223e3..683f386a 100644 --- a/tests/accuracy/collection-indexes.test.ts +++ b/tests/accuracy/collection-indexes.test.ts @@ -37,6 +37,6 @@ describeAccuracyTests("collection-indexes", getAvailableModels(), [ callsCollectionIndexes("How many indexes do I have in 'db1.coll1' namespace?"), callsCollectionIndexes("List all the indexes in coll1 collection in db1 database"), callsCollectionIndexes( - `Will this query: ${JSON.stringify({ year: 1994 })} on the namespace 'db1.coll1' be a collection scan?` + `Is the following query: ${JSON.stringify({ year: 1994 })} on the namespace 'db1.coll1' indexed?` ), ]); diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 28fa3bd7..972f10b3 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -45,10 +45,11 @@ export function describeAccuracyTests( eachTest("$prompt", async function (testConfig) { testTools.mockTools(testConfig.mockedTools); + const toolsForModel = testTools.vercelAiTools(); const promptForModel = testConfig.injectConnectedAssumption ? 
[testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") : testConfig.prompt; - const conversation = await agent.prompt(promptForModel, model, testTools.vercelAiTools()); + const conversation = await agent.prompt(promptForModel, model, toolsForModel); const toolCalls = testTools.getToolCalls(); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); @@ -73,6 +74,7 @@ export function describeAccuracyTests( expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); } catch (error) { console.warn(`Accuracy test failed for ${model.modelName} - ${suiteName} - ${testConfig.prompt}`); + console.debug(`Provided tools`, JSON.stringify(toolsForModel, null, 2)); console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); console.debug(`Tool calling accuracy`, toolCallingAccuracy); From 127fee0f0d8c0e8f6507382baea1afa91e30eb24 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 16:32:06 +0200 Subject: [PATCH 12/47] chore: have mock generators return Promise of ToolResult as well --- tests/accuracy/sdk/test-tools.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/accuracy/sdk/test-tools.ts b/tests/accuracy/sdk/test-tools.ts index 595a9069..15bb0420 100644 --- a/tests/accuracy/sdk/test-tools.ts +++ b/tests/accuracy/sdk/test-tools.ts @@ -12,7 +12,7 @@ import { Telemetry } from "../../../src/telemetry/telemetry.js"; import { Server } from "../../../src/server.js"; import { ToolCall } from "./accuracy-scorers.js"; -type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult; +type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; export type MockedTools = Record; function getDefaultToolResultGeneratorFn(): ToolResultGeneratorFn { @@ 
-81,7 +81,7 @@ export class TestTools { }; } - return toolResultGeneratorFn(args); + return await toolResultGeneratorFn(args); }, }); } From d8c79b8682e9abf67917f98443b9de823cbc20ac Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 16:32:24 +0200 Subject: [PATCH 13/47] chore: tests for collection-schema tool --- .../mongodb/metadata/collectionSchema.ts | 59 +++++++++++-------- tests/accuracy/collection-schema.test.ts | 47 +++++++++++++++ 2 files changed, 80 insertions(+), 26 deletions(-) create mode 100644 tests/accuracy/collection-schema.test.ts diff --git a/src/tools/mongodb/metadata/collectionSchema.ts b/src/tools/mongodb/metadata/collectionSchema.ts index f0145323..71ed5256 100644 --- a/src/tools/mongodb/metadata/collectionSchema.ts +++ b/src/tools/mongodb/metadata/collectionSchema.ts @@ -1,7 +1,38 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; -import { getSimplifiedSchema } from "mongodb-schema"; +import { getSimplifiedSchema, SimplifiedSchema } from "mongodb-schema"; + +export function collectionSchemaResponse( + database: string, + collection: string, + schema: SimplifiedSchema +): CallToolResult { + const fieldsCount = Object.entries(schema).length; + if (fieldsCount === 0) { + return { + content: [ + { + text: `Could not deduce the schema for "${database}.${collection}". 
This may be because it doesn't exist or is empty.`, + type: "text", + }, + ], + }; + } + + return { + content: [ + { + text: `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`, + type: "text", + }, + { + text: JSON.stringify(schema), + type: "text", + }, + ], + }; +} export class CollectionSchemaTool extends MongoDBToolBase { protected name = "collection-schema"; @@ -14,30 +45,6 @@ export class CollectionSchemaTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const documents = await provider.find(database, collection, {}, { limit: 5 }).toArray(); const schema = await getSimplifiedSchema(documents); - - const fieldsCount = Object.entries(schema).length; - if (fieldsCount === 0) { - return { - content: [ - { - text: `Could not deduce the schema for "${database}.${collection}". This may be because it doesn't exist or is empty.`, - type: "text", - }, - ], - }; - } - - return { - content: [ - { - text: `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`, - type: "text", - }, - { - text: JSON.stringify(schema), - type: "text", - }, - ], - }; + return collectionSchemaResponse(database, collection, schema); } } diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts new file mode 100644 index 00000000..e72c65de --- /dev/null +++ b/tests/accuracy/collection-schema.test.ts @@ -0,0 +1,47 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { collectionSchemaResponse } from "../../src/tools/mongodb/metadata/collectionSchema.js"; +import { getSimplifiedSchema } from "mongodb-schema"; + +function callsCollectionSchema(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-schema": async function collectionSchema() { + return 
collectionSchemaResponse( + "db1", + "coll1", + await getSimplifiedSchema([ + { + name: "Sample name1", + dob: "28.11.2001", + location: "NY", + }, + { + name: "Sample name1", + dob: "28.11.2001", + location: "NY", + title: "Dr.", + }, + ]) + ); + }, + }, + expectedToolCalls: [ + { + toolName: "collection-schema", + parameters: { + database: "db1", + collection: "coll1", + }, + }, + ], + }; +} + +describeAccuracyTests("collection-schema", getAvailableModels(), [ + callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), + callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), +]); From f430780a5aa1bd257e744dbceebc4f921e58fe9d Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 17:17:53 +0200 Subject: [PATCH 14/47] chore: do not fail tests on dropped accuracy --- tests/accuracy/sdk/describe-accuracy-tests.ts | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 972f10b3..c602bf96 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -53,6 +53,13 @@ export function describeAccuracyTests( const toolCalls = testTools.getToolCalls(); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); + console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); + console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); + console.debug( + "Tool calling accuracy: %s, Parameter Accuracy: %s", + toolCallingAccuracy, + parameterMatchingAccuracy + ); if (accuracyDatetime && accuracyCommit) { await appendAccuracySnapshot({ datetime: accuracyDatetime, @@ -68,19 +75,6 @@ export function describeAccuracyTests( `Skipping accuracy snapshot update for 
${model.modelName} - ${suiteName} - ${testConfig.prompt}` ); } - - try { - expect(toolCallingAccuracy).not.toEqual(0); - expect(parameterMatchingAccuracy).toBeGreaterThanOrEqual(0.5); - } catch (error) { - console.warn(`Accuracy test failed for ${model.modelName} - ${suiteName} - ${testConfig.prompt}`); - console.debug(`Provided tools`, JSON.stringify(toolsForModel, null, 2)); - console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); - console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); - console.debug(`Tool calling accuracy`, toolCallingAccuracy); - console.debug(`Parameter matching accuracy`, parameterMatchingAccuracy); - throw error; - } }); }); } From a09c725db157a976226d6d7b23d6f2f47dc04495 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 1 Jul 2025 17:17:58 +0200 Subject: [PATCH 15/47] chore: added tests for find tool --- src/tools/mongodb/read/find.ts | 36 ++++---- tests/accuracy/find.test.ts | 157 +++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 17 deletions(-) create mode 100644 tests/accuracy/find.test.ts diff --git a/src/tools/mongodb/read/find.ts b/src/tools/mongodb/read/find.ts index 97c90e08..5e3fa4f4 100644 --- a/src/tools/mongodb/read/find.ts +++ b/src/tools/mongodb/read/find.ts @@ -13,7 +13,7 @@ export const FindArgs = { .describe("The query filter, matching the syntax of the query argument of db.collection.find()"), projection: z .record(z.string(), z.unknown()) - .optional() + // .optional() .describe("The projection, matching the syntax of the projection argument of db.collection.find()"), limit: z.number().optional().default(10).describe("The maximum number of documents to return"), sort: z @@ -22,6 +22,23 @@ export const FindArgs = { .describe("A document, describing the sort order, matching the syntax of the sort argument of cursor.sort()"), }; +export function findResponse(collection: string, documents: unknown[]): CallToolResult { + return { + content: [ + { + text: `Found 
${documents.length} documents in the collection "${collection}":`, + type: "text", + }, + ...documents.map<{ type: "text"; text: string }>((doc) => { + return { + text: EJSON.stringify(doc), + type: "text", + }; + }), + ], + }; +} + export class FindTool extends MongoDBToolBase { protected name = "find"; protected description = "Run a find query against a MongoDB collection"; @@ -50,21 +67,6 @@ export class FindTool extends MongoDBToolBase { const documents = await provider.find(database, collection, filter, { projection, limit, sort }).toArray(); - const content: Array<{ text: string; type: "text" }> = [ - { - text: `Found ${documents.length} documents in the collection "${collection}":`, - type: "text", - }, - ...documents.map((doc) => { - return { - text: EJSON.stringify(doc), - type: "text", - } as { text: string; type: "text" }; - }), - ]; - - return { - content, - }; + return findResponse(collection, documents); } } diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts new file mode 100644 index 00000000..0144e22b --- /dev/null +++ b/tests/accuracy/find.test.ts @@ -0,0 +1,157 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { findResponse } from "../../src/tools/mongodb/read/find.js"; +import { MockedTools } from "./sdk/test-tools.js"; +import { collectionSchemaResponse } from "../../src/tools/mongodb/metadata/collectionSchema.js"; +import { getSimplifiedSchema } from "mongodb-schema"; + +const documents = [ + { + title: "book1", + author: "author1", + date_of_publish: "01.01.1990", + }, + { + title: "book2", + author: "author1", + date_of_publish: "01.01.1992", + }, + { + title: "book3", + author: "author2", + date_of_publish: "01.01.1990", + }, +]; + +function callsFindNoFilter(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + 
mockedTools: { + "collection-schema": async () => + collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), + find: () => findResponse("coll1", documents), + }, + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "db1", + collection: "coll1", + }, + }, + ], + }; +} + +function callsFindWithFilter(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-schema": async () => + collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), + find: () => + findResponse( + "coll1", + documents.filter((doc) => doc.author === "author1") + ), + }, + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "db1", + collection: "coll1", + filter: { author: "author1" }, + }, + }, + ], + }; +} + +function callsFindWithProjection(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-schema": async () => + collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), + find: () => findResponse("coll1", documents), + }, + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "db1", + collection: "coll1", + projection: { title: 1 }, + }, + }, + ], + }; +} + +function callsFindWithProjectionAndFilters(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-schema": async () => + collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), + find: () => + findResponse( + "coll1", + documents.filter((doc) => doc.date_of_publish === "01.01.1992") + ), + }, + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "db1", + collection: "coll1", + filter: { date_of_publish: "01.01.1992" }, + projection: { title: 1 }, + }, + }, + ], + }; +} + +function callsFindWithSortAndLimit(prompt: string): 
AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "collection-schema": async () => + collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), + find: () => findResponse("coll1", [documents[0], documents[1]]), + }, + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "db1", + collection: "coll1", + sort: { date_of_publish: 1 }, + limit: 2, + }, + }, + ], + }; +} + +describeAccuracyTests("find", getAvailableModels(), [ + callsFindNoFilter("List all the documents in 'db1.coll1' namespace"), + callsFindNoFilter("Find all the documents from collection coll1 in database db1"), + callsFindWithFilter("Find all the books published by author name 'author1' in db1.coll1 namespace"), + callsFindWithFilter("Find all the documents in coll1 collection and db1 database where author is 'author1'"), + callsFindWithProjection("Give me all the title of the books available in 'db1.coll1' namespace"), + callsFindWithProjection("Give me all the title of the books published in available in 'db1.coll1' namespace"), + callsFindWithProjectionAndFilters( + "Find all the book titles from 'db1.coll1' namespace where date_of_publish is '01.01.1992'" + ), + callsFindWithSortAndLimit("List first two books sorted by the field date_of_publish in namespace db1.coll1"), +]); From 1aa80eb288ab1537db176c7858b96d3108b21965 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 3 Jul 2025 15:00:13 +0200 Subject: [PATCH 16/47] chore: tests for insert-many tool --- src/tools/mongodb/create/insertMany.ts | 28 +++++----- tests/accuracy/insert-many.test.ts | 72 ++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 12 deletions(-) create mode 100644 tests/accuracy/insert-many.test.ts diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index f28d79d5..c92ee4c3 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -3,6 
+3,21 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; +export function insertManyResponse(collection: string, insertedCount: number, insertedIds: unknown[]): CallToolResult { + return { + content: [ + { + text: `Inserted \`${insertedCount}\` document(s) into collection "${collection}"`, + type: "text", + }, + { + text: `Inserted IDs: ${insertedIds.join(", ")}`, + type: "text", + }, + ], + }; +} + export class InsertManyTool extends MongoDBToolBase { protected name = "insert-many"; protected description = "Insert an array of documents into a MongoDB collection"; @@ -24,17 +39,6 @@ export class InsertManyTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const result = await provider.insertMany(database, collection, documents); - return { - content: [ - { - text: `Inserted \`${result.insertedCount}\` document(s) into collection "${collection}"`, - type: "text", - }, - { - text: `Inserted IDs: ${Object.values(result.insertedIds).join(", ")}`, - type: "text", - }, - ], - }; + return insertManyResponse(collection, result.insertedCount, Object.values(result.insertedIds)); } } diff --git a/tests/accuracy/insert-many.test.ts b/tests/accuracy/insert-many.test.ts new file mode 100644 index 00000000..b720ac1c --- /dev/null +++ b/tests/accuracy/insert-many.test.ts @@ -0,0 +1,72 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { insertManyResponse } from "../../src/tools/mongodb/create/insertMany.js"; + +function callsInsertMany(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "insert-many": function listDatabases() { + return insertManyResponse("coll1", 3, ["1FOO", 
"2BAR", "3BAZ"]); + }, + }, + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "db1", + collection: "coll1", + documents: [ + { + id: 1, + name: "name1", + }, + { + id: 2, + name: "name2", + }, + { + id: 3, + name: "name3", + }, + ], + }, + }, + ], + }; +} + +function callsEmptyInsertMany(prompt: string) { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "insert-many": function listDatabases() { + return insertManyResponse("coll1", 3, ["1FOO", "2BAR", "3BAZ"]); + }, + }, + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "db1", + collection: "coll1", + documents: [{}, {}, {}], + }, + }, + ], + }; +} + +describeAccuracyTests("insert-many", getAvailableModels(), [ + callsInsertMany( + [ + "In my namespace 'db1.coll1', insert 3 documents each with the following fields:", + "- id: an incremental number starting from 1", + "- name: a string of format 'name'", + ].join("\n") + ), + callsEmptyInsertMany("Add three empty documents in collection 'coll1' inside database 'db1'"), +]); From b0c3df6808733284008953606f1c11fc43275ea2 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 3 Jul 2025 15:06:30 +0200 Subject: [PATCH 17/47] chore: tests for delete-many tool --- src/tools/mongodb/delete/deleteMany.ts | 20 ++++++---- tests/accuracy/delete-many.test.ts | 53 ++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 8 deletions(-) create mode 100644 tests/accuracy/delete-many.test.ts diff --git a/src/tools/mongodb/delete/deleteMany.ts b/src/tools/mongodb/delete/deleteMany.ts index 0257d167..4bc8eba0 100644 --- a/src/tools/mongodb/delete/deleteMany.ts +++ b/src/tools/mongodb/delete/deleteMany.ts @@ -4,6 +4,17 @@ import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; import { checkIndexUsage } from "../../../helpers/indexCheck.js"; +export function deleteManyResponse(collection: string, delectedCount: 
number): CallToolResult { + return { + content: [ + { + text: `Deleted \`${delectedCount}\` document(s) from collection "${collection}"`, + type: "text", + }, + ], + }; +} + export class DeleteManyTool extends MongoDBToolBase { protected name = "delete-many"; protected description = "Removes all documents that match the filter from a MongoDB collection"; @@ -45,13 +56,6 @@ export class DeleteManyTool extends MongoDBToolBase { const result = await provider.deleteMany(database, collection, filter); - return { - content: [ - { - text: `Deleted \`${result.deletedCount}\` document(s) from collection "${collection}"`, - type: "text", - }, - ], - }; + return deleteManyResponse(collection, result.deletedCount); } } diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts new file mode 100644 index 00000000..ddda1d50 --- /dev/null +++ b/tests/accuracy/delete-many.test.ts @@ -0,0 +1,53 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { deleteManyResponse } from "../../src/tools/mongodb/delete/deleteMany.js"; + +function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "delete-many": function listDatabases() { + return deleteManyResponse("coll1", 10); + }, + }, + expectedToolCalls: [ + { + toolName: "delete-many", + parameters: { + database: "db1", + collection: "coll1", + }, + }, + ], + }; +} + +function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: { + "delete-many": function listDatabases() { + return deleteManyResponse("coll1", 10); + }, + }, + expectedToolCalls: [ + { + toolName: "delete-many", + parameters: { + database: "db1", + collection: "coll1", + filters: { provider: "BongoDB" 
}, + }, + }, + ], + }; +} + +describeAccuracyTests("delete-many", getAvailableModels(), [ + callsDeleteManyWithEmptyFilters("Delete all the documents from 'db1.coll1' namespace"), + callsDeleteManyWithEmptyFilters("Purge the collection 'coll1' in database 'db1'"), + callsDeleteManyWithFilters("Remove all the documents from namespace 'db1.coll1' where provider is 'BongoDB'"), +]); From c5365ac7e29c0e855e9f7ca9fed3fed616d555ee Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 3 Jul 2025 16:05:57 +0200 Subject: [PATCH 18/47] chore: add oepnai provider --- package-lock.json | 17 +++++++++++++++++ package.json | 3 ++- tests/accuracy/sdk/models.ts | 18 +++++++++++++++++- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/package-lock.json b/package-lock.json index cae14a31..63ac51e6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -63,6 +63,23 @@ "@himanshusinghs/ai-sdk-google": { "extraneous": true }, + "node_modules/@ai-sdk/anthropic": { + "version": "1.2.12", + "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-1.2.12.tgz", + "integrity": "sha512-YSzjlko7JvuiyQFmI9RN1tNZdEiZxc+6xld/0tq/VkJaHpEzGAb1yiNxxvmYVcjvfu/PcvCxAAYXmTYQQ63IHQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, "node_modules/@ai-sdk/azure": { "version": "1.3.23", "resolved": "https://registry.npmjs.org/@ai-sdk/azure/-/azure-1.3.23.tgz", diff --git a/package.json b/package.json index 448310a1..686f3516 100644 --- a/package.json +++ b/package.json @@ -30,7 +30,8 @@ "reformat": "prettier --write .", "generate": "./scripts/generate.sh", "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathIgnorePatterns=/tests/accuracy/", - "test:accuracy": "node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern tests/accuracy" + 
"test:accuracy": "node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern tests/accuracy", + "test:accuracy-file": "node --experimental-vm-modules node_modules/jest/bin/jest.js" }, "license": "Apache-2.0", "devDependencies": { diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index c653c79c..1fe4fd58 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -1,6 +1,7 @@ import { LanguageModelV1 } from "ai"; import { createGoogleGenerativeAI } from "@himanshusinghs/google"; import { createAzure } from "@ai-sdk/azure"; +import { createOpenAI } from "@ai-sdk/openai"; import { ollama } from "ollama-ai-provider"; export interface Model

{ @@ -12,6 +13,20 @@ export interface Model

{ export class OpenAIModel implements Model { constructor(readonly modelName: string) {} + isAvailable(): boolean { + return !!process.env.MDB_OPEN_AI_API_KEY; + } + + getModel() { + return createOpenAI({ + apiKey: process.env.MDB_OPEN_AI_API_KEY, + })(this.modelName); + } +} + +export class AzureOpenAIModel implements Model { + constructor(readonly modelName: string) {} + isAvailable(): boolean { return !!process.env.MDB_AZURE_OPEN_AI_API_KEY && !!process.env.MDB_AZURE_OPEN_AI_API_URL; } @@ -53,7 +68,8 @@ export class OllamaModel implements Model { const ALL_TESTABLE_MODELS = [ new GeminiModel("gemini-2.0-flash"), - new OpenAIModel("gpt-4o"), + // new OpenAIModel("gpt-4o"), + // new AzureOpenAIModel("gpt-4o"), // new OllamaModel("qwen3:1.7b"), ]; From f79facac534c3d53ebbb72ebea73b79b651e04ff Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 4 Jul 2025 07:28:52 +0200 Subject: [PATCH 19/47] chore: fixes accuracy scorer for position independent matching --- tests/accuracy/sdk/accuracy-scorers.ts | 39 ++++++++++++++++---------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-scorers.ts b/tests/accuracy/sdk/accuracy-scorers.ts index bf92eead..7bd8b969 100644 --- a/tests/accuracy/sdk/accuracy-scorers.ts +++ b/tests/accuracy/sdk/accuracy-scorers.ts @@ -37,28 +37,37 @@ export function parameterMatchingAccuracyScorer( return 1; } - const toolCallScores: number[] = []; - const checkedToolCallIds = new Set(); + const usedActualIndexes = new Set(); + const scores: number[] = []; - for (const expectedToolCall of expectedToolCalls) { - const matchingActualToolCall = actualToolCalls.find( - (actualToolCall) => - actualToolCall.toolName === expectedToolCall.toolName && - !checkedToolCallIds.has(actualToolCall.toolCallId) - ); + for (const expectedCall of expectedToolCalls) { + // Find all unmatched actual tool calls with the same tool name + const candidates = actualToolCalls + .map((call, index) => ({ call, index })) + .filter(({ 
call, index }) => !usedActualIndexes.has(index) && call.toolName === expectedCall.toolName); - if (!matchingActualToolCall) { - toolCallScores.push(0); + if (candidates.length === 0) { + scores.push(0); continue; } - checkedToolCallIds.add(matchingActualToolCall.toolCallId); - const score = compareParams(expectedToolCall.parameters, matchingActualToolCall.parameters); - toolCallScores.push(score); + // Pick the candidate with the best parameter match + let bestScore = -1; + let bestIndex = -1; + for (const { call, index } of candidates) { + const score = compareParams(expectedCall.parameters, call.parameters); + if (score > bestScore) { + bestScore = score; + bestIndex = index; + } + } + + usedActualIndexes.add(bestIndex); + scores.push(bestScore); } - const totalScore = toolCallScores.reduce((sum, score) => sum + score, 0); - return totalScore / toolCallScores.length; + const totalScore = scores.reduce((sum, score) => sum + score, 0); + return totalScore / scores.length; } /** From e0470bc696ffe2f2080edf437a77d957be430e65 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 4 Jul 2025 07:34:19 +0200 Subject: [PATCH 20/47] chore: replace mock mcp client with real (mockable) mcp client When writing test cases, I realized that it is too much duplicated effort to write and maintain mocks. So instead of having only a mocked mcp client, this commit introduces a real mcp client that talks to our mcp server and is still mockable. We are now setting up real MCP client with test data in mongodb database spun up for test suites. Mocking is still an option but we likely never feel the need for that. 
--- tests/accuracy/sdk/accuracy-testing-client.ts | 76 ++ tests/accuracy/sdk/describe-accuracy-tests.ts | 98 +-- tests/accuracy/sdk/test-tools.ts | 140 ---- .../test-data-dumps/comics.books.json | 608 ++++++++++++++ .../test-data-dumps/comics.characters.json | 576 ++++++++++++++ .../test-data-dumps/mflix.movies.json | 687 ++++++++++++++++ .../accuracy/test-data-dumps/mflix.shows.json | 750 ++++++++++++++++++ .../tools/mongodb/mongodbHelpers.ts | 53 +- 8 files changed, 2799 insertions(+), 189 deletions(-) create mode 100644 tests/accuracy/sdk/accuracy-testing-client.ts delete mode 100644 tests/accuracy/sdk/test-tools.ts create mode 100644 tests/accuracy/test-data-dumps/comics.books.json create mode 100644 tests/accuracy/test-data-dumps/comics.characters.json create mode 100644 tests/accuracy/test-data-dumps/mflix.movies.json create mode 100644 tests/accuracy/test-data-dumps/mflix.shows.json diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts new file mode 100644 index 00000000..de7a0671 --- /dev/null +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -0,0 +1,76 @@ +import path from "path"; +import { v4 as uuid } from "uuid"; +import { fileURLToPath } from "url"; +import { experimental_createMCPClient as createMCPClient, tool as createVercelTool } from "ai"; +import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; +import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; + +import { ToolCall } from "./accuracy-scorers.js"; + +const __dirname = fileURLToPath(import.meta.url); +const distPath = path.join(__dirname, "..", "..", "..", "..", "dist"); +const cliScriptPath = path.join(distPath, "index.js"); + +type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; +export type MockedTools = Record; + +export class AccuracyTestingClient { + private mockedTools: MockedTools = {}; + private recordedToolCalls: ToolCall[] = []; + private 
constructor(private readonly client: Awaited>) {} + + async close() { + await this.client?.close(); + } + + async vercelTools() { + const vercelTools = (await this.client?.tools()) ?? {}; + const rewrappedVercelTools: typeof vercelTools = {}; + for (const [toolName, tool] of Object.entries(vercelTools)) { + rewrappedVercelTools[toolName] = createVercelTool({ + ...tool, + execute: async (args, options) => { + this.recordedToolCalls.push({ + toolCallId: uuid(), + toolName: toolName, + parameters: args, + }); + const toolResultGeneratorFn = this.mockedTools[toolName]; + if (toolResultGeneratorFn) { + return await toolResultGeneratorFn(args); + } + + return tool.execute(args, options); + }, + }); + } + + return rewrappedVercelTools; + } + + getToolCalls() { + return this.recordedToolCalls; + } + + mockTools(mockedTools: MockedTools) { + this.mockedTools = mockedTools; + } + + resetForTests() { + this.mockTools({}); + this.recordedToolCalls = []; + } + + static async initializeClient(mdbConnectionString: string) { + const clientTransport = new StdioClientTransport({ + command: process.execPath, + args: [cliScriptPath, "--connectionString", mdbConnectionString], + }); + + const client = await createMCPClient({ + transport: clientTransport, + }); + + return new AccuracyTestingClient(client); + } +} diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index c602bf96..dd224387 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,9 +1,8 @@ -import { Tool } from "@modelcontextprotocol/sdk/types.js"; -import { discoverMongoDBTools, TestTools, MockedTools } from "./test-tools.js"; import { TestableModels } from "./models.js"; import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js"; import { Agent, getVercelToolCallingAgent } from "./agent.js"; -import { appendAccuracySnapshot } from 
"./accuracy-snapshot.js"; +import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; +import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; export interface AccuracyTestConfig { systemPrompt?: string; @@ -13,68 +12,71 @@ export interface AccuracyTestConfig { mockedTools: MockedTools; } +export function describeSuite(suiteName: string, testConfigs: AccuracyTestConfig[]) { + return { + [suiteName]: testConfigs, + }; +} + export function describeAccuracyTests( - suiteName: string, models: TestableModels, - accuracyTestConfigs: AccuracyTestConfig[] + accuracyTestConfigs: { + [suiteName: string]: AccuracyTestConfig[]; + } ) { - const accuracyDatetime = process.env.MDB_ACCURACY_DATETIME; - const accuracyCommit = process.env.MDB_ACCURACY_COMMIT; - if (!models.length) { - console.warn(`No models available to test ${suiteName}`); - return; + throw new Error("No models available to test!"); } const eachModel = describe.each(models); - const eachTest = it.each(accuracyTestConfigs); + const eachSuite = describe.each(Object.keys(accuracyTestConfigs)); + + eachModel(`$modelName`, function (model) { + const mdbIntegration = setupMongoDBIntegrationTest(); + const populateTestData = prepareTestData(mdbIntegration); - eachModel(`$modelName - ${suiteName}`, function (model) { - let mcpTools: Tool[]; - let testTools: TestTools; + let testMCPClient: AccuracyTestingClient; let agent: Agent; beforeAll(async () => { - mcpTools = await discoverMongoDBTools(); + testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); + agent = getVercelToolCallingAgent(); }); - beforeEach(() => { - testTools = new TestTools(mcpTools); - agent = getVercelToolCallingAgent(); + beforeEach(async () => { + await populateTestData(); + testMCPClient.resetForTests(); + }); + + afterAll(async () => { + await testMCPClient.close(); }); - eachTest("$prompt", async function (testConfig) { - 
testTools.mockTools(testConfig.mockedTools); - const toolsForModel = testTools.vercelAiTools(); - const promptForModel = testConfig.injectConnectedAssumption - ? [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") - : testConfig.prompt; - const conversation = await agent.prompt(promptForModel, model, toolsForModel); - const toolCalls = testTools.getToolCalls(); - const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); - const parameterMatchingAccuracy = parameterMatchingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); - console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); - console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); - console.debug( - "Tool calling accuracy: %s, Parameter Accuracy: %s", - toolCallingAccuracy, - parameterMatchingAccuracy - ); - if (accuracyDatetime && accuracyCommit) { - await appendAccuracySnapshot({ - datetime: accuracyDatetime, - commit: accuracyCommit, - model: model.modelName, - suite: suiteName, - test: testConfig.prompt, + eachSuite("%s", function (suiteName) { + const eachTest = it.each(accuracyTestConfigs[suiteName] ?? []); + + eachTest("$prompt", async function (testConfig) { + testMCPClient.mockTools(testConfig.mockedTools); + const toolsForModel = await testMCPClient.vercelTools(); + const promptForModel = testConfig.injectConnectedAssumption + ? 
[testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") + : testConfig.prompt; + const conversation = await agent.prompt(promptForModel, model, toolsForModel); + const toolCalls = testMCPClient.getToolCalls(); + const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); + const parameterMatchingAccuracy = parameterMatchingAccuracyScorer( + testConfig.expectedToolCalls, + toolCalls + ); + console.debug(testConfig.prompt); + console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); + // console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); + console.debug( + "Tool calling accuracy: %s, Parameter Accuracy: %s", toolCallingAccuracy, - parameterAccuracy: parameterMatchingAccuracy, - }); - } else { - console.info( - `Skipping accuracy snapshot update for ${model.modelName} - ${suiteName} - ${testConfig.prompt}` + parameterMatchingAccuracy ); - } + }); }); }); } diff --git a/tests/accuracy/sdk/test-tools.ts b/tests/accuracy/sdk/test-tools.ts deleted file mode 100644 index 15bb0420..00000000 --- a/tests/accuracy/sdk/test-tools.ts +++ /dev/null @@ -1,140 +0,0 @@ -import { JSONSchema7 } from "json-schema"; -import { v4 as uuid } from "uuid"; -import { Tool as VercelTool, Schema, tool as createVercelTool, jsonSchema } from "ai"; -import { Client } from "@modelcontextprotocol/sdk/client/index.js"; -import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; -import { CallToolResult, Tool } from "@modelcontextprotocol/sdk/types.js"; - -import { InMemoryTransport } from "../../integration/inMemoryTransport.js"; -import { defaultTestConfig } from "../../integration/helpers.js"; -import { Session } from "../../../src/session.js"; -import { Telemetry } from "../../../src/telemetry/telemetry.js"; -import { Server } from "../../../src/server.js"; -import { ToolCall } from "./accuracy-scorers.js"; - -type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | 
Promise; -export type MockedTools = Record; - -function getDefaultToolResultGeneratorFn(): ToolResultGeneratorFn { - return () => ({ - content: [ - { - type: "text", - text: `Mock implementation for tool not present`, - }, - ], - isError: true, - }); -} - -export class TestTools { - private mockedTools: MockedTools = {}; - private recordedToolCalls: ToolCall[] = []; - - constructor(private readonly mcpTools: Tool[]) { - for (const mcpTool of mcpTools) { - this.mockedTools[mcpTool.name] = getDefaultToolResultGeneratorFn(); - } - } - - getToolCalls() { - return this.recordedToolCalls; - } - - mockTools(mockedTools: MockedTools) { - for (const toolName in mockedTools) { - const toolResultGeneratorFn = mockedTools[toolName]; - if (!this.mockedTools[toolName]) { - throw new Error(`Attempted to mock unrecognized tool - ${toolName}`); - } - - if (!toolResultGeneratorFn) { - // Are you happy TS? - continue; - } - this.mockedTools[toolName] = toolResultGeneratorFn; - } - } - - vercelAiTools(): Record>> { - const vercelTools: Record>> = {}; - for (const tool of this.mcpTools) { - vercelTools[tool.name] = createVercelTool({ - description: tool.description, - parameters: jsonSchema(tool.inputSchema as JSONSchema7), - // eslint-disable-next-line @typescript-eslint/require-await - execute: async (args: unknown) => { - this.recordedToolCalls.push({ - toolCallId: uuid(), - toolName: tool.name, - parameters: args, - }); - const toolResultGeneratorFn = this.mockedTools[tool.name]; - if (!toolResultGeneratorFn) { - return { - content: [ - { - type: "text", - text: `Could not resolve tool generator for ${tool.name}`, - }, - ], - }; - } - - return await toolResultGeneratorFn(args); - }, - }); - } - return vercelTools; - } -} - -export async function discoverMongoDBTools(): Promise { - let mcpClient: Client | undefined; - let mcpServer: Server | undefined; - try { - const serverTransport = new InMemoryTransport(); - const clientTransport = new InMemoryTransport(); - - await 
serverTransport.start(); - await clientTransport.start(); - - void serverTransport.output.pipeTo(clientTransport.input); - void clientTransport.output.pipeTo(serverTransport.input); - - const session = new Session({ - apiBaseUrl: defaultTestConfig.apiBaseUrl, - }); - - const telemetry = Telemetry.create(session, defaultTestConfig); - - mcpClient = new Client( - { - name: "tool-discovery-client", - version: "0.0.0", - }, - { - capabilities: {}, - } - ); - - mcpServer = new Server({ - session, - userConfig: defaultTestConfig, - telemetry, - mcpServer: new McpServer({ - name: "test-server", - version: "5.2.3", - }), - }); - - await mcpServer.connect(serverTransport); - await mcpClient.connect(clientTransport); - - return (await mcpClient.listTools()).tools; - } finally { - await mcpClient?.close(); - await mcpServer?.session?.close(); - await mcpServer?.close(); - } -} diff --git a/tests/accuracy/test-data-dumps/comics.books.json b/tests/accuracy/test-data-dumps/comics.books.json new file mode 100644 index 00000000..3bcb9ecc --- /dev/null +++ b/tests/accuracy/test-data-dumps/comics.books.json @@ -0,0 +1,608 @@ +[ + { + "_id": "fa53ead3-36f3-414c-9b3a-53aa9cf5038a", + "title": "Configurable dedicated project", + "publisher": "Dark Horse Comics", + "release_date": "2007-03-02T00:00:00", + "issues": 118, + "main_characters": [ + "Stephen Shaw" + ], + "genre": [ + "Sci-Fi" + ] + }, + { + "_id": "b2e993fb-2688-4ab0-9512-f8ada5faa948", + "title": "Focused intangible service-desk", + "publisher": "Image Comics", + "release_date": "1998-12-07T00:00:00", + "issues": 137, + "main_characters": [ + "Margaret Hogan" + ], + "genre": [ + "Adventure", + "Horror" + ] + }, + { + "_id": "f674a05a-12c8-4344-875c-6cd1fcba8f9d", + "title": "Expanded secondary system engine", + "publisher": "DC Comics", + "release_date": "2012-12-01T00:00:00", + "issues": 227, + "main_characters": [ + "Joseph Cook", + "Tammy Bishop" + ], + "genre": [ + "Superhero" + ] + }, + { + "_id": 
"bb72b493-2a61-41d7-9406-dfaf6e51a425", + "title": "Customizable zero-defect Graphic Interface", + "publisher": "DC Comics", + "release_date": "2011-02-24T00:00:00", + "issues": 270, + "main_characters": [ + "Sandra Moss" + ], + "genre": [ + "Fantasy" + ] + }, + { + "_id": "ea85131f-dfc8-4997-b3b0-996138185d73", + "title": "Reduced eco-centric help-desk", + "publisher": "Dark Horse Comics", + "release_date": "2021-03-12T00:00:00", + "issues": 202, + "main_characters": [ + "Margaret Hogan", + "Angelica Stein", + "Tammy Murphy", + "Larry Hensley" + ], + "genre": [ + "Adventure", + "Horror" + ] + }, + { + "_id": "fdd56270-eb31-4456-8bf4-df81371eb290", + "title": "Triple-buffered dedicated help-desk", + "publisher": "Image Comics", + "release_date": "1964-09-20T00:00:00", + "issues": 36, + "main_characters": [ + "Richard Cooper", + "James Sanchez", + "Micheal Brown", + "Jeremy Rice" + ], + "genre": [ + "Fantasy", + "Action" + ] + }, + { + "_id": "6de66ba4-3975-4055-824c-cda5caf517d2", + "title": "Operative logistical secured line", + "publisher": "Marvel Comics", + "release_date": "2007-11-19T00:00:00", + "issues": 55, + "main_characters": [ + "Joseph Bowman", + "Robert Logan", + "Ashley Watkins" + ], + "genre": [ + "Sci-Fi", + "Horror" + ] + }, + { + "_id": "e3cafdbf-e97a-47c9-a848-bdd82e12f8f7", + "title": "Multi-lateral multi-state framework", + "publisher": "IDW Publishing", + "release_date": "2011-09-14T00:00:00", + "issues": 250, + "main_characters": [ + "Ashley Watkins", + "Virginia Watts", + "Lindsay Anderson", + "Scott Garcia" + ], + "genre": [ + "Action", + "Horror" + ] + }, + { + "_id": "547190cd-5c9e-44c5-b8f9-afeefd039001", + "title": "Re-engineered encompassing standardization", + "publisher": "Marvel Comics", + "release_date": "1987-04-16T00:00:00", + "issues": 235, + "main_characters": [ + "Julie Goodwin" + ], + "genre": [ + "Sci-Fi" + ] + }, + { + "_id": "ba3d82f7-8edc-408c-8212-c0d6634624ee", + "title": "Fully-configurable local success", + 
"publisher": "Dark Horse Comics", + "release_date": "1979-09-13T00:00:00", + "issues": 239, + "main_characters": [ + "Chad Pham", + "Lindsay Anderson", + "Carlos Burton" + ], + "genre": [ + "Adventure" + ] + }, + { + "_id": "a6bc8677-22ab-415a-bfe2-731a9f887cb9", + "title": "Realigned zero-defect capability", + "publisher": "Marvel Comics", + "release_date": "2023-10-01T00:00:00", + "issues": 163, + "main_characters": [ + "Kevin Humphrey", + "Maria Wright", + "Virginia Watts" + ], + "genre": [ + "Fantasy", + "Action" + ] + }, + { + "_id": "fb986790-df22-4db4-8168-c76e9e9471f8", + "title": "Sharable bottom-line frame", + "publisher": "IDW Publishing", + "release_date": "2016-09-28T00:00:00", + "issues": 14, + "main_characters": [ + "Brian Vincent" + ], + "genre": [ + "Sci-Fi", + "Fantasy" + ] + }, + { + "_id": "700aa115-dc5a-4be6-b275-bfb943c95ee0", + "title": "Centralized next generation middleware", + "publisher": "Image Comics", + "release_date": "1970-04-16T00:00:00", + "issues": 5, + "main_characters": [ + "Joseph Cook" + ], + "genre": [ + "Fantasy" + ] + }, + { + "_id": "7959187e-9693-43a1-ae2d-c168431fceb2", + "title": "Re-engineered heuristic array", + "publisher": "IDW Publishing", + "release_date": "2019-02-15T00:00:00", + "issues": 121, + "main_characters": [ + "Angelica Stein", + "Benjamin Morris", + "Jeremy Rice" + ], + "genre": [ + "Fantasy", + "Action" + ] + }, + { + "_id": "d6018445-5149-42e7-9d87-eb1b181ce20c", + "title": "Programmable transitional collaboration", + "publisher": "DC Comics", + "release_date": "1999-08-10T00:00:00", + "issues": 235, + "main_characters": [ + "Joseph Cook", + "Cynthia Brown", + "Carlos Burton", + "Micheal Brown" + ], + "genre": [ + "Adventure" + ] + }, + { + "_id": "055507ff-7a48-4df8-9ba9-7b6c10e11836", + "title": "Object-based dynamic knowledgebase", + "publisher": "Image Comics", + "release_date": "1993-02-24T00:00:00", + "issues": 189, + "main_characters": [ + "Cristian Oneal", + "Brian Vincent", + "Holly Green", + 
"James Sanchez" + ], + "genre": [ + "Sci-Fi", + "Fantasy" + ] + }, + { + "_id": "1add2da3-68e6-48a3-9703-b593c9e0bf2e", + "title": "Enhanced asynchronous matrices", + "publisher": "DC Comics", + "release_date": "2001-03-01T00:00:00", + "issues": 176, + "main_characters": [ + "Justin Martinez", + "Tammy Murphy" + ], + "genre": [ + "Action", + "Fantasy" + ] + }, + { + "_id": "c0fe2869-eb7d-4f09-a773-028387a54969", + "title": "Synergized maximized artificial intelligence", + "publisher": "DC Comics", + "release_date": "1976-09-05T00:00:00", + "issues": 68, + "main_characters": [ + "Christopher Elliott", + "Maria Wright" + ], + "genre": [ + "Superhero", + "Adventure" + ] + }, + { + "_id": "c2fafbf6-5f71-4f31-9775-803e8c77e467", + "title": "Switchable bottom-line complexity", + "publisher": "Marvel Comics", + "release_date": "2012-08-12T00:00:00", + "issues": 156, + "main_characters": [ + "Lindsay Anderson", + "Virginia Watts", + "Robert Logan", + "Margaret Hogan" + ], + "genre": [ + "Adventure" + ] + }, + { + "_id": "f72be3a7-d4be-40a1-ad66-370b44759047", + "title": "Triple-buffered impactful customer loyalty", + "publisher": "Marvel Comics", + "release_date": "1976-09-18T00:00:00", + "issues": 275, + "main_characters": [ + "Sandra Moss", + "Charles Blair", + "Justin Martinez" + ], + "genre": [ + "Fantasy", + "Action" + ] + }, + { + "_id": "da5be16e-13e8-42d5-8954-bd89919395af", + "title": "Programmable 24/7 website", + "publisher": "DC Comics", + "release_date": "2023-11-06T00:00:00", + "issues": 278, + "main_characters": [ + "Luis Callahan", + "Carlos Burton", + "Cristian Oneal", + "Michelle Valdez" + ], + "genre": [ + "Horror", + "Fantasy" + ] + }, + { + "_id": "92afc1e6-f703-4aa7-9866-3b62f2784fec", + "title": "Advanced incremental framework", + "publisher": "Image Comics", + "release_date": "2008-07-21T00:00:00", + "issues": 109, + "main_characters": [ + "Holly Green", + "Diana Mata", + "Julie Goodwin" + ], + "genre": [ + "Horror", + "Sci-Fi" + ] + }, + { + "_id": 
"fec61fdd-bddb-431a-b14a-d81601a47cf8", + "title": "Front-line coherent system engine", + "publisher": "DC Comics", + "release_date": "2012-04-27T00:00:00", + "issues": 297, + "main_characters": [ + "Joshua Hicks" + ], + "genre": [ + "Action", + "Horror" + ] + }, + { + "_id": "9d37d0d7-1adc-4f54-8790-30f13472520c", + "title": "Progressive systematic superstructure", + "publisher": "Image Comics", + "release_date": "1996-02-20T00:00:00", + "issues": 295, + "main_characters": [ + "Margaret Hogan", + "Christopher Elliott", + "Joseph Cook" + ], + "genre": [ + "Fantasy", + "Adventure" + ] + }, + { + "_id": "338a83ad-06fc-42e1-a605-60a192ce5643", + "title": "Implemented national help-desk", + "publisher": "DC Comics", + "release_date": "2015-05-11T00:00:00", + "issues": 257, + "main_characters": [ + "Lindsay Anderson", + "James Sanchez", + "Julie Goodwin", + "Charles Blair" + ], + "genre": [ + "Action" + ] + }, + { + "_id": "5b07c17b-4df9-4b72-9c3e-b51d93def1fb", + "title": "Down-sized impactful workforce", + "publisher": "IDW Publishing", + "release_date": "2024-06-19T00:00:00", + "issues": 259, + "main_characters": [ + "Debbie Green" + ], + "genre": [ + "Sci-Fi", + "Superhero" + ] + }, + { + "_id": "625b11a5-bb45-4837-9cd6-50bfe2e3396c", + "title": "Re-engineered leadingedge structure", + "publisher": "DC Comics", + "release_date": "2011-04-14T00:00:00", + "issues": 282, + "main_characters": [ + "Larry Hensley", + "Joseph Cook", + "Brian Vincent", + "Sandra Moss" + ], + "genre": [ + "Adventure" + ] + }, + { + "_id": "71b845f3-4416-430a-81eb-8c208f824365", + "title": "Cloned 3rdgeneration contingency", + "publisher": "Dark Horse Comics", + "release_date": "2002-07-11T00:00:00", + "issues": 238, + "main_characters": [ + "Larry Hensley", + "Margaret Hogan", + "Holly Green", + "Joseph Bowman" + ], + "genre": [ + "Superhero", + "Fantasy" + ] + }, + { + "_id": "14dbf3a6-d258-4c96-8883-336b60bc2112", + "title": "Secured zero tolerance monitoring", + "publisher": "DC Comics", 
+ "release_date": "1969-11-30T00:00:00", + "issues": 104, + "main_characters": [ + "Micheal Brown" + ], + "genre": [ + "Horror", + "Superhero" + ] + }, + { + "_id": "091e16d8-d50c-4e7d-9b3a-545cf2596738", + "title": "Automated bifurcated access", + "publisher": "Image Comics", + "release_date": "1990-01-24T00:00:00", + "issues": 74, + "main_characters": [ + "Robert Logan" + ], + "genre": [ + "Sci-Fi" + ] + }, + { + "_id": "c47ec96a-4d6e-43ea-9bb5-00e4c8058b53", + "title": "Universal high-level pricing structure", + "publisher": "DC Comics", + "release_date": "1971-04-21T00:00:00", + "issues": 135, + "main_characters": [ + "Jeremy Rice", + "Elizabeth Robinson", + "James Sanchez" + ], + "genre": [ + "Action", + "Sci-Fi" + ] + }, + { + "_id": "d446a8ca-5d01-4be9-a061-027ef1f7bfc6", + "title": "Reduced optimizing strategy", + "publisher": "Dark Horse Comics", + "release_date": "1984-06-24T00:00:00", + "issues": 111, + "main_characters": [ + "Joshua Hicks", + "Jeremy Rice", + "Micheal Brown" + ], + "genre": [ + "Fantasy", + "Superhero" + ] + }, + { + "_id": "09c734ff-2bf0-4cb6-bd42-4232209c00c9", + "title": "Virtual non-volatile groupware", + "publisher": "DC Comics", + "release_date": "2013-05-22T00:00:00", + "issues": 13, + "main_characters": [ + "Luis Callahan", + "Tammy Bishop", + "Cynthia Brown" + ], + "genre": [ + "Action" + ] + }, + { + "_id": "691034fa-ad52-413e-96a2-a9a319fffe7b", + "title": "Horizontal disintermediate extranet", + "publisher": "DC Comics", + "release_date": "2021-12-03T00:00:00", + "issues": 129, + "main_characters": [ + "Margaret Hogan" + ], + "genre": [ + "Action" + ] + }, + { + "_id": "07942b5a-f7c4-4fc1-bdeb-7eb46b0d57f8", + "title": "Cross-platform discrete framework", + "publisher": "Dark Horse Comics", + "release_date": "2001-08-02T00:00:00", + "issues": 38, + "main_characters": [ + "James Sanchez", + "Larry Hensley" + ], + "genre": [ + "Superhero" + ] + }, + { + "_id": "05d637ed-3942-4276-a885-7b3363dd48e2", + "title": "Cross-platform 
regional info-mediaries", + "publisher": "Image Comics", + "release_date": "2005-03-30T00:00:00", + "issues": 150, + "main_characters": [ + "Carlos Burton" + ], + "genre": [ + "Superhero", + "Fantasy" + ] + }, + { + "_id": "88904f06-50a6-44f1-bccc-f379a9788611", + "title": "Mandatory 6thgeneration secured line", + "publisher": "Image Comics", + "release_date": "2021-06-27T00:00:00", + "issues": 262, + "main_characters": [ + "Luis Callahan" + ], + "genre": [ + "Sci-Fi", + "Superhero" + ] + }, + { + "_id": "fc961fd6-2ec6-43e5-beae-7f58a6c25d9c", + "title": "Exclusive interactive concept", + "publisher": "IDW Publishing", + "release_date": "1969-06-03T00:00:00", + "issues": 264, + "main_characters": [ + "Scott Garcia", + "Joseph Bowman" + ], + "genre": [ + "Fantasy", + "Superhero" + ] + }, + { + "_id": "481a3ea6-9629-4fe6-8a5a-eba846f0e62c", + "title": "Focused intermediate methodology", + "publisher": "DC Comics", + "release_date": "2004-03-19T00:00:00", + "issues": 210, + "main_characters": [ + "Justin Martinez", + "Julie Goodwin", + "Benjamin Morris", + "Virginia Watts" + ], + "genre": [ + "Adventure", + "Action" + ] + }, + { + "_id": "6bab6bcd-2f6b-4dfb-a030-d63b32fc6250", + "title": "Right-sized contextually-based toolset", + "publisher": "IDW Publishing", + "release_date": "2007-12-27T00:00:00", + "issues": 117, + "main_characters": [ + "Debbie Green", + "Christopher Elliott", + "Joshua Hicks" + ], + "genre": [ + "Sci-Fi", + "Action" + ] + } +] \ No newline at end of file diff --git a/tests/accuracy/test-data-dumps/comics.characters.json b/tests/accuracy/test-data-dumps/comics.characters.json new file mode 100644 index 00000000..944c33d5 --- /dev/null +++ b/tests/accuracy/test-data-dumps/comics.characters.json @@ -0,0 +1,576 @@ +[ + { + "_id": "d7047787-abea-40fa-b78e-939925fd3589", + "name": "Elizabeth Robinson", + "alias": "ashley62", + "powers": [ + "Shapeshifting", + "Telepathy", + "Flight" + ], + "first_appearance": "1961-06-23T00:00:00", + "affiliations": 
[ + "Fantastic Four", + "X-Men" + ], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "06ac8173-51a6-404c-8f9a-628de889b1de", + "name": "Joshua Wang", + "alias": "paulasmith", + "powers": [ + "Telekinesis" + ], + "first_appearance": "1987-04-16T00:00:00", + "affiliations": [ + "Fantastic Four", + "Justice League" + ], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "252c203a-0271-4ee7-a3d9-34c9f922b959", + "name": "Stephen Shaw", + "alias": "adamskenneth", + "powers": [ + "Super Speed", + "Flight" + ], + "first_appearance": "2004-07-26T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": true + }, + { + "_id": "bf5b7d04-fe71-4969-84a3-0eb9ed5d2197", + "name": "Joseph Bowman", + "alias": "amysalazar", + "powers": [ + "Time Manipulation" + ], + "first_appearance": "1961-07-03T00:00:00", + "affiliations": [ + "Teen Titans", + "Avengers" + ], + "origin": "Atlantis", + "is_villain": true + }, + { + "_id": "c6271161-bd78-4338-b6ca-88d91f7b853e", + "name": "Debbie Green", + "alias": "steventodd", + "powers": [ + "Energy Blasts", + "Regeneration" + ], + "first_appearance": "2021-12-05T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "60223f4c-5908-4f82-a2a3-a5dad1771f7f", + "name": "Christopher Elliott", + "alias": "barajasmitchell", + "powers": [ + "Flight", + "Invisibility", + "Telekinesis" + ], + "first_appearance": "1947-03-23T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "f66a8f7a-9ca3-431a-9ece-aba96be18220", + "name": "Tammy Murphy", + "alias": "jessicagill", + "powers": [ + "Super Strength", + "Telekinesis" + ], + "first_appearance": "2000-07-06T00:00:00", + "affiliations": [], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "817c0b11-3eac-4a3a-b55f-203126db060f", + "name": "Scott Garcia", + "alias": "whitechristie", + "powers": [ + "Telepathy", + "Energy Blasts" + ], + "first_appearance": "2000-11-22T00:00:00", + 
"affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "1ee6789f-d774-43b8-87e2-9f6dbac6230a", + "name": "Julie Goodwin", + "alias": "robertsmith", + "powers": [ + "Telepathy", + "Super Speed" + ], + "first_appearance": "1953-08-09T00:00:00", + "affiliations": [ + "Teen Titans" + ], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "3ab9b55d-94ab-449e-bda9-63b2c633494a", + "name": "Joshua Hicks", + "alias": "cynthia32", + "powers": [ + "Super Strength", + "Invisibility", + "Telekinesis" + ], + "first_appearance": "1967-07-17T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "51adf385-1f8e-4290-bcc6-ce2808dc461e", + "name": "Justin Martinez", + "alias": "janicebrown", + "powers": [ + "Super Speed", + "Super Strength" + ], + "first_appearance": "1973-09-19T00:00:00", + "affiliations": [ + "Avengers" + ], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "3a3d934e-f5bb-4238-b8a5-74669a937a14", + "name": "Holly Green", + "alias": "ystanley", + "powers": [ + "Shapeshifting", + "Energy Blasts" + ], + "first_appearance": "2013-08-05T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": true + }, + { + "_id": "f044b9fb-82c6-48b3-b8b2-806b0be66466", + "name": "Margaret Hogan", + "alias": "wendyconway", + "powers": [ + "Super Speed", + "Telepathy" + ], + "first_appearance": "1944-08-13T00:00:00", + "affiliations": [ + "Justice League", + "X-Men" + ], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "fd50880a-9d0e-43e1-8b20-2830eba8c7dc", + "name": "Ashley Watkins", + "alias": "cjohnson", + "powers": [ + "Shapeshifting" + ], + "first_appearance": "1940-09-13T00:00:00", + "affiliations": [ + "Fantastic Four", + "Guardians of the Galaxy" + ], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "68036d6b-1780-4352-98ea-2c68cb5c7bff", + "name": "Tammy Bishop", + "alias": "geoffreyryan", + "powers": [ + "Regeneration" + ], + "first_appearance": 
"1984-11-04T00:00:00", + "affiliations": [ + "Fantastic Four", + "X-Men" + ], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "dbfa84f2-e598-4e67-99a9-5e8c34e5606f", + "name": "Michelle Valdez", + "alias": "manuelcobb", + "powers": [ + "Regeneration", + "Energy Blasts" + ], + "first_appearance": "2014-08-04T00:00:00", + "affiliations": [ + "Teen Titans" + ], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "ae85885c-13d0-4ae2-b82c-fa53859665d7", + "name": "Joseph Cook", + "alias": "scott40", + "powers": [ + "Telepathy", + "Telekinesis" + ], + "first_appearance": "1976-04-01T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "0738b98f-4699-4609-9156-fb6a1085a503", + "name": "Jeremy Rice", + "alias": "james82", + "powers": [ + "Invisibility" + ], + "first_appearance": "1977-09-22T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "a072c5df-cc65-4044-ba24-fcc8eaa71b4a", + "name": "Chad Pham", + "alias": "smithjennifer", + "powers": [ + "Telepathy" + ], + "first_appearance": "2001-05-26T00:00:00", + "affiliations": [ + "Teen Titans" + ], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "d545ec48-680c-4493-8650-d759bedabb7e", + "name": "Diana Mata", + "alias": "zwilliamson", + "powers": [ + "Super Speed", + "Energy Blasts", + "Invisibility" + ], + "first_appearance": "2010-11-21T00:00:00", + "affiliations": [], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "e6bfb576-d65c-40f8-a547-90719578e03c", + "name": "Maria Wright", + "alias": "yraymond", + "powers": [ + "Flight", + "Telepathy" + ], + "first_appearance": "1971-04-15T00:00:00", + "affiliations": [ + "Avengers", + "Teen Titans" + ], + "origin": "Asgard", + "is_villain": true + }, + { + "_id": "a2e7b056-0c79-4a2e-83ff-1774b6e186ea", + "name": "Carlos Burton", + "alias": "rperkins", + "powers": [ + "Super Speed", + "Time Manipulation", + "Telekinesis" + ], + "first_appearance": 
"1970-01-20T00:00:00", + "affiliations": [ + "Teen Titans" + ], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "ec7f8d60-3fef-4329-a7d2-6d89805d758c", + "name": "Lindsay Anderson", + "alias": "amycox", + "powers": [ + "Super Strength", + "Telekinesis" + ], + "first_appearance": "1976-04-30T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "cdc66356-a438-4989-b4d1-315609ec6d91", + "name": "Larry Hensley", + "alias": "ylester", + "powers": [ + "Super Strength", + "Invisibility", + "Shapeshifting" + ], + "first_appearance": "2019-01-21T00:00:00", + "affiliations": [ + "Guardians of the Galaxy", + "Avengers" + ], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "0952b684-f887-446f-afcb-71d2ace3fd32", + "name": "Sandra Moss", + "alias": "alexandra81", + "powers": [ + "Telekinesis", + "Super Speed" + ], + "first_appearance": "1989-07-28T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "9a63c787-3b44-46c2-b927-ffdde6ee10bc", + "name": "Cynthia Brown", + "alias": "freed", + "powers": [ + "Super Strength", + "Energy Blasts" + ], + "first_appearance": "2015-06-19T00:00:00", + "affiliations": [ + "Fantastic Four" + ], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "2b058c3e-e795-4ecd-b5d7-dba6f1a831f6", + "name": "Brian Vincent", + "alias": "ghowell", + "powers": [ + "Invisibility", + "Flight", + "Super Speed" + ], + "first_appearance": "2012-05-12T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "7a1e38ae-0bc6-41dd-ad61-e7542e6e9d4f", + "name": "Kevin Humphrey", + "alias": "mary44", + "powers": [ + "Super Strength", + "Super Speed", + "Telepathy" + ], + "first_appearance": "1993-05-10T00:00:00", + "affiliations": [ + "Justice League", + "Teen Titans" + ], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "c147036a-ab66-4023-a950-1fb81acf7dca", + "name": "Luis Callahan", + "alias": 
"ashleyreeves", + "powers": [ + "Telekinesis" + ], + "first_appearance": "1943-11-02T00:00:00", + "affiliations": [ + "X-Men" + ], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "c42cec2b-156d-481e-993b-aa93637ae76e", + "name": "Micheal Brown", + "alias": "lisa85", + "powers": [ + "Telepathy", + "Flight", + "Time Manipulation" + ], + "first_appearance": "1983-11-04T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "5bd85192-926b-42f3-bc18-afd40a53753e", + "name": "James Sanchez", + "alias": "mary95", + "powers": [ + "Energy Blasts", + "Telekinesis" + ], + "first_appearance": "1999-05-20T00:00:00", + "affiliations": [ + "Justice League" + ], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "4b41e8f8-2cea-4d50-b7b0-ec59fca45367", + "name": "Richard Cooper", + "alias": "james85", + "powers": [ + "Telekinesis", + "Energy Blasts", + "Super Speed" + ], + "first_appearance": "2021-11-27T00:00:00", + "affiliations": [ + "Justice League", + "Fantastic Four" + ], + "origin": "Mars", + "is_villain": true + }, + { + "_id": "8fd8c7b5-fabd-4021-9aeb-114e64ad06e0", + "name": "Charles Blair", + "alias": "barbara60", + "powers": [ + "Super Strength" + ], + "first_appearance": "2012-05-03T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "830eaa54-4397-4344-8964-2abdd7e2d86d", + "name": "Virginia Watts", + "alias": "klane", + "powers": [ + "Telekinesis" + ], + "first_appearance": "2016-04-27T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "495f64a9-123e-46d4-9ddb-21692353a849", + "name": "Robert Logan", + "alias": "griffinsean", + "powers": [ + "Telepathy" + ], + "first_appearance": "2003-07-16T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "e3a96aac-bd9f-49f0-a9ea-efa7d6baf3e9", + "name": "Cheyenne Powell", + "alias": "laurenolsen", + "powers": [ + "Time 
Manipulation", + "Energy Blasts" + ], + "first_appearance": "1964-02-05T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "2688321c-f5b0-43c8-b95c-060e748ba73b", + "name": "Benjamin Morris", + "alias": "sierra18", + "powers": [ + "Telekinesis", + "Regeneration", + "Shapeshifting" + ], + "first_appearance": "1964-09-27T00:00:00", + "affiliations": [ + "X-Men", + "Avengers" + ], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "98c4ca66-c7a7-44ad-ad16-5395905a011e", + "name": "Cristian Oneal", + "alias": "harrellamy", + "powers": [ + "Super Speed" + ], + "first_appearance": "1965-01-29T00:00:00", + "affiliations": [], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "e2999d26-1a93-4355-b04f-44f27a3c7f36", + "name": "Jessica Vargas", + "alias": "chadherrera", + "powers": [ + "Energy Blasts", + "Super Strength", + "Telekinesis" + ], + "first_appearance": "1974-03-29T00:00:00", + "affiliations": [ + "X-Men", + "Teen Titans" + ], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "f3fa712d-2124-433a-b405-c02757fa1503", + "name": "Angelica Stein", + "alias": "reedjason", + "powers": [ + "Invisibility" + ], + "first_appearance": "1981-01-02T00:00:00", + "affiliations": [ + "Avengers" + ], + "origin": "Earth", + "is_villain": true + } +] \ No newline at end of file diff --git a/tests/accuracy/test-data-dumps/mflix.movies.json b/tests/accuracy/test-data-dumps/mflix.movies.json new file mode 100644 index 00000000..cd35382e --- /dev/null +++ b/tests/accuracy/test-data-dumps/mflix.movies.json @@ -0,0 +1,687 @@ +[ + { + "_id": "bf96c9f7-17be-467c-9f5e-3f19dc2e9ed4", + "title": "Human sell", + "release_year": 1993, + "genres": [ + "Sci-Fi" + ], + "director": "Christina Collins", + "cast": [ + "Jeremy Marks", + "Matthew Moore", + "Erica Miller", + "Beth Morales" + ], + "runtime": 139, + "rating": 9.3 + }, + { + "_id": "ab338dcb-c541-4d39-ba3d-58e4ebcac16c", + "title": "Trial we much", + "release_year": 
2020, + "genres": [ + "Horror", + "Comedy" + ], + "director": "Steven Miles", + "cast": [ + "Patrick Huynh", + "Darrell Thompson", + "Lindsay Thompson", + "Brandi Cooper" + ], + "runtime": 149, + "rating": 5.0 + }, + { + "_id": "2bd3ed9f-cbeb-4c44-bec7-01d51c3dd7db", + "title": "Someone", + "release_year": 1996, + "genres": [ + "Action", + "Horror" + ], + "director": "Steven Miles", + "cast": [ + "Carrie Cummings", + "Patricia Rice", + "Suzanne Collins", + "April Murray", + "Kimberly Shaw" + ], + "runtime": 153, + "rating": 2.6 + }, + { + "_id": "fb35d6f3-bda5-450f-8873-56e035e76c42", + "title": "Without our", + "release_year": 2012, + "genres": [ + "Comedy" + ], + "director": "Christina Collins", + "cast": [ + "Rodney Gray", + "Mr. Joseph Allen", + "Heather Robles", + "Eric Edwards", + "James Wilson" + ], + "runtime": 143, + "rating": 9.1 + }, + { + "_id": "4b0d5f7a-c551-4995-aece-a5a585d238a7", + "title": "Cost anything", + "release_year": 2002, + "genres": [ + "Romance", + "Action" + ], + "director": "Bryan Andrews", + "cast": [ + "Gregory Mullins", + "Jillian Arroyo", + "Angela Reed" + ], + "runtime": 112, + "rating": 3.8 + }, + { + "_id": "797e4ee5-eff4-45f4-a0d7-40f62f7bd138", + "title": "Hold green energy their", + "release_year": 1989, + "genres": [ + "Horror" + ], + "director": "Christina Collins", + "cast": [ + "Eduardo Carey", + "Jodi Miller", + "Ronald Johnson", + "Lindsay Hernandez" + ], + "runtime": 126, + "rating": 7.4 + }, + { + "_id": "1b81c45b-1d09-47dc-871f-ace109107446", + "title": "Choose ability start", + "release_year": 1990, + "genres": [ + "Drama", + "Comedy" + ], + "director": "Bryan Andrews", + "cast": [ + "Tyler Daniels", + "Gregory Harris", + "Whitney Swanson", + "Pamela Ramirez" + ], + "runtime": 141, + "rating": 5.6 + }, + { + "_id": "400a08be-f07b-416a-8cdc-46c9886b812b", + "title": "Cover perhaps", + "release_year": 2022, + "genres": [ + "Drama" + ], + "director": "Daniel Wallace", + "cast": [ + "Victoria Price", + "Holly Ross", + 
"Michele Jones" + ], + "runtime": 173, + "rating": 4.3 + }, + { + "_id": "4d4b5420-83e1-4ecd-9c86-238394a1fd0f", + "title": "Policy particularly", + "release_year": 2003, + "genres": [ + "Comedy" + ], + "director": "Brittany Parker", + "cast": [ + "Emily Haynes", + "Crystal Johnson", + "Ernest Jones" + ], + "runtime": 154, + "rating": 6.6 + }, + { + "_id": "9a489559-ab9d-4dbb-b3e7-d65895b27704", + "title": "Store care", + "release_year": 2017, + "genres": [ + "Romance", + "Sci-Fi" + ], + "director": "Sara Stewart", + "cast": [ + "Katherine Matthews", + "Stacey Wolf", + "Laurie Blackwell", + "Luis Ortiz", + "Christopher Vasquez" + ], + "runtime": 168, + "rating": 7.7 + }, + { + "_id": "99e75e60-6466-4314-92c3-00c433a06600", + "title": "Section close bad", + "release_year": 2024, + "genres": [ + "Drama", + "Comedy" + ], + "director": "Bryan Andrews", + "cast": [ + "Heather Marshall", + "Alexander Austin", + "Stephanie Villarreal MD", + "Ryan Marquez" + ], + "runtime": 180, + "rating": 7.7 + }, + { + "_id": "726d0c12-4bab-4684-b8e4-5ba795c88273", + "title": "Become stand", + "release_year": 2001, + "genres": [ + "Sci-Fi", + "Thriller" + ], + "director": "Brian Martinez", + "cast": [ + "Robert Ross", + "Kimberly Williamson", + "Pam Wyatt" + ], + "runtime": 162, + "rating": 1.5 + }, + { + "_id": "aad23b4b-ddb9-48bd-9b48-b63da1874bb0", + "title": "I case", + "release_year": 2012, + "genres": [ + "Drama", + "Comedy" + ], + "director": "Brittany Parker", + "cast": [ + "Justin Davis", + "Karen Doyle", + "Daniel Jackson", + "Courtney Mcdonald" + ], + "runtime": 122, + "rating": 3.1 + }, + { + "_id": "0d1ce099-18f1-4608-9c5b-5eb8b5870760", + "title": "No organization style", + "release_year": 2013, + "genres": [ + "Comedy" + ], + "director": "Christina Collins", + "cast": [ + "Benjamin Whitney", + "Joseph Bush", + "Barbara Griffin" + ], + "runtime": 167, + "rating": 9.6 + }, + { + "_id": "15855c7b-ece2-4238-b995-57f6207509ea", + "title": "Computer garden", + "release_year": 
2012, + "genres": [ + "Horror" + ], + "director": "Steven Miles", + "cast": [ + "Darlene Lee", + "Tina Wang", + "Nathan Mayo" + ], + "runtime": 146, + "rating": 6.5 + }, + { + "_id": "e8a6ff98-1e7e-4481-a467-39ebbfc79f67", + "title": "Trip information feel", + "release_year": 2008, + "genres": [ + "Action", + "Thriller" + ], + "director": "Brittany Parker", + "cast": [ + "Kelly Walsh", + "Michael Rocha" + ], + "runtime": 148, + "rating": 9.8 + }, + { + "_id": "ef95e7a5-7f73-462e-bd03-c924a8876a7b", + "title": "It project low part", + "release_year": 1992, + "genres": [ + "Horror" + ], + "director": "Christina Collins", + "cast": [ + "Sheena Murphy", + "Amanda Miller", + "Erica Curtis", + "Roger Jones", + "Andrew Simpson" + ], + "runtime": 161, + "rating": 2.4 + }, + { + "_id": "efd2f4f4-1004-4b4e-8bc9-390466a6f77a", + "title": "Near attorney discuss", + "release_year": 1983, + "genres": [ + "Comedy" + ], + "director": "Christina Collins", + "cast": [ + "Chase Myers", + "Benjamin Kelly", + "Thomas Summers MD", + "Jessica Woods" + ], + "runtime": 174, + "rating": 9.5 + }, + { + "_id": "07f2cb6e-819e-4ff4-b3ba-134d3d9af549", + "title": "Whether know", + "release_year": 2009, + "genres": [ + "Comedy", + "Thriller" + ], + "director": "Bryan Andrews", + "cast": [ + "Amy Reed", + "William Williams", + "Steven Lawrence" + ], + "runtime": 134, + "rating": 9.6 + }, + { + "_id": "ab5948c9-088b-42d6-89d9-42c4603c8b19", + "title": "Against place", + "release_year": 2017, + "genres": [ + "Drama", + "Romance" + ], + "director": "Daniel Wallace", + "cast": [ + "Brittany Thompson", + "Clinton Bishop", + "Terri Meyer", + "Stacey Phillips", + "Alexander Hunt" + ], + "runtime": 152, + "rating": 5.0 + }, + { + "_id": "ef7f63fa-b25f-4aea-98e2-d7bdecc26ef5", + "title": "Return yard", + "release_year": 1994, + "genres": [ + "Horror" + ], + "director": "Christina Collins", + "cast": [ + "Mason Lara", + "Taylor Salinas", + "Tim Foster", + "Erin Sharp" + ], + "runtime": 99, + "rating": 8.8 + 
}, + { + "_id": "b532e3c8-6292-4f9d-879f-1f070b1a6992", + "title": "Certain fish", + "release_year": 2009, + "genres": [ + "Romance" + ], + "director": "Steven Miles", + "cast": [ + "Jonathan King", + "Caitlyn Costa DDS", + "Steve Davis", + "Perry Anderson" + ], + "runtime": 130, + "rating": 8.6 + }, + { + "_id": "c95e74b0-e47e-4d10-b847-8caa20b94b32", + "title": "Agreement like program", + "release_year": 2004, + "genres": [ + "Sci-Fi" + ], + "director": "Daniel Jackson", + "cast": [ + "Ashley Green", + "Rebecca Osborne", + "Robert Williams", + "Breanna Dunn", + "Philip Vargas" + ], + "runtime": 110, + "rating": 8.1 + }, + { + "_id": "791688be-4358-45ab-956e-71fe3fd35d19", + "title": "Floor seven then", + "release_year": 2009, + "genres": [ + "Horror" + ], + "director": "Daniel Wallace", + "cast": [ + "Dustin Wright", + "Crystal Young" + ], + "runtime": 143, + "rating": 4.8 + }, + { + "_id": "488fd79d-dde6-4462-9b90-339d1f3d7474", + "title": "Like rather paper", + "release_year": 2006, + "genres": [ + "Drama" + ], + "director": "Spencer Gillespie", + "cast": [ + "Sean Moyer", + "James Edwards", + "Tara Lee", + "Robert Scott" + ], + "runtime": 175, + "rating": 9.1 + }, + { + "_id": "3da68e4d-ef14-4fab-9243-19075262e5ca", + "title": "Argue hospital", + "release_year": 1994, + "genres": [ + "Romance", + "Sci-Fi" + ], + "director": "Amanda Young", + "cast": [ + "Carolyn Williams", + "Jasmin Sampson", + "Phillip Levy", + "Brenda Clark", + "Lauren Perry" + ], + "runtime": 149, + "rating": 9.5 + }, + { + "_id": "f5206a16-4dca-4c1e-b3aa-0d09f2082601", + "title": "Become after card", + "release_year": 1986, + "genres": [ + "Sci-Fi", + "Horror" + ], + "director": "Brian Martinez", + "cast": [ + "Rhonda Ochoa", + "Charlene Castillo" + ], + "runtime": 100, + "rating": 8.5 + }, + { + "_id": "fbf30e42-ae6d-4775-bb3e-c5c127ddea06", + "title": "Born authority attention", + "release_year": 1994, + "genres": [ + "Romance" + ], + "director": "Brian Martinez", + "cast": [ + "Matthew 
Thomas", + "Carly Perkins" + ], + "runtime": 131, + "rating": 4.9 + }, + { + "_id": "4b85a220-8a09-46a7-bea3-a2dad8130311", + "title": "Local seven media", + "release_year": 1998, + "genres": [ + "Sci-Fi", + "Drama" + ], + "director": "Amanda Young", + "cast": [ + "Jessica Perez", + "Larry Atkinson" + ], + "runtime": 95, + "rating": 2.0 + }, + { + "_id": "498597d2-3254-46ef-a800-f322a86fbd55", + "title": "Keep employee", + "release_year": 1981, + "genres": [ + "Horror" + ], + "director": "Christina Collins", + "cast": [ + "Alexis Carlson", + "Andrew Stewart" + ], + "runtime": 161, + "rating": 6.0 + }, + { + "_id": "788d9343-6908-4762-88ee-b04aba1e58b5", + "title": "American question generation", + "release_year": 1986, + "genres": [ + "Romance" + ], + "director": "Daniel Jackson", + "cast": [ + "Troy Carter", + "Peter Hernandez", + "Christine Brown" + ], + "runtime": 176, + "rating": 8.0 + }, + { + "_id": "74bcf255-df91-40c0-85c0-d7b85ff84f9a", + "title": "Maintain out", + "release_year": 2000, + "genres": [ + "Sci-Fi", + "Action" + ], + "director": "Brian Martinez", + "cast": [ + "Nancy Evans", + "Michael Gill", + "Justin Carroll" + ], + "runtime": 179, + "rating": 10.0 + }, + { + "_id": "61ddf1d4-17b7-4c63-9bf4-5315e740dc7f", + "title": "Ten box study", + "release_year": 2011, + "genres": [ + "Horror", + "Romance" + ], + "director": "Steven Miles", + "cast": [ + "Mark Hicks", + "Michelle Dean", + "John Buchanan", + "Veronica Johnson" + ], + "runtime": 147, + "rating": 2.5 + }, + { + "_id": "ab7d8067-f0ff-4955-bc0c-baca4e56e9a4", + "title": "Production operation", + "release_year": 2014, + "genres": [ + "Horror", + "Romance" + ], + "director": "Sara Stewart", + "cast": [ + "Ashley Mata", + "Mark Kelly", + "John West", + "Harold Day" + ], + "runtime": 125, + "rating": 4.1 + }, + { + "_id": "ccd27288-a496-447d-b01c-1f0b42edcc92", + "title": "What language", + "release_year": 2004, + "genres": [ + "Sci-Fi" + ], + "director": "Sara Stewart", + "cast": [ + "Scott 
Mckenzie", + "Jason Lee", + "Nathan Gardner", + "Jamie Greene", + "Angela Garner" + ], + "runtime": 177, + "rating": 3.7 + }, + { + "_id": "b32dd176-938b-4ded-823a-311423fdc2ea", + "title": "Up usually central", + "release_year": 2011, + "genres": [ + "Sci-Fi", + "Comedy" + ], + "director": "Daniel Jackson", + "cast": [ + "Jennifer Carlson", + "Jonathan Stewart DDS", + "Amy Lester" + ], + "runtime": 159, + "rating": 5.6 + }, + { + "_id": "4aa5f384-3a05-49ff-aa9d-a0e4256c422f", + "title": "For boy only", + "release_year": 1987, + "genres": [ + "Thriller", + "Action" + ], + "director": "Sara Stewart", + "cast": [ + "Gene Smith", + "Robert Osborne Jr.", + "Laura Fox", + "Alexis Lowe" + ], + "runtime": 95, + "rating": 3.6 + }, + { + "_id": "1c858ca4-d6e9-435c-8e25-d8b05a4e825c", + "title": "Site win including your", + "release_year": 2008, + "genres": [ + "Sci-Fi" + ], + "director": "Spencer Gillespie", + "cast": [ + "John Williams", + "Jason Huang", + "Karen Klein", + "Gary Tran", + "Jessica Murphy" + ], + "runtime": 178, + "rating": 6.2 + }, + { + "_id": "bc5e5766-e998-4ec2-a40c-62ce5d39b972", + "title": "Sell huge hair", + "release_year": 1997, + "genres": [ + "Thriller", + "Action" + ], + "director": "Bryan Andrews", + "cast": [ + "Thomas Johnson", + "Ryan Morrow" + ], + "runtime": 157, + "rating": 4.4 + }, + { + "_id": "090215c8-29e8-4d38-ae9b-ceb78408b982", + "title": "Guy rest", + "release_year": 1997, + "genres": [ + "Sci-Fi", + "Horror" + ], + "director": "Steven Miles", + "cast": [ + "Michael Fox", + "Tyler Acosta", + "Tracy Adams" + ], + "runtime": 122, + "rating": 7.8 + } +] \ No newline at end of file diff --git a/tests/accuracy/test-data-dumps/mflix.shows.json b/tests/accuracy/test-data-dumps/mflix.shows.json new file mode 100644 index 00000000..e91c26bb --- /dev/null +++ b/tests/accuracy/test-data-dumps/mflix.shows.json @@ -0,0 +1,750 @@ +[ + { + "_id": "b586e37c-6b32-417d-a53c-2a4c1121b11b", + "title": "Object-based analyzing architecture", + "seasons": 
8, + "episodes": 62, + "platform": "Amazon Prime", + "genres": [ + "Comedy" + ], + "cast": [ + "Roger Gomez", + "Sandra Williams", + "Matthew Rodriguez", + "Scott Brown", + "Kristie Horn", + "Nicole Avila" + ], + "start_year": 2014, + "end_year": null + }, + { + "_id": "c28471ea-336f-4060-9b18-0bbff3de6622", + "title": "Customer-focused encompassing architecture", + "seasons": 4, + "episodes": 108, + "platform": "Hulu", + "genres": [ + "Thriller" + ], + "cast": [ + "Joseph Holmes", + "Patrick Smith", + "Charles Delacruz" + ], + "start_year": 2001, + "end_year": null + }, + { + "_id": "93f0969b-2377-4531-9c4e-45d2593015cd", + "title": "User-centric background approach", + "seasons": 6, + "episodes": 49, + "platform": "HBO", + "genres": [ + "Comedy", + "Documentary" + ], + "cast": [ + "Jason Castillo", + "Jessica Burke", + "Philip Lewis", + "Philip Goodman", + "Corey Lee" + ], + "start_year": 2016, + "end_year": 2018 + }, + { + "_id": "a0b76db0-99a1-49fe-a5ea-fe802a66bde9", + "title": "Networked directional budgetary management", + "seasons": 5, + "episodes": 23, + "platform": "Amazon Prime", + "genres": [ + "Comedy", + "Thriller" + ], + "cast": [ + "Mark Allen", + "Anthony Snyder", + "Kimberly Jones" + ], + "start_year": 2002, + "end_year": null + }, + { + "_id": "fbdef9b9-1ad4-4a6b-a39a-2e0b90423cb5", + "title": "Enterprise-wide dynamic intranet", + "seasons": 1, + "episodes": 12, + "platform": "Amazon Prime", + "genres": [ + "Crime", + "Documentary" + ], + "cast": [ + "Matthew Green", + "Kelly Wright", + "Tonya Sullivan", + "Daniel Brown" + ], + "start_year": 2009, + "end_year": 2020 + }, + { + "_id": "db54ab5c-bf6b-48ea-8272-1b1a4a76b848", + "title": "Exclusive real-time access", + "seasons": 10, + "episodes": 76, + "platform": "Amazon Prime", + "genres": [ + "Drama" + ], + "cast": [ + "Stacey Shaw", + "Zachary Steele", + "Laurie Martinez" + ], + "start_year": 2011, + "end_year": 2020 + }, + { + "_id": "53869b62-c8c7-48b3-86c9-17c935b43ff6", + "title": 
"Persevering leadingedge application", + "seasons": 5, + "episodes": 73, + "platform": "HBO", + "genres": [ + "Thriller" + ], + "cast": [ + "Diane Boyd", + "Anna Rubio", + "Cheryl Fisher", + "Tyler Villa" + ], + "start_year": 2008, + "end_year": 2020 + }, + { + "_id": "3be07c4d-5275-4181-b2f6-5b1a1e46aa7b", + "title": "Multi-lateral analyzing model", + "seasons": 2, + "episodes": 114, + "platform": "Amazon Prime", + "genres": [ + "Fantasy" + ], + "cast": [ + "Kathleen Marshall", + "Kimberly Quinn", + "Steven Parker", + "Adrienne Green", + "Justin Hughes", + "Jean Smith" + ], + "start_year": 2017, + "end_year": 2023 + }, + { + "_id": "50cb455b-5ec0-4e68-8601-43e58defb762", + "title": "User-centric tangible monitoring", + "seasons": 3, + "episodes": 55, + "platform": "Disney+", + "genres": [ + "Drama" + ], + "cast": [ + "Barbara Clark", + "Carolyn Scott", + "Timothy Reed", + "Cory Burton", + "Jacob Hill" + ], + "start_year": 2006, + "end_year": 2012 + }, + { + "_id": "bab2dba4-88bd-4b24-afce-8781eb280d53", + "title": "Persevering background monitoring", + "seasons": 4, + "episodes": 61, + "platform": "Amazon Prime", + "genres": [ + "Comedy", + "Fantasy" + ], + "cast": [ + "Adam Lin", + "Evan Smith", + "Christine Howard", + "Ruben Hopkins" + ], + "start_year": 2006, + "end_year": 2023 + }, + { + "_id": "518f2ad9-bb65-4228-8d4c-7a62b9f88599", + "title": "Cross-group intangible architecture", + "seasons": 1, + "episodes": 90, + "platform": "HBO", + "genres": [ + "Comedy" + ], + "cast": [ + "Eric Ryan", + "Ashley Ball", + "Douglas Barton", + "Brian Whitehead", + "Michael Greer" + ], + "start_year": 2018, + "end_year": null + }, + { + "_id": "d5f9304d-567d-4335-b43c-ec4034d7009f", + "title": "Programmable bottom-line monitoring", + "seasons": 10, + "episodes": 69, + "platform": "Hulu", + "genres": [ + "Documentary", + "Fantasy" + ], + "cast": [ + "Mrs. 
Olivia Booth", + "William Murphy", + "Patricia Payne", + "Lisa Estes", + "Jason Martin", + "Jeff Greene" + ], + "start_year": 2011, + "end_year": 2024 + }, + { + "_id": "27718a30-6e42-47ad-8adf-1533b9b8a419", + "title": "Multi-lateral multi-tasking contingency", + "seasons": 3, + "episodes": 89, + "platform": "Disney+", + "genres": [ + "Crime" + ], + "cast": [ + "Elizabeth Lambert", + "Corey Hughes", + "Melissa Stephens" + ], + "start_year": 2006, + "end_year": null + }, + { + "_id": "defc7620-3b4e-46ff-a949-bec1af753812", + "title": "Focused zero administration migration", + "seasons": 9, + "episodes": 73, + "platform": "Disney+", + "genres": [ + "Documentary", + "Drama" + ], + "cast": [ + "Shane Richardson", + "Lisa Cooper", + "Samantha Perkins" + ], + "start_year": 2008, + "end_year": null + }, + { + "_id": "9d6781fb-d095-4a00-932d-3f1fac1b0049", + "title": "Horizontal methodical encoding", + "seasons": 8, + "episodes": 40, + "platform": "Netflix", + "genres": [ + "Crime" + ], + "cast": [ + "Patricia Barrett", + "Scott Gonzalez", + "Michaela Johnson" + ], + "start_year": 2006, + "end_year": null + }, + { + "_id": "ac19b1b1-2bf9-4093-83fa-60411aa3f80f", + "title": "Enterprise-wide analyzing product", + "seasons": 8, + "episodes": 61, + "platform": "Hulu", + "genres": [ + "Drama" + ], + "cast": [ + "Christie Waters", + "Casey Allen", + "Nicole Frank" + ], + "start_year": 2001, + "end_year": 2005 + }, + { + "_id": "2dfd2240-dc9f-439f-9e06-b1ec8de397bf", + "title": "Compatible well-modulated extranet", + "seasons": 10, + "episodes": 89, + "platform": "Hulu", + "genres": [ + "Drama" + ], + "cast": [ + "Pedro Butler", + "Christian Hall", + "Dawn Gregory", + "Shannon Russell", + "Omar Mullins", + "Ian Ramos" + ], + "start_year": 2012, + "end_year": 2013 + }, + { + "_id": "94db1534-7163-430e-83e3-6a75bc6aec0f", + "title": "User-centric tangible infrastructure", + "seasons": 5, + "episodes": 11, + "platform": "Hulu", + "genres": [ + "Drama" + ], + "cast": [ + "Deborah 
Garcia", + "Michelle Barajas", + "Melissa Reynolds", + "Douglas Wilson" + ], + "start_year": 2001, + "end_year": null + }, + { + "_id": "65b2213f-a606-42d8-b845-0199ba2e9b82", + "title": "Inverse optimal circuit", + "seasons": 1, + "episodes": 29, + "platform": "Amazon Prime", + "genres": [ + "Fantasy", + "Documentary" + ], + "cast": [ + "Grace Rodriguez", + "Alison Greene", + "Michael Allen", + "Steven Hayden" + ], + "start_year": 2013, + "end_year": null + }, + { + "_id": "5a8a2745-e57c-4086-aa09-84131f40149f", + "title": "Public-key discrete alliance", + "seasons": 9, + "episodes": 111, + "platform": "Disney+", + "genres": [ + "Documentary" + ], + "cast": [ + "Emily Irwin", + "Olivia Gibson", + "Jean Hernandez", + "Michael Cummings" + ], + "start_year": 2013, + "end_year": 2022 + }, + { + "_id": "51326558-2080-4615-a583-b4f2fbd15600", + "title": "Managed zero administration groupware", + "seasons": 8, + "episodes": 108, + "platform": "Hulu", + "genres": [ + "Drama", + "Crime" + ], + "cast": [ + "Karen Phillips", + "Kelly Marsh", + "Daniel Hamilton", + "Abigail Smith" + ], + "start_year": 2018, + "end_year": 2019 + }, + { + "_id": "87a2cd5f-75ee-4650-b2a4-a56384c97137", + "title": "Reverse-engineered static initiative", + "seasons": 6, + "episodes": 66, + "platform": "Amazon Prime", + "genres": [ + "Crime", + "Documentary" + ], + "cast": [ + "Bradley Chavez", + "Catherine Horn", + "Joseph Bryant", + "Tara Rodriguez" + ], + "start_year": 2003, + "end_year": 2006 + }, + { + "_id": "0f647458-d09f-4be8-b1dc-49be1ba1e104", + "title": "Fundamental tangible matrices", + "seasons": 9, + "episodes": 22, + "platform": "Hulu", + "genres": [ + "Drama" + ], + "cast": [ + "Eric Lee", + "Patrick Estrada", + "Kelsey Brown", + "Jeffrey Lewis" + ], + "start_year": 2001, + "end_year": null + }, + { + "_id": "53d34237-0e86-4a5e-922b-0589c2e65458", + "title": "Self-enabling homogeneous infrastructure", + "seasons": 5, + "episodes": 35, + "platform": "Hulu", + "genres": [ + "Crime" + 
], + "cast": [ + "Chad Torres", + "Mark Williams", + "Terry Mcguire", + "Kathleen Cantu", + "Harold Knapp" + ], + "start_year": 2006, + "end_year": null + }, + { + "_id": "71cc1515-ba84-4df6-92db-55af3cfa91f0", + "title": "Horizontal web-enabled application", + "seasons": 2, + "episodes": 94, + "platform": "Netflix", + "genres": [ + "Thriller", + "Fantasy" + ], + "cast": [ + "Catherine Davila", + "Jessica James", + "Cory Miller", + "Alexis Sanchez", + "Andrew Miller" + ], + "start_year": 2002, + "end_year": 2017 + }, + { + "_id": "200556f7-10c6-4414-83f7-24ef74bff12a", + "title": "User-friendly bi-directional data-warehouse", + "seasons": 2, + "episodes": 87, + "platform": "Hulu", + "genres": [ + "Drama", + "Fantasy" + ], + "cast": [ + "Tiffany Brown", + "Christina Morales", + "Samuel Blake", + "Stephanie Johnson", + "Wesley Deleon" + ], + "start_year": 2020, + "end_year": null + }, + { + "_id": "613832c9-5307-4c80-9dde-3eab4e5aa770", + "title": "Pre-emptive leadingedge capacity", + "seasons": 5, + "episodes": 56, + "platform": "Netflix", + "genres": [ + "Comedy" + ], + "cast": [ + "James Durham", + "Jessica Myers", + "Rachel King" + ], + "start_year": 2005, + "end_year": null + }, + { + "_id": "f9cb1076-3eaf-41d2-84df-057d27c1a544", + "title": "Fundamental intangible contingency", + "seasons": 4, + "episodes": 99, + "platform": "Disney+", + "genres": [ + "Crime", + "Fantasy" + ], + "cast": [ + "Robert Foster", + "Jill Barton", + "Kimberly Simmons", + "Tracey Gomez" + ], + "start_year": 2017, + "end_year": 2020 + }, + { + "_id": "f96b112f-943e-43cd-90f0-56725cfa7e59", + "title": "Diverse asymmetric forecast", + "seasons": 9, + "episodes": 24, + "platform": "Amazon Prime", + "genres": [ + "Drama", + "Crime" + ], + "cast": [ + "Carl Johnson", + "Douglas Beck", + "Kevin Guerra", + "Taylor Wilson", + "Eric Jarvis", + "Sarah Charles MD" + ], + "start_year": 2007, + "end_year": null + }, + { + "_id": "78eb682f-a03d-4cbf-bbfc-0e899e5f50d0", + "title": "Profit-focused 
solution-oriented Graphical User Interface", + "seasons": 10, + "episodes": 117, + "platform": "HBO", + "genres": [ + "Crime", + "Fantasy" + ], + "cast": [ + "Carol Miller", + "Jennifer Bass", + "Melanie Leblanc" + ], + "start_year": 2002, + "end_year": null + }, + { + "_id": "ebb6d3c9-3c98-4799-94bc-aadd0bf2974c", + "title": "Reduced leadingedge system engine", + "seasons": 1, + "episodes": 58, + "platform": "Hulu", + "genres": [ + "Crime", + "Drama" + ], + "cast": [ + "James Warren", + "Kelly Carter", + "Sarah Jones", + "Aaron Castaneda", + "Katherine Manning" + ], + "start_year": 2011, + "end_year": null + }, + { + "_id": "4ffd32a7-0bf4-4c95-a7c8-19002c2eb83c", + "title": "Switchable 24/7 website", + "seasons": 6, + "episodes": 71, + "platform": "Netflix", + "genres": [ + "Documentary" + ], + "cast": [ + "Sarah Brown", + "Patrick Beck", + "Angela Herrera MD", + "Steven Mcconnell" + ], + "start_year": 2018, + "end_year": null + }, + { + "_id": "37267325-4337-4912-992f-a162f9014569", + "title": "Synergized asymmetric adapter", + "seasons": 4, + "episodes": 16, + "platform": "Hulu", + "genres": [ + "Fantasy" + ], + "cast": [ + "Gabrielle Meyer", + "Madison Matthews", + "Taylor Martinez" + ], + "start_year": 2010, + "end_year": null + }, + { + "_id": "ea2abd77-c7da-443e-89fd-6f410f5d697e", + "title": "Extended contextually-based customer loyalty", + "seasons": 1, + "episodes": 79, + "platform": "Hulu", + "genres": [ + "Fantasy" + ], + "cast": [ + "Michael Lewis", + "Cassandra Hicks", + "Sydney Garcia" + ], + "start_year": 2015, + "end_year": 2023 + }, + { + "_id": "b568dd56-c083-4431-a740-4f4b5f4e1b21", + "title": "Versatile grid-enabled application", + "seasons": 7, + "episodes": 82, + "platform": "Hulu", + "genres": [ + "Crime", + "Fantasy" + ], + "cast": [ + "Keith Brown", + "Annette Johnson", + "Joseph Carroll", + "Derek Lewis" + ], + "start_year": 2006, + "end_year": 2008 + }, + { + "_id": "b6f2e1c3-6915-4e02-b1c2-44b5bec8fd68", + "title": "Operative optimizing 
encryption", + "seasons": 2, + "episodes": 52, + "platform": "Amazon Prime", + "genres": [ + "Fantasy", + "Drama" + ], + "cast": [ + "Garrett Mcgrath", + "Craig Jackson", + "Michael Sullivan", + "Andrew Boyer" + ], + "start_year": 2011, + "end_year": null + }, + { + "_id": "51c225d5-aa67-4b14-aca5-33757cef6bf4", + "title": "Business-focused 24/7 collaboration", + "seasons": 1, + "episodes": 113, + "platform": "Netflix", + "genres": [ + "Thriller", + "Comedy" + ], + "cast": [ + "Matthew Hill", + "Andrew White", + "Grant Young", + "John Mathews" + ], + "start_year": 2015, + "end_year": 2020 + }, + { + "_id": "7465e69f-341e-4234-8ffb-400622442a40", + "title": "Organized bi-directional application", + "seasons": 3, + "episodes": 40, + "platform": "Netflix", + "genres": [ + "Comedy" + ], + "cast": [ + "Matthew Gordon", + "Mark Allen", + "Amanda Webb", + "Jeffrey Horton", + "Sheila Lewis", + "Marcus Gilbert" + ], + "start_year": 2011, + "end_year": null + }, + { + "_id": "90570eac-f923-4c30-a5b0-661b28a8e4a5", + "title": "Configurable bottom-line success", + "seasons": 10, + "episodes": 106, + "platform": "HBO", + "genres": [ + "Fantasy", + "Drama" + ], + "cast": [ + "Elizabeth Taylor", + "Melissa Mullins", + "Alan Nguyen", + "Carolyn Kidd", + "Michael Pope" + ], + "start_year": 2015, + "end_year": null + }, + { + "_id": "06d70791-5487-4dab-8b84-a91b3376e396", + "title": "Organic dedicated analyzer", + "seasons": 3, + "episodes": 88, + "platform": "HBO", + "genres": [ + "Thriller", + "Drama" + ], + "cast": [ + "Amy Aguilar", + "James Williams", + "Kevin Kirby" + ], + "start_year": 2010, + "end_year": 2025 + } +] \ No newline at end of file diff --git a/tests/integration/tools/mongodb/mongodbHelpers.ts b/tests/integration/tools/mongodb/mongodbHelpers.ts index 935b27db..778cb430 100644 --- a/tests/integration/tools/mongodb/mongodbHelpers.ts +++ b/tests/integration/tools/mongodb/mongodbHelpers.ts @@ -2,12 +2,37 @@ import { MongoCluster } from "mongodb-runner"; import path 
from "path"; import { fileURLToPath } from "url"; import fs from "fs/promises"; -import { MongoClient, ObjectId } from "mongodb"; +import { Document, MongoClient, ObjectId } from "mongodb"; import { getResponseContent, IntegrationTest, setupIntegrationTest, defaultTestConfig } from "../../helpers.js"; import { UserConfig } from "../../../../src/config.js"; const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const testDataDumpPath = path.join(__dirname, "..", "..", "..", "accuracy", "test-data-dumps"); + +const testDataPaths = [ + { + db: "comics", + collection: "books", + path: path.join(testDataDumpPath, "comics.books.json"), + }, + { + db: "comics", + collection: "characters", + path: path.join(testDataDumpPath, "comics.characters.json"), + }, + { + db: "mflix", + collection: "movies", + path: path.join(testDataDumpPath, "mflix.movies.json"), + }, + { + db: "mflix", + collection: "shows", + path: path.join(testDataDumpPath, "mflix.shows.json"), + }, +]; + interface MongoDBIntegrationTest { mongoClient: () => MongoClient; connectionString: () => string; @@ -169,3 +194,29 @@ export function validateAutoConnectBehavior( }); }); } + +export function prepareTestData(integration: MongoDBIntegrationTest) { + const testData: { + db: string; + collection: string; + data: Document[]; + }[] = []; + + beforeAll(async () => { + for (const { db, collection, path } of testDataPaths) { + testData.push({ + db, + collection, + data: JSON.parse(await fs.readFile(path, "utf8")) as Document[], + }); + } + }); + + return async function populateTestData() { + const client = integration.mongoClient(); + for (const { db, collection, data } of testData) { + await client.db(db).dropCollection(collection); + await client.db(db).collection(collection).insertMany(data); + } + }; +} From b961916983a226de840dc3c6603e26cbc239f8a0 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Sun, 6 Jul 2025 23:49:34 +0200 Subject: [PATCH 21/47] chore: moved all existing tests to vercel mcp 
client --- src/tools/mongodb/read/find.ts | 2 +- tests/accuracy/collection-indexes.test.ts | 38 ++--- tests/accuracy/collection-schema.test.ts | 12 +- tests/accuracy/delete-many.test.ts | 38 ++--- tests/accuracy/find.test.ts | 150 +++++++----------- tests/accuracy/insert-many.test.ts | 51 +++--- tests/accuracy/list-collections.test.ts | 62 ++++---- tests/accuracy/list-databases.test.ts | 30 ++-- tests/accuracy/sdk/accuracy-testing-client.ts | 28 +++- tests/accuracy/sdk/describe-accuracy-tests.ts | 2 +- tests/accuracy/sdk/models.ts | 4 +- 11 files changed, 181 insertions(+), 236 deletions(-) diff --git a/src/tools/mongodb/read/find.ts b/src/tools/mongodb/read/find.ts index 5e3fa4f4..ac864b0a 100644 --- a/src/tools/mongodb/read/find.ts +++ b/src/tools/mongodb/read/find.ts @@ -13,7 +13,7 @@ export const FindArgs = { .describe("The query filter, matching the syntax of the query argument of db.collection.find()"), projection: z .record(z.string(), z.unknown()) - // .optional() + .optional() .describe("The projection, matching the syntax of the projection argument of db.collection.find()"), limit: z.number().optional().default(10).describe("The maximum number of documents to return"), sort: z diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts index 683f386a..e53ddb43 100644 --- a/tests/accuracy/collection-indexes.test.ts +++ b/tests/accuracy/collection-indexes.test.ts @@ -1,42 +1,30 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { collectionIndexesResponse } from "../../src/tools/mongodb/read/collectionIndexes.js"; function callsCollectionIndexes(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - 
"collection-indexes": function collectionIndexes() { - return collectionIndexesResponse({ - database: "db1", - collection: "coll1", - indexes: [ - { - name: "year", - key: JSON.stringify({ _id: 1 }), - }, - ], - }); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "collection-indexes", parameters: { - database: "db1", - collection: "coll1", + database: "mflix", + collection: "movies", }, }, ], }; } -describeAccuracyTests("collection-indexes", getAvailableModels(), [ - callsCollectionIndexes("How many indexes do I have in 'db1.coll1' namespace?"), - callsCollectionIndexes("List all the indexes in coll1 collection in db1 database"), - callsCollectionIndexes( - `Is the following query: ${JSON.stringify({ year: 1994 })} on the namespace 'db1.coll1' indexed?` - ), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'collection-indexes' tool", [ + callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"), + callsCollectionIndexes("List all the indexes in movies collection in mflix database"), + callsCollectionIndexes( + `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?` + ), + ]), +}); diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts index e72c65de..f81273ea 100644 --- a/tests/accuracy/collection-schema.test.ts +++ b/tests/accuracy/collection-schema.test.ts @@ -1,4 +1,4 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { collectionSchemaResponse } from "../../src/tools/mongodb/metadata/collectionSchema.js"; @@ -41,7 +41,9 @@ function callsCollectionSchema(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests("collection-schema", 
getAvailableModels(), [ - callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), - callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'collection-schema' tool", [ + callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), + callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), + ]), +}); diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts index ddda1d50..4d50169d 100644 --- a/tests/accuracy/delete-many.test.ts +++ b/tests/accuracy/delete-many.test.ts @@ -1,4 +1,4 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { deleteManyResponse } from "../../src/tools/mongodb/delete/deleteMany.js"; @@ -7,17 +7,13 @@ function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "delete-many": function listDatabases() { - return deleteManyResponse("coll1", 10); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "delete-many", parameters: { - database: "db1", - collection: "coll1", + database: "mflix", + collection: "movies", }, }, ], @@ -28,26 +24,26 @@ function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "delete-many": function listDatabases() { - return deleteManyResponse("coll1", 10); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "delete-many", parameters: { - database: "db1", - collection: "coll1", - filters: { provider: "BongoDB" }, + database: 
"mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, }, }, ], }; } -describeAccuracyTests("delete-many", getAvailableModels(), [ - callsDeleteManyWithEmptyFilters("Delete all the documents from 'db1.coll1' namespace"), - callsDeleteManyWithEmptyFilters("Purge the collection 'coll1' in database 'db1'"), - callsDeleteManyWithFilters("Remove all the documents from namespace 'db1.coll1' where provider is 'BongoDB'"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'delete-many' tool", [ + callsDeleteManyWithEmptyFilters("Delete all the documents from 'mflix.movies' namespace"), + callsDeleteManyWithEmptyFilters("Purge the collection 'movies' in database 'mflix'"), + callsDeleteManyWithFilters( + "Remove all the documents from namespace 'mflix.movies' where runtime is less than 100" + ), + ]), +}); diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index 0144e22b..ecfbe4f3 100644 --- a/tests/accuracy/find.test.ts +++ b/tests/accuracy/find.test.ts @@ -1,157 +1,129 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { findResponse } from "../../src/tools/mongodb/read/find.js"; -import { MockedTools } from "./sdk/test-tools.js"; -import { collectionSchemaResponse } from "../../src/tools/mongodb/metadata/collectionSchema.js"; -import { getSimplifiedSchema } from "mongodb-schema"; -const documents = [ - { - title: "book1", - author: "author1", - date_of_publish: "01.01.1990", - }, - { - title: "book2", - author: "author1", - date_of_publish: "01.01.1992", - }, - { - title: "book3", - author: "author2", - date_of_publish: "01.01.1990", - }, -]; - -function callsFindNoFilter(prompt: string): AccuracyTestConfig { +function callsFindNoFilter(prompt: string, 
database = "mflix", collection = "movies"): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async () => - collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), - find: () => findResponse("coll1", documents), - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "find", parameters: { - database: "db1", - collection: "coll1", + database, + collection, }, }, ], }; } -function callsFindWithFilter(prompt: string): AccuracyTestConfig { +function callsFindWithFilter(prompt: string, filter: Record): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async () => - collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), - find: () => - findResponse( - "coll1", - documents.filter((doc) => doc.author === "author1") - ), - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "find", parameters: { - database: "db1", - collection: "coll1", - filter: { author: "author1" }, + database: "mflix", + collection: "movies", + filter: filter, }, }, ], }; } -function callsFindWithProjection(prompt: string): AccuracyTestConfig { +function callsFindWithProjection(prompt: string, projection: Record): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async () => - collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), - find: () => findResponse("coll1", documents), - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "find", parameters: { - database: "db1", - collection: "coll1", - projection: { title: 1 }, + database: "mflix", + collection: "movies", + projection, }, }, ], }; } -function callsFindWithProjectionAndFilters(prompt: string): AccuracyTestConfig { +function callsFindWithProjectionAndFilters( + prompt: string, + filter: Record, + projection: Record +): AccuracyTestConfig { return { 
injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async () => - collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), - find: () => - findResponse( - "coll1", - documents.filter((doc) => doc.date_of_publish === "01.01.1992") - ), - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "find", parameters: { - database: "db1", - collection: "coll1", - filter: { date_of_publish: "01.01.1992" }, - projection: { title: 1 }, + database: "mflix", + collection: "movies", + filter, + projection, }, }, ], }; } -function callsFindWithSortAndLimit(prompt: string): AccuracyTestConfig { +function callsFindWithFilterSortAndLimit( + prompt: string, + filter: Record, + sort: Record, + limit: number +): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async () => - collectionSchemaResponse("db1", "coll1", await getSimplifiedSchema(documents)), - find: () => findResponse("coll1", [documents[0], documents[1]]), - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "find", parameters: { - database: "db1", - collection: "coll1", - sort: { date_of_publish: 1 }, - limit: 2, + database: "mflix", + collection: "movies", + filter, + sort, + limit, }, }, ], }; } -describeAccuracyTests("find", getAvailableModels(), [ - callsFindNoFilter("List all the documents in 'db1.coll1' namespace"), - callsFindNoFilter("Find all the documents from collection coll1 in database db1"), - callsFindWithFilter("Find all the books published by author name 'author1' in db1.coll1 namespace"), - callsFindWithFilter("Find all the documents in coll1 collection and db1 database where author is 'author1'"), - callsFindWithProjection("Give me all the title of the books available in 'db1.coll1' namespace"), - callsFindWithProjection("Give me all the title of the books published in available in 'db1.coll1' namespace"), - callsFindWithProjectionAndFilters( - "Find all the book titles 
from 'db1.coll1' namespace where date_of_publish is '01.01.1992'" - ), - callsFindWithSortAndLimit("List first two books sorted by the field date_of_publish in namespace db1.coll1"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call find tool", [ + callsFindNoFilter("List all the movies in 'mflix.movies' namespace."), + callsFindNoFilter("List all the documents in 'comics.books' namespace.", "comics", "books"), + callsFindWithFilter("Find all the movies in 'mflix.movies' namespace with runtime less than 100.", { + runtime: { $lt: 100 }, + }), + callsFindWithFilter("Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", { + director: "Christina Collins", + }), + callsFindWithProjection("Give me all the movie titles available in 'mflix.movies' namespace", { title: 1 }), + callsFindWithProjectionAndFilters( + "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", + { title: "Certain Fish" }, + { cast: 1 } + ), + callsFindWithFilterSortAndLimit( + "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime", + { genres: "Horror" }, + { runtime: 1 }, + 2 + ), + ]), +}); diff --git a/tests/accuracy/insert-many.test.ts b/tests/accuracy/insert-many.test.ts index b720ac1c..25d60017 100644 --- a/tests/accuracy/insert-many.test.ts +++ b/tests/accuracy/insert-many.test.ts @@ -1,35 +1,30 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { insertManyResponse } from "../../src/tools/mongodb/create/insertMany.js"; function callsInsertMany(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "insert-many": function listDatabases() 
{ - return insertManyResponse("coll1", 3, ["1FOO", "2BAR", "3BAZ"]); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "insert-many", parameters: { - database: "db1", - collection: "coll1", + database: "mflix", + collection: "movies", documents: [ { id: 1, - name: "name1", + title: "name1", }, { id: 2, - name: "name2", + title: "name2", }, { id: 3, - name: "name3", + title: "name3", }, ], }, @@ -42,17 +37,13 @@ function callsEmptyInsertMany(prompt: string) { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "insert-many": function listDatabases() { - return insertManyResponse("coll1", 3, ["1FOO", "2BAR", "3BAZ"]); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "insert-many", parameters: { - database: "db1", - collection: "coll1", + database: "mflix", + collection: "movies", documents: [{}, {}, {}], }, }, @@ -60,13 +51,15 @@ function callsEmptyInsertMany(prompt: string) { }; } -describeAccuracyTests("insert-many", getAvailableModels(), [ - callsInsertMany( - [ - "In my namespace 'db1.coll1', insert 3 documents each with the following fields:", - "- id: an incremental number starting from 1", - "- name: a string of format 'name'", - ].join("\n") - ), - callsEmptyInsertMany("Add three empty documents in collection 'coll1' inside database 'db1'"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'insert-many' tool", [ + callsInsertMany( + [ + "In my namespace 'mflix.movies', insert 3 documents each with the following fields:", + "- id: an incremental number starting from 1", + "- name: a string of format 'name'", + ].join("\n") + ), + callsEmptyInsertMany("Add three empty documents in collection 'movies' inside database 'mflix'"), + ]), +}); diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts index ac086859..a8455418 100644 --- a/tests/accuracy/list-collections.test.ts +++ b/tests/accuracy/list-collections.test.ts @@ -1,22 +1,16 @@ -import 
{ describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { listCollectionsResponse } from "../../src/tools/mongodb/metadata/listCollections.js"; -import { listDatabasesResponse } from "../../src/tools/mongodb/metadata/listDatabases.js"; function callsListCollections(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "list-collections": function listCollections() { - return listCollectionsResponse("db1", ["coll1", "coll2"]); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "list-collections", - parameters: { database: "db1" }, + parameters: { database: "mflix" }, }, ], }; @@ -26,23 +20,7 @@ function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfi return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "list-collections": function listCollections() { - return listCollectionsResponse("db1", ["coll1", "coll2"]); - }, - "list-databases": function listDatabases() { - return listDatabasesResponse([ - { - name: "db1", - sizeOnDisk: "1024", - }, - { - name: "db2", - sizeOnDisk: "2048", - }, - ]); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "list-databases", @@ -50,19 +28,35 @@ function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfi }, { toolName: "list-collections", - parameters: { database: "db1" }, + parameters: { database: "admin" }, + }, + { + toolName: "list-collections", + parameters: { database: "comics" }, + }, + { + toolName: "list-collections", + parameters: { database: "config" }, + }, + { + toolName: "list-collections", + parameters: { database: "local" }, }, { toolName: "list-collections", - parameters: { database: "db2" }, + parameters: { database: "mflix" }, }, ], }; } 
-describeAccuracyTests("list-collections", getAvailableModels(), [ - callsListCollections("How many collections do I have in database db1?"), - callsListCollections("List all the collections in my MongoDB database db1."), - callsListCollections("Is there a coll1 collection in my MongoDB database db1?"), - callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call list-collections tool", [ + callsListCollections("How many collections do I have in database mflix?"), + callsListCollections("List all the collections in my MongoDB database mflix."), + callsListCollections("Is there a shows collection in my MongoDB database mflix?"), + ]), + ...describeSuite("should call list-databases and list-collections tool", [ + callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), + ]), +}); diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index 0a89db1d..0ef88712 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -1,26 +1,12 @@ -import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { listDatabasesResponse } from "../../src/tools/mongodb/metadata/listDatabases.js"; function callsListDatabases(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "list-databases": function listDatabases() { - return listDatabasesResponse([ - { - name: "db1", - sizeOnDisk: "1024", - }, - { - name: "db2", - sizeOnDisk: "2048", - }, - ]); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "list-databases", @@ -30,8 +16,10 @@ 
function callsListDatabases(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests("list-databases", getAvailableModels(), [ - callsListDatabases("How many databases do I have?"), - callsListDatabases("List all the databases in my cluster."), - callsListDatabases("Is there a sample_mflix database in my cluster?"), -]); +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call list-databases tool", [ + callsListDatabases("How many databases do I have?"), + callsListDatabases("List all the databases that I have in my clusters"), + callsListDatabases("Is there a mflix database in my cluster?"), + ]), +}); diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts index de7a0671..b12017d7 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -17,14 +17,14 @@ export type MockedTools = Record; export class AccuracyTestingClient { private mockedTools: MockedTools = {}; private recordedToolCalls: ToolCall[] = []; - private constructor(private readonly client: Awaited>) {} + private constructor(private readonly vercelMCPClient: Awaited>) {} async close() { - await this.client?.close(); + await this.vercelMCPClient?.close(); } async vercelTools() { - const vercelTools = (await this.client?.tools()) ?? {}; + const vercelTools = (await this.vercelMCPClient?.tools()) ?? 
{}; const rewrappedVercelTools: typeof vercelTools = {}; for (const [toolName, tool] of Object.entries(vercelTools)) { rewrappedVercelTools[toolName] = createVercelTool({ @@ -35,12 +35,24 @@ export class AccuracyTestingClient { toolName: toolName, parameters: args, }); - const toolResultGeneratorFn = this.mockedTools[toolName]; - if (toolResultGeneratorFn) { - return await toolResultGeneratorFn(args); - } + try { + const toolResultGeneratorFn = this.mockedTools[toolName]; + if (toolResultGeneratorFn) { + return await toolResultGeneratorFn(args); + } - return tool.execute(args, options); + return await tool.execute(args, options); + } catch (error) { + // There are cases when LLM calls the tools incorrectly + // and the schema definition check fails. Normally a + // tool calling agent will handle the error case but + // because we are wrapping the tool definition ourselves + // we have to handle this ourselves as well. + return { + isError: true, + content: JSON.stringify(error), + }; + } }, }); } diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index dd224387..466a9ed7 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -69,7 +69,7 @@ export function describeAccuracyTests( toolCalls ); console.debug(testConfig.prompt); - console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); + // console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); // console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); console.debug( "Tool calling accuracy: %s, Parameter Accuracy: %s", diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index 1fe4fd58..eb7f4b91 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -67,9 +67,9 @@ export class OllamaModel implements Model { } const ALL_TESTABLE_MODELS = [ - new GeminiModel("gemini-2.0-flash"), + // new GeminiModel("gemini-2.0-flash"), // new 
OpenAIModel("gpt-4o"), - // new AzureOpenAIModel("gpt-4o"), + new AzureOpenAIModel("gpt-4o"), // new OllamaModel("qwen3:1.7b"), ]; From 5ffee02a4f3468a19c5dea456e6aa17f91136b77 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 7 Jul 2025 15:50:18 +0200 Subject: [PATCH 22/47] chore: adds tests for the rest of the tools --- .../accuracy/collection-storage-size.test.ts | 51 ++++++++++++ tests/accuracy/count.test.ts | 60 ++++++++++++++ tests/accuracy/create-collection.test.ts | 57 +++++++++++++ tests/accuracy/db-stats.test.ts | 25 ++++++ tests/accuracy/drop-collection.test.ts | 82 +++++++++++++++++++ tests/accuracy/drop-database.test.ts | 50 +++++++++++ tests/accuracy/explain.test.ts | 72 ++++++++++++++++ tests/accuracy/logs.test.ts | 31 +++++++ tests/accuracy/rename-collection.test.ts | 49 +++++++++++ tests/accuracy/sdk/agent.ts | 2 +- tests/accuracy/sdk/describe-accuracy-tests.ts | 3 +- tests/accuracy/update-many.test.ts | 60 ++++++++++++++ .../tools/mongodb/mongodbHelpers.ts | 24 ++++-- 13 files changed, 558 insertions(+), 8 deletions(-) create mode 100644 tests/accuracy/collection-storage-size.test.ts create mode 100644 tests/accuracy/count.test.ts create mode 100644 tests/accuracy/create-collection.test.ts create mode 100644 tests/accuracy/db-stats.test.ts create mode 100644 tests/accuracy/drop-collection.test.ts create mode 100644 tests/accuracy/drop-database.test.ts create mode 100644 tests/accuracy/explain.test.ts create mode 100644 tests/accuracy/logs.test.ts create mode 100644 tests/accuracy/rename-collection.test.ts create mode 100644 tests/accuracy/update-many.test.ts diff --git a/tests/accuracy/collection-storage-size.test.ts b/tests/accuracy/collection-storage-size.test.ts new file mode 100644 index 00000000..751b84d6 --- /dev/null +++ b/tests/accuracy/collection-storage-size.test.ts @@ -0,0 +1,51 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; 
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; + +function callsCollectionStorageSize(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: expectedToolCalls, + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'collection-storage-size' tool", [ + callsCollectionStorageSize("What is the size of 'mflix.movies' namespace", [ + { + toolName: "collection-storage-size", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ]), + ]), + ...describeSuite("should call 'collection-storage-size' tool after another tool/s", [ + callsCollectionStorageSize("How much size is each collection in comics database", [ + { + toolName: "list-collections", + parameters: { + database: "comics", + }, + }, + { + toolName: "collection-storage-size", + parameters: { + database: "comics", + collection: "books", + }, + }, + { + toolName: "collection-storage-size", + parameters: { + database: "comics", + collection: "characters", + }, + }, + ]), + ]), +}); diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts new file mode 100644 index 00000000..0543af76 --- /dev/null +++ b/tests/accuracy/count.test.ts @@ -0,0 +1,60 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsCountToolWithEmptyQuery(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "count", + parameters: { + database, + collection, + }, + }, + ], + }; +} + +function callsCountToolWithQuery( + prompt: string, + 
database = "mflix", + collection = "movies", + query: Record = {} +): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "count", + parameters: { + database, + collection, + query, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'count' tool", [ + callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."), + callsCountToolWithEmptyQuery( + "How many documents are there in 'characters' collection in 'comics' database?", + "comics", + "characters" + ), + callsCountToolWithQuery( + "Count all the documents in 'mflix.movies' namespace with runtime less than 100?", + "mflix", + "movies", + { runtime: { $lt: 100 } } + ), + ]), +}); diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/create-collection.test.ts new file mode 100644 index 00000000..ab468a62 --- /dev/null +++ b/tests/accuracy/create-collection.test.ts @@ -0,0 +1,57 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; + +function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "create-collection", + parameters: { + database, + collection, + }, + }, + ], + }; +} + +function callsCreateCollectionWithListCollections(prompt: string, expectedToolCalls: ExpectedToolCall[]) { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls, + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'create-collection' tool", [ + 
callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"), + callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"), + ]), + ...describeSuite("should call 'create-collection' alongside other required tools", [ + callsCreateCollectionWithListCollections( + "If and only if, the namespace 'mflix.documentaries' does not exist, then create it", + [ + { + toolName: "list-collections", + parameters: { + database: "mflix", + }, + }, + { + toolName: "create-collection", + parameters: { + database: "mflix", + collection: "documentaries", + }, + }, + ] + ), + ]), +}); diff --git a/tests/accuracy/db-stats.test.ts b/tests/accuracy/db-stats.test.ts new file mode 100644 index 00000000..b88fbb3c --- /dev/null +++ b/tests/accuracy/db-stats.test.ts @@ -0,0 +1,25 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "db-stats", + parameters: { + database, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'db-stats' tool", [ + callsListDatabases("What is the size occupied by database mflix?"), + ]), +}); diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/drop-collection.test.ts new file mode 100644 index 00000000..e51494b7 --- /dev/null +++ b/tests/accuracy/drop-collection.test.ts @@ -0,0 +1,82 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { ExpectedToolCall } from 
"./sdk/accuracy-scorers.js"; + +function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "drop-collection", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }; +} + +function callsDropCollection(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls, + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'drop-collection' tool", [ + onlyCallsDropCollection("Remove mflix.movies namespace from my cluster."), + onlyCallsDropCollection("Drop movies collection from mflix database."), + ]), + ...describeSuite("should call 'drop-collection' after calling other necessary tools", [ + callsDropCollection("Remove books collection from which ever database contains it.", [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "list-collections", + parameters: { + database: "admin", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "comics", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "config", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "local", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "mflix", + }, + }, + { + toolName: "drop-collection", + parameters: { + database: "comics", + collection: "books", + }, + }, + ]), + ]), +}); diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/drop-database.test.ts new file mode 100644 index 00000000..08ffe640 --- /dev/null +++ b/tests/accuracy/drop-database.test.ts @@ -0,0 +1,50 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from 
"./sdk/describe-accuracy-tests.js"; +import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; + +function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "drop-database", + parameters: { + database: "mflix", + }, + }, + ], + }; +} + +function callsDropDatabase(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls, + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'drop-database' tool", [ + onlyCallsDropDatabase("Remove mflix database from my cluster."), + onlyCallsDropDatabase("Drop database named mflix."), + ]), + ...describeSuite("should call 'drop-database' after calling other necessary tools", [ + callsDropDatabase("If there is a mflix database in my cluster then drop it.", [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "drop-database", + parameters: { + database: "mflix", + }, + }, + ]), + ]), +}); diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts new file mode 100644 index 00000000..6e767981 --- /dev/null +++ b/tests/accuracy/explain.test.ts @@ -0,0 +1,72 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsExplain(prompt: string, method: Record): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "explain", + parameters: { + database: "mflix", + collection: "movies", + method: [method], + }, + }, + ], + }; +} + +const callsExplainWithFind = (prompt: string) => + callsExplain(prompt, { + name: "find", + arguments: { + filter: { 
release_year: 2020 }, + }, + }); + +const callsExplainWithAggregate = (prompt: string) => + callsExplain(prompt, { + name: "aggregate", + arguments: { + pipeline: [ + { + $match: { release_year: 2020 }, + }, + ], + }, + }); + +const callsExplainWithCount = (prompt: string) => + callsExplain(prompt, { + name: "count", + arguments: { + query: { release_year: 2020 }, + }, + }); + +/** + * None of these tests score a parameter match on any of the models, likely + * because we are using Zod.union, when we probably should've used + * Zod.discriminatedUnion + */ +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'explain' tool for a find query", [ + callsExplainWithFind( + `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), + ]), + ...describeSuite("should call 'explain' tool for an aggregation", [ + callsExplainWithAggregate( + `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), + ]), + ...describeSuite("should call 'explain' tool for count", [ + callsExplainWithCount( + `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), + ]), +}); diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts new file mode 100644 index 00000000..afd2a697 --- /dev/null +++ b/tests/accuracy/logs.test.ts @@ -0,0 +1,31 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; +import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; + +function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [toolCall], + }; +} + +describeAccuracyTests(getAvailableModels(), { + 
...describeSuite("should call 'logs' tool", [ + callsLogsTool("Were there any startup warnings for my MongoDB server?", { + toolName: "mongodb-logs", + parameters: { + type: "startupWarnings", + }, + }), + callsLogsTool("Retrieve first 10 logs for my MongoDB server?", { + toolName: "mongodb-logs", + parameters: { + type: "global", + limit: 10, + }, + }), + ]), +}); diff --git a/tests/accuracy/rename-collection.test.ts b/tests/accuracy/rename-collection.test.ts new file mode 100644 index 00000000..d8d46025 --- /dev/null +++ b/tests/accuracy/rename-collection.test.ts @@ -0,0 +1,49 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsRenameCollection(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "rename-collection", + parameters: { + database: "mflix", + collection: "movies", + newName: "new_movies", + }, + }, + ], + }; +} + +function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "rename-collection", + parameters: { + database: "mflix", + collection: "movies", + newName: "new_movies", + dropTarget: true, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call 'rename-collection' tool", [ + callsRenameCollection("Rename my 'mflix.movies' namespace to 'mflix.new_movies'"), + callsRenameCollectionWithDropTarget( + "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace." 
+ ), + ]), +}); diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts index eb680358..6997ffb6 100644 --- a/tests/accuracy/sdk/agent.ts +++ b/tests/accuracy/sdk/agent.ts @@ -6,7 +6,7 @@ const systemPrompt = [ "You are an expert AI assistant with access to a set of tools for MongoDB database operations.", "You MUST use the most relevant tool to answer the user's request", "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", - "If a task requires multiple steps, you MUST call the necessary tools in sequence", + "If a task requires multiple tool calls, you MUST call all the necessary tools in sequence, following the requirements mentioned above for each tool called.", 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', ]; diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 466a9ed7..7a49b550 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -33,7 +33,7 @@ export function describeAccuracyTests( eachModel(`$modelName`, function (model) { const mdbIntegration = setupMongoDBIntegrationTest(); - const populateTestData = prepareTestData(mdbIntegration); + const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); let testMCPClient: AccuracyTestingClient; let agent: Agent; @@ -44,6 +44,7 @@ export function describeAccuracyTests( }); beforeEach(async () => { + await cleanupTestDatabases(mdbIntegration); await populateTestData(); testMCPClient.resetForTests(); }); diff --git a/tests/accuracy/update-many.test.ts b/tests/accuracy/update-many.test.ts new file mode 100644 index 00000000..4b82fbfb --- /dev/null +++ b/tests/accuracy/update-many.test.ts @@ -0,0 +1,60 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; 
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "update-many", + parameters: { + database: "mflix", + collection: "movies", + update: { + $set: { + new_field: 1, + }, + }, + }, + }, + ], + }; +} + +function callsUpdateManyWithFilters(prompt: string, filter: Record): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "update-many", + parameters: { + database: "mflix", + collection: "movies", + filter, + update: { + $set: { + new_field: 1, + }, + }, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should only call aggregate tool", [ + callsUpdateManyWithEmptyFilters( + "Update all the documents in 'mflix.movies' namespace with a new field 'new_field' set to 1" + ), + callsUpdateManyWithFilters( + "Update all the documents in 'mflix.movies' namespace, where runtime is less than 100, with a new field 'new_field' set to 1", + { runtime: { $lt: 100 } } + ), + ]), +}); diff --git a/tests/integration/tools/mongodb/mongodbHelpers.ts b/tests/integration/tools/mongodb/mongodbHelpers.ts index 778cb430..8df9b059 100644 --- a/tests/integration/tools/mongodb/mongodbHelpers.ts +++ b/tests/integration/tools/mongodb/mongodbHelpers.ts @@ -196,6 +196,7 @@ export function validateAutoConnectBehavior( } export function prepareTestData(integration: MongoDBIntegrationTest) { + const NON_TEST_DBS = ["admin", "config", "local"]; const testData: { db: string; collection: string; @@ -212,11 +213,22 @@ export function prepareTestData(integration: MongoDBIntegrationTest) { } }); - return async function populateTestData() { - const client = integration.mongoClient(); - for (const { db, collection, data } of testData) { - await 
client.db(db).dropCollection(collection); - await client.db(db).collection(collection).insertMany(data); - } + return { + async populateTestData(this: void) { + const client = integration.mongoClient(); + for (const { db, collection, data } of testData) { + await client.db(db).collection(collection).insertMany(data); + } + }, + async cleanupTestDatabases(this: void, integration: MongoDBIntegrationTest) { + const client = integration.mongoClient(); + const admin = client.db().admin(); + const databases = await admin.listDatabases(); + await Promise.all( + databases.databases + .filter(({ name }) => !NON_TEST_DBS.includes(name)) + .map(({ name }) => client.db(name).dropDatabase()) + ); + }, }; } From abec91a35f47704402d10101cab4e96ca01f1e88 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Mon, 7 Jul 2025 16:13:34 +0200 Subject: [PATCH 23/47] chore: adds missed out tests for tools --- tests/accuracy/aggregate.test.ts | 28 +++++++++++++++++++++++ tests/accuracy/create-index.test.ts | 35 +++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 tests/accuracy/aggregate.test.ts create mode 100644 tests/accuracy/create-index.test.ts diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts new file mode 100644 index 00000000..3da1ca32 --- /dev/null +++ b/tests/accuracy/aggregate.test.ts @@ -0,0 +1,28 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsAggregate(prompt: string, pipeline: Record[]): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "aggregate", + parameters: { + pipeline: pipeline, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'aggregate' tool", [ + callsAggregate( + "Group all 
the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them", + [{ $group: { _id: "$release_year", count: { $sum: 1 } } }] + ), + ]), +}); diff --git a/tests/accuracy/create-index.test.ts b/tests/accuracy/create-index.test.ts new file mode 100644 index 00000000..82e98e92 --- /dev/null +++ b/tests/accuracy/create-index.test.ts @@ -0,0 +1,35 @@ +import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsCreateIndex(prompt: string, indexKeys: Record): AccuracyTestConfig { + return { + injectConnectedAssumption: true, + prompt: prompt, + mockedTools: {}, + expectedToolCalls: [ + { + toolName: "create-index", + parameters: { + database: "mflix", + collection: "movies", + keys: indexKeys, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), { + ...describeSuite("should call 'create-index' tool", [ + callsCreateIndex( + "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }", + { + release_year: 1, + } + ), + callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", { + title: "text", + }), + ]), +}); From 047da6aa1fb92348460525b5c51d979f0f1c9faa Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 11:06:13 +0200 Subject: [PATCH 24/47] chore: MongoDB based snapshot storage for accuracy runs introduces the following necessary env variables: - MDB_ACCURACY_RUN_ID: The accuracy run id - MDB_ACCURACY_MDB_URL: The connection string to mongodb instance where the snapshots will be stored - MDB_ACCURACY_MDB_DB: The database for snapshots - MDB_ACCURACY_MDB_COLLECTION: The collection for snapshots --- package-lock.json | 33 +++++++ package.json | 2 +- .../get-snapshot-storage.ts | 19 ++++ .../mdb-snapshot-storage.ts | 86 +++++++++++++++++++ .../snapshot-storage.ts | 51 
+++++++++++ tests/accuracy/sdk/agent.ts | 29 +++++-- tests/accuracy/sdk/describe-accuracy-tests.ts | 31 ++++--- tests/accuracy/sdk/git-info.ts | 7 ++ 8 files changed, 242 insertions(+), 16 deletions(-) create mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts create mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts create mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts create mode 100644 tests/accuracy/sdk/git-info.ts diff --git a/package-lock.json b/package-lock.json index 63ac51e6..865dcb14 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3106,6 +3106,23 @@ "jsep": "^0.4.0||^1.0.0" } }, + "node_modules/@kwsites/file-exists": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@kwsites/file-exists/-/file-exists-1.1.1.tgz", + "integrity": "sha512-m9/5YGR18lIwxSFDwfE3oA7bWuq9kdau6ugN4H2rJeyhFQZcG9AgSHkQtSD15a8WvTgfz9aikZMrKPHvbpqFiw==", + "dev": true, + "license": "MIT", + "dependencies": { + "debug": "^4.1.1" + } + }, + "node_modules/@kwsites/promise-deferred": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@kwsites/promise-deferred/-/promise-deferred-1.1.1.tgz", + "integrity": "sha512-GaHYm+c0O9MjZRu0ongGBRbinu8gVAMd2UZjji6jVmqKtZluZnptXGWhz1E8j8D2HJ3f/yMxKAUC0b+57wncIw==", + "dev": true, + "license": "MIT" + }, "node_modules/@modelcontextprotocol/inspector": { "version": "0.16.0", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/inspector/-/inspector-0.16.0.tgz", @@ -15146,6 +15163,22 @@ "simple-concat": "^1.0.0" } }, + "node_modules/simple-git": { + "version": "3.28.0", + "resolved": "https://registry.npmjs.org/simple-git/-/simple-git-3.28.0.tgz", + "integrity": "sha512-Rs/vQRwsn1ILH1oBUy8NucJlXmnnLeLCfcvbSehkPzbv3wwoFWIdtfd6Ndo6ZPhlPsCZ60CPI4rxurnwAa+a2w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@kwsites/file-exists": "^1.1.1", + "@kwsites/promise-deferred": "^1.1.1", + "debug": "^4.4.0" + }, + 
"funding": { + "type": "github", + "url": "https://github.com/steveukx/git-js?sponsor=1" + } + }, "node_modules/simple-oauth2": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/simple-oauth2/-/simple-oauth2-5.1.0.tgz", diff --git a/package.json b/package.json index 686f3516..6f92e3c5 100644 --- a/package.json +++ b/package.json @@ -31,7 +31,7 @@ "generate": "./scripts/generate.sh", "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathIgnorePatterns=/tests/accuracy/", "test:accuracy": "node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern tests/accuracy", - "test:accuracy-file": "node --experimental-vm-modules node_modules/jest/bin/jest.js" + "test:accuracy-file": "MDB_ACCURACY_RUN_ID=$(npx uuid v4) node --experimental-vm-modules node_modules/jest/bin/jest.js" }, "license": "Apache-2.0", "devDependencies": { diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts new file mode 100644 index 00000000..44c8ae3d --- /dev/null +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts @@ -0,0 +1,19 @@ +import { getCommitSHA } from "../git-info.js"; +import { MongoDBSnapshotStorage } from "./mdb-snapshot-storage.js"; +import { AccuracySnapshotStorage } from "./snapshot-storage.js"; + +export async function getAccuracySnapshotStorage(): Promise { + const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID; + if (!accuracyRunId) { + throw new Error( + "Cannot create AccuracySnapshotStorage without an accuracyRunId - ensure that the relevant env variable is present." 
+ ); + } + + const commitSHA = await getCommitSHA(); + if (!commitSHA) { + throw new Error("Cannot create AccuracySnapshotStorage without a commitSHA."); + } + + return MongoDBSnapshotStorage.getStorage(commitSHA, accuracyRunId); +} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts new file mode 100644 index 00000000..f8296a8a --- /dev/null +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -0,0 +1,86 @@ +import { Collection, MongoClient } from "mongodb"; +import { AccuracySnapshotEntry, AccuracySnapshotEntrySchema, AccuracySnapshotStorage } from "./snapshot-storage.js"; + +export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { + private readonly client: MongoClient; + private readonly snapshotCollection: Collection; + private readonly accuracyRunId: string; + private readonly commitSHA: string; + private constructor({ + mongodbUrl, + database, + collection, + accuracyRunId, + commitSHA, + }: { + mongodbUrl: string; + database: string; + collection: string; + accuracyRunId: string; + commitSHA: string; + }) { + this.client = new MongoClient(mongodbUrl); + this.snapshotCollection = this.client.db(database).collection(collection); + this.accuracyRunId = accuracyRunId; + this.commitSHA = commitSHA; + } + + async createSnapshotEntry( + snapshotEntry: Pick< + AccuracySnapshotEntry, + | "requestedModel" + | "test" + | "prompt" + | "toolCallingAccuracy" + | "parameterAccuracy" + | "llmResponseTime" + | "tokensUsage" + | "respondingModel" + | "text" + | "messages" + > + ): Promise { + const snapshotWithMeta: AccuracySnapshotEntry = { + ...snapshotEntry, + commitSHA: this.commitSHA, + accuracyRunId: this.accuracyRunId, + createdOn: Date.now(), + }; + await this.snapshotCollection.insertOne(snapshotWithMeta); + } + + async getLastRunIdForCommit(commit: string): Promise { + const document = await this.snapshotCollection.findOne( + 
{ commit: commit }, + { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } } + ); + + return document?.accuracyRunId ? `${document?.accuracyRunId}` : undefined; + } + + async getSnapshotEntriesForRunId(accuracyRunId: string): Promise { + const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray(); + return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); + } + + static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage { + const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL; + const database = process.env.MDB_ACCURACY_MDB_DB; + const collection = process.env.MDB_ACCURACY_MDB_COLLECTION; + if (!mongodbUrl || !database || !collection) { + throw new Error("Cannot create MongoDBAccuracySnapshot storage without relevant configuration provided"); + } + + return new MongoDBSnapshotStorage({ + mongodbUrl, + database, + collection, + commitSHA, + accuracyRunId, + }); + } + + async close(): Promise { + await this.client.close(); + } +} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts new file mode 100644 index 00000000..a6f92807 --- /dev/null +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -0,0 +1,51 @@ +import z from "zod"; + +export const AccuracySnapshotEntrySchema = z.object({ + // Git and meta information for snapshot entries + accuracyRunId: z.string(), + createdOn: z.number(), + commitSHA: z.string(), + // Accuracy info + requestedModel: z.string(), + test: z.string(), + prompt: z.string(), + toolCallingAccuracy: z.number(), + parameterAccuracy: z.number(), + llmResponseTime: z.number(), + tokensUsage: z + .object({ + promptTokens: z.number().optional(), + completionTokens: z.number().optional(), + totalTokens: z.number().optional(), + }) + .optional(), + respondingModel: z.string(), + text: z.string(), + messages: z.array(z.record(z.string(), z.unknown())), +}); + +export type 
AccuracySnapshotEntry = z.infer; + +export interface AccuracySnapshotStorage { + createSnapshotEntry( + snapshotEntry: Pick< + AccuracySnapshotEntry, + | "requestedModel" + | "test" + | "prompt" + | "toolCallingAccuracy" + | "parameterAccuracy" + | "llmResponseTime" + | "tokensUsage" + | "respondingModel" + | "text" + | "messages" + > + ): Promise; + + getLastRunIdForCommit(commit: string): Promise; + + getSnapshotEntriesForRunId(accuracyRunId: string): Promise; + + close(): Promise; +} diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts index 6997ffb6..4b5d2621 100644 --- a/tests/accuracy/sdk/agent.ts +++ b/tests/accuracy/sdk/agent.ts @@ -1,4 +1,4 @@ -import { generateText, Tool, Schema, LanguageModelV1 } from "ai"; +import { generateText, LanguageModelV1, experimental_createMCPClient } from "ai"; import { Model } from "./models.js"; const systemPrompt = [ @@ -10,15 +10,32 @@ const systemPrompt = [ 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', ]; -export interface Agent { - prompt(prompt: string, model: M, tools: T): Promise; +// Some necessary types from Vercel SDK +export type VercelMCPClient = Awaited>; +export type VercelMCPClientTools = Awaited>; +export type VercelAgent = ReturnType; + +// Generic interface for Agent, in case we need to switch to some other agent +// development SDK +export interface AgentPromptResult { + respondingModel: string; + tokensUsage?: { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; + }; + text: string; + messages: Record[]; +} +export interface Agent { + prompt(prompt: string, model: Model, tools: Tools): Promise; } export function getVercelToolCallingAgent( requestedSystemPrompt?: string -): Agent, Record>>, { text: string; messages: unknown[] }> { +): Agent, VercelMCPClientTools, AgentPromptResult> { return { - async prompt(prompt: string, model: Model, tools: Record>>) { + async prompt(prompt: string, model: Model, 
tools: VercelMCPClientTools) { const result = await generateText({ model: model.getModel(), system: [...systemPrompt, requestedSystemPrompt].join("\n"), @@ -29,6 +46,8 @@ export function getVercelToolCallingAgent( return { text: result.text, messages: result.response.messages, + respondingModel: result.response.modelId, + tokensUsage: result.usage, }; }, }; diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 7a49b550..5670207a 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,8 +1,10 @@ import { TestableModels } from "./models.js"; import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from "./accuracy-scorers.js"; -import { Agent, getVercelToolCallingAgent } from "./agent.js"; +import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; +import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js"; +import { AccuracySnapshotStorage } from "./accuracy-snapshot-storage/snapshot-storage.js"; export interface AccuracyTestConfig { systemPrompt?: string; @@ -35,10 +37,12 @@ export function describeAccuracyTests( const mdbIntegration = setupMongoDBIntegrationTest(); const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); + let accuracySnapshotStorage: AccuracySnapshotStorage; let testMCPClient: AccuracyTestingClient; - let agent: Agent; + let agent: VercelAgent; beforeAll(async () => { + accuracySnapshotStorage = await getAccuracySnapshotStorage(); testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); agent = getVercelToolCallingAgent(); }); @@ -50,6 +54,7 @@ export function describeAccuracyTests( }); 
afterAll(async () => { + await accuracySnapshotStorage.close(); await testMCPClient.close(); }); @@ -62,21 +67,27 @@ export function describeAccuracyTests( const promptForModel = testConfig.injectConnectedAssumption ? [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") : testConfig.prompt; - const conversation = await agent.prompt(promptForModel, model, toolsForModel); + + const timeBeforePrompt = Date.now(); + const result = await agent.prompt(promptForModel, model, toolsForModel); + const timeAfterPrompt = Date.now(); const toolCalls = testMCPClient.getToolCalls(); const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); const parameterMatchingAccuracy = parameterMatchingAccuracyScorer( testConfig.expectedToolCalls, toolCalls ); - console.debug(testConfig.prompt); - // console.debug(`Conversation`, JSON.stringify(conversation, null, 2)); - // console.debug(`Tool calls`, JSON.stringify(toolCalls, null, 2)); - console.debug( - "Tool calling accuracy: %s, Parameter Accuracy: %s", + + const responseTime = timeAfterPrompt - timeBeforePrompt; + await accuracySnapshotStorage.createSnapshotEntry({ + requestedModel: model.modelName, + test: suiteName, + prompt: testConfig.prompt, + llmResponseTime: responseTime, toolCallingAccuracy, - parameterMatchingAccuracy - ); + parameterAccuracy: parameterMatchingAccuracy, + ...result, + }); }); }); }); diff --git a/tests/accuracy/sdk/git-info.ts b/tests/accuracy/sdk/git-info.ts new file mode 100644 index 00000000..03e34a7d --- /dev/null +++ b/tests/accuracy/sdk/git-info.ts @@ -0,0 +1,7 @@ +import { simpleGit } from "simple-git"; + +export async function getCommitSHA(): Promise { + const commitLogs = await simpleGit().log(); + const lastCommit = commitLogs.latest; + return lastCommit?.hash; +} From 94a0fe3a989ff930b812fd8d709a8c69847838ff Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 11:06:49 +0200 Subject: [PATCH 25/47] chore: 
remove file based snapshot --- tests/accuracy/sdk/accuracy-snapshot.ts | 54 ------------------------- 1 file changed, 54 deletions(-) delete mode 100644 tests/accuracy/sdk/accuracy-snapshot.ts diff --git a/tests/accuracy/sdk/accuracy-snapshot.ts b/tests/accuracy/sdk/accuracy-snapshot.ts deleted file mode 100644 index 1f7867a9..00000000 --- a/tests/accuracy/sdk/accuracy-snapshot.ts +++ /dev/null @@ -1,54 +0,0 @@ -import fs from "fs/promises"; -import path from "path"; -import { z } from "zod"; - -export const SNAPSHOT_FILE_PATH = path.resolve(process.cwd(), "accuracy-snapshot.json"); - -export const AccuracySnapshotEntrySchema = z.object({ - datetime: z.string(), - commit: z.string(), - model: z.string(), - suite: z.string(), - test: z.string(), - toolCallingAccuracy: z.number(), - parameterAccuracy: z.number(), -}); - -export type AccuracySnapshotEntry = z.infer; - -export async function readSnapshot(): Promise { - try { - const raw = await fs.readFile(SNAPSHOT_FILE_PATH, "utf8"); - return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw)); - } catch (e: unknown) { - if ((e as { code: string }).code === "ENOENT") { - return []; - } - throw e; - } -} - -function waitFor(ms: number) { - return new Promise((resolve) => setTimeout(resolve, ms)); -} - -export async function appendAccuracySnapshot(entry: AccuracySnapshotEntry): Promise { - AccuracySnapshotEntrySchema.parse(entry); - - for (let attempt = 0; attempt < 5; attempt++) { - try { - const snapshot = await readSnapshot(); - snapshot.unshift(entry); - const tmp = `${SNAPSHOT_FILE_PATH}~${Date.now()}`; - await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2)); - await fs.rename(tmp, SNAPSHOT_FILE_PATH); - return; - } catch (e) { - if (attempt < 4) { - await waitFor(100 + Math.random() * 200); - } else { - throw e; - } - } - } -} From 5bc21aafd2605fbac49fd2df12ace7072dab7752 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 11:42:34 +0200 Subject: [PATCH 26/47] wip: snapshot summary 
generator --- .../accuracy-snapshot-storage/mdb-snapshot-storage.ts | 9 +++++++-- .../sdk/accuracy-snapshot-storage/snapshot-storage.ts | 4 +--- tests/accuracy/sdk/git-info.ts | 5 +++++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index f8296a8a..c93abe12 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -49,7 +49,12 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { await this.snapshotCollection.insertOne(snapshotWithMeta); } - async getLastRunIdForCommit(commit: string): Promise { + async getLatestSnapshotsForCommit(commit: string): Promise { + const latestRunId = await this.getLastRunIdForCommit(commit); + return latestRunId ? this.getSnapshotEntriesForRunId(latestRunId) : []; + } + + private async getLastRunIdForCommit(commit: string): Promise { const document = await this.snapshotCollection.findOne( { commit: commit }, { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } } @@ -58,7 +63,7 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { return document?.accuracyRunId ? 
`${document?.accuracyRunId}` : undefined; } - async getSnapshotEntriesForRunId(accuracyRunId: string): Promise { + private async getSnapshotEntriesForRunId(accuracyRunId: string): Promise { const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray(); return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index a6f92807..eb0e453f 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -43,9 +43,7 @@ export interface AccuracySnapshotStorage { > ): Promise; - getLastRunIdForCommit(commit: string): Promise; - - getSnapshotEntriesForRunId(accuracyRunId: string): Promise; + getLatestSnapshotsForCommit(commit: string): Promise; close(): Promise; } diff --git a/tests/accuracy/sdk/git-info.ts b/tests/accuracy/sdk/git-info.ts index 03e34a7d..a0918a6f 100644 --- a/tests/accuracy/sdk/git-info.ts +++ b/tests/accuracy/sdk/git-info.ts @@ -5,3 +5,8 @@ export async function getCommitSHA(): Promise { const lastCommit = commitLogs.latest; return lastCommit?.hash; } + +export async function getMergeBase(targetBranch: string, workBranchOrCommit: string): Promise { + const result = await simpleGit().raw(["merge-base", targetBranch, workBranchOrCommit]); + return result.trim(); +} From 6abc3243028b05f4353d2775079cdd649463e5b3 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 13:02:17 +0200 Subject: [PATCH 27/47] chore: single entry point for running accuracy tests with different config --- package.json | 3 +-- scripts/run-accuracy-tests.sh | 7 +++++++ 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 scripts/run-accuracy-tests.sh diff --git a/package.json b/package.json index 6f92e3c5..e978f7bf 100644 --- a/package.json +++ b/package.json @@ -30,8 +30,7 @@ "reformat": 
"prettier --write .", "generate": "./scripts/generate.sh", "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathIgnorePatterns=/tests/accuracy/", - "test:accuracy": "node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern tests/accuracy", - "test:accuracy-file": "MDB_ACCURACY_RUN_ID=$(npx uuid v4) node --experimental-vm-modules node_modules/jest/bin/jest.js" + "test:accuracy": "sh ./scripts/run-accuracy-tests.sh" }, "license": "Apache-2.0", "devDependencies": { diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh new file mode 100644 index 00000000..979f49e1 --- /dev/null +++ b/scripts/run-accuracy-tests.sh @@ -0,0 +1,7 @@ +#!/bin/sh +# Variables necessary for the accuracy test runs +export MDB_ACCURACY_RUN_ID=$(npx uuid v4) + +TEST_PATH_PATTERN="${1:-tests/accuracy}" +shift || true +node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern "$TEST_PATH_PATTERN" "$@" \ No newline at end of file From c9c3b3686461eba652979e7143b5c72eac559437 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 13:02:45 +0200 Subject: [PATCH 28/47] chore: reformat --- .../test-data-dumps/comics.books.json | 331 ++++-------------- .../test-data-dumps/comics.characters.json | 298 ++++------------ .../test-data-dumps/mflix.movies.json | 319 ++++------------- .../accuracy/test-data-dumps/mflix.shows.json | 296 ++++------------ 4 files changed, 255 insertions(+), 989 deletions(-) diff --git a/tests/accuracy/test-data-dumps/comics.books.json b/tests/accuracy/test-data-dumps/comics.books.json index 3bcb9ecc..f605f031 100644 --- a/tests/accuracy/test-data-dumps/comics.books.json +++ b/tests/accuracy/test-data-dumps/comics.books.json @@ -5,12 +5,8 @@ "publisher": "Dark Horse Comics", "release_date": "2007-03-02T00:00:00", "issues": 118, - "main_characters": [ - "Stephen Shaw" - ], - "genre": [ - "Sci-Fi" - ] + "main_characters": ["Stephen Shaw"], + "genre": ["Sci-Fi"] }, { 
"_id": "b2e993fb-2688-4ab0-9512-f8ada5faa948", @@ -18,13 +14,8 @@ "publisher": "Image Comics", "release_date": "1998-12-07T00:00:00", "issues": 137, - "main_characters": [ - "Margaret Hogan" - ], - "genre": [ - "Adventure", - "Horror" - ] + "main_characters": ["Margaret Hogan"], + "genre": ["Adventure", "Horror"] }, { "_id": "f674a05a-12c8-4344-875c-6cd1fcba8f9d", @@ -32,13 +23,8 @@ "publisher": "DC Comics", "release_date": "2012-12-01T00:00:00", "issues": 227, - "main_characters": [ - "Joseph Cook", - "Tammy Bishop" - ], - "genre": [ - "Superhero" - ] + "main_characters": ["Joseph Cook", "Tammy Bishop"], + "genre": ["Superhero"] }, { "_id": "bb72b493-2a61-41d7-9406-dfaf6e51a425", @@ -46,12 +32,8 @@ "publisher": "DC Comics", "release_date": "2011-02-24T00:00:00", "issues": 270, - "main_characters": [ - "Sandra Moss" - ], - "genre": [ - "Fantasy" - ] + "main_characters": ["Sandra Moss"], + "genre": ["Fantasy"] }, { "_id": "ea85131f-dfc8-4997-b3b0-996138185d73", @@ -65,10 +47,7 @@ "Tammy Murphy", "Larry Hensley" ], - "genre": [ - "Adventure", - "Horror" - ] + "genre": ["Adventure", "Horror"] }, { "_id": "fdd56270-eb31-4456-8bf4-df81371eb290", @@ -82,10 +61,7 @@ "Micheal Brown", "Jeremy Rice" ], - "genre": [ - "Fantasy", - "Action" - ] + "genre": ["Fantasy", "Action"] }, { "_id": "6de66ba4-3975-4055-824c-cda5caf517d2", @@ -93,15 +69,8 @@ "publisher": "Marvel Comics", "release_date": "2007-11-19T00:00:00", "issues": 55, - "main_characters": [ - "Joseph Bowman", - "Robert Logan", - "Ashley Watkins" - ], - "genre": [ - "Sci-Fi", - "Horror" - ] + "main_characters": ["Joseph Bowman", "Robert Logan", "Ashley Watkins"], + "genre": ["Sci-Fi", "Horror"] }, { "_id": "e3cafdbf-e97a-47c9-a848-bdd82e12f8f7", @@ -115,10 +84,7 @@ "Lindsay Anderson", "Scott Garcia" ], - "genre": [ - "Action", - "Horror" - ] + "genre": ["Action", "Horror"] }, { "_id": "547190cd-5c9e-44c5-b8f9-afeefd039001", @@ -126,12 +92,8 @@ "publisher": "Marvel Comics", "release_date": "1987-04-16T00:00:00", 
"issues": 235, - "main_characters": [ - "Julie Goodwin" - ], - "genre": [ - "Sci-Fi" - ] + "main_characters": ["Julie Goodwin"], + "genre": ["Sci-Fi"] }, { "_id": "ba3d82f7-8edc-408c-8212-c0d6634624ee", @@ -139,14 +101,8 @@ "publisher": "Dark Horse Comics", "release_date": "1979-09-13T00:00:00", "issues": 239, - "main_characters": [ - "Chad Pham", - "Lindsay Anderson", - "Carlos Burton" - ], - "genre": [ - "Adventure" - ] + "main_characters": ["Chad Pham", "Lindsay Anderson", "Carlos Burton"], + "genre": ["Adventure"] }, { "_id": "a6bc8677-22ab-415a-bfe2-731a9f887cb9", @@ -154,15 +110,8 @@ "publisher": "Marvel Comics", "release_date": "2023-10-01T00:00:00", "issues": 163, - "main_characters": [ - "Kevin Humphrey", - "Maria Wright", - "Virginia Watts" - ], - "genre": [ - "Fantasy", - "Action" - ] + "main_characters": ["Kevin Humphrey", "Maria Wright", "Virginia Watts"], + "genre": ["Fantasy", "Action"] }, { "_id": "fb986790-df22-4db4-8168-c76e9e9471f8", @@ -170,13 +119,8 @@ "publisher": "IDW Publishing", "release_date": "2016-09-28T00:00:00", "issues": 14, - "main_characters": [ - "Brian Vincent" - ], - "genre": [ - "Sci-Fi", - "Fantasy" - ] + "main_characters": ["Brian Vincent"], + "genre": ["Sci-Fi", "Fantasy"] }, { "_id": "700aa115-dc5a-4be6-b275-bfb943c95ee0", @@ -184,12 +128,8 @@ "publisher": "Image Comics", "release_date": "1970-04-16T00:00:00", "issues": 5, - "main_characters": [ - "Joseph Cook" - ], - "genre": [ - "Fantasy" - ] + "main_characters": ["Joseph Cook"], + "genre": ["Fantasy"] }, { "_id": "7959187e-9693-43a1-ae2d-c168431fceb2", @@ -197,15 +137,8 @@ "publisher": "IDW Publishing", "release_date": "2019-02-15T00:00:00", "issues": 121, - "main_characters": [ - "Angelica Stein", - "Benjamin Morris", - "Jeremy Rice" - ], - "genre": [ - "Fantasy", - "Action" - ] + "main_characters": ["Angelica Stein", "Benjamin Morris", "Jeremy Rice"], + "genre": ["Fantasy", "Action"] }, { "_id": "d6018445-5149-42e7-9d87-eb1b181ce20c", @@ -219,9 +152,7 @@ "Carlos 
Burton", "Micheal Brown" ], - "genre": [ - "Adventure" - ] + "genre": ["Adventure"] }, { "_id": "055507ff-7a48-4df8-9ba9-7b6c10e11836", @@ -235,10 +166,7 @@ "Holly Green", "James Sanchez" ], - "genre": [ - "Sci-Fi", - "Fantasy" - ] + "genre": ["Sci-Fi", "Fantasy"] }, { "_id": "1add2da3-68e6-48a3-9703-b593c9e0bf2e", @@ -246,14 +174,8 @@ "publisher": "DC Comics", "release_date": "2001-03-01T00:00:00", "issues": 176, - "main_characters": [ - "Justin Martinez", - "Tammy Murphy" - ], - "genre": [ - "Action", - "Fantasy" - ] + "main_characters": ["Justin Martinez", "Tammy Murphy"], + "genre": ["Action", "Fantasy"] }, { "_id": "c0fe2869-eb7d-4f09-a773-028387a54969", @@ -261,14 +183,8 @@ "publisher": "DC Comics", "release_date": "1976-09-05T00:00:00", "issues": 68, - "main_characters": [ - "Christopher Elliott", - "Maria Wright" - ], - "genre": [ - "Superhero", - "Adventure" - ] + "main_characters": ["Christopher Elliott", "Maria Wright"], + "genre": ["Superhero", "Adventure"] }, { "_id": "c2fafbf6-5f71-4f31-9775-803e8c77e467", @@ -282,9 +198,7 @@ "Robert Logan", "Margaret Hogan" ], - "genre": [ - "Adventure" - ] + "genre": ["Adventure"] }, { "_id": "f72be3a7-d4be-40a1-ad66-370b44759047", @@ -292,15 +206,8 @@ "publisher": "Marvel Comics", "release_date": "1976-09-18T00:00:00", "issues": 275, - "main_characters": [ - "Sandra Moss", - "Charles Blair", - "Justin Martinez" - ], - "genre": [ - "Fantasy", - "Action" - ] + "main_characters": ["Sandra Moss", "Charles Blair", "Justin Martinez"], + "genre": ["Fantasy", "Action"] }, { "_id": "da5be16e-13e8-42d5-8954-bd89919395af", @@ -314,10 +221,7 @@ "Cristian Oneal", "Michelle Valdez" ], - "genre": [ - "Horror", - "Fantasy" - ] + "genre": ["Horror", "Fantasy"] }, { "_id": "92afc1e6-f703-4aa7-9866-3b62f2784fec", @@ -325,15 +229,8 @@ "publisher": "Image Comics", "release_date": "2008-07-21T00:00:00", "issues": 109, - "main_characters": [ - "Holly Green", - "Diana Mata", - "Julie Goodwin" - ], - "genre": [ - "Horror", - "Sci-Fi" - ] + 
"main_characters": ["Holly Green", "Diana Mata", "Julie Goodwin"], + "genre": ["Horror", "Sci-Fi"] }, { "_id": "fec61fdd-bddb-431a-b14a-d81601a47cf8", @@ -341,13 +238,8 @@ "publisher": "DC Comics", "release_date": "2012-04-27T00:00:00", "issues": 297, - "main_characters": [ - "Joshua Hicks" - ], - "genre": [ - "Action", - "Horror" - ] + "main_characters": ["Joshua Hicks"], + "genre": ["Action", "Horror"] }, { "_id": "9d37d0d7-1adc-4f54-8790-30f13472520c", @@ -355,15 +247,8 @@ "publisher": "Image Comics", "release_date": "1996-02-20T00:00:00", "issues": 295, - "main_characters": [ - "Margaret Hogan", - "Christopher Elliott", - "Joseph Cook" - ], - "genre": [ - "Fantasy", - "Adventure" - ] + "main_characters": ["Margaret Hogan", "Christopher Elliott", "Joseph Cook"], + "genre": ["Fantasy", "Adventure"] }, { "_id": "338a83ad-06fc-42e1-a605-60a192ce5643", @@ -377,9 +262,7 @@ "Julie Goodwin", "Charles Blair" ], - "genre": [ - "Action" - ] + "genre": ["Action"] }, { "_id": "5b07c17b-4df9-4b72-9c3e-b51d93def1fb", @@ -387,13 +270,8 @@ "publisher": "IDW Publishing", "release_date": "2024-06-19T00:00:00", "issues": 259, - "main_characters": [ - "Debbie Green" - ], - "genre": [ - "Sci-Fi", - "Superhero" - ] + "main_characters": ["Debbie Green"], + "genre": ["Sci-Fi", "Superhero"] }, { "_id": "625b11a5-bb45-4837-9cd6-50bfe2e3396c", @@ -407,9 +285,7 @@ "Brian Vincent", "Sandra Moss" ], - "genre": [ - "Adventure" - ] + "genre": ["Adventure"] }, { "_id": "71b845f3-4416-430a-81eb-8c208f824365", @@ -423,10 +299,7 @@ "Holly Green", "Joseph Bowman" ], - "genre": [ - "Superhero", - "Fantasy" - ] + "genre": ["Superhero", "Fantasy"] }, { "_id": "14dbf3a6-d258-4c96-8883-336b60bc2112", @@ -434,13 +307,8 @@ "publisher": "DC Comics", "release_date": "1969-11-30T00:00:00", "issues": 104, - "main_characters": [ - "Micheal Brown" - ], - "genre": [ - "Horror", - "Superhero" - ] + "main_characters": ["Micheal Brown"], + "genre": ["Horror", "Superhero"] }, { "_id": 
"091e16d8-d50c-4e7d-9b3a-545cf2596738", @@ -448,12 +316,8 @@ "publisher": "Image Comics", "release_date": "1990-01-24T00:00:00", "issues": 74, - "main_characters": [ - "Robert Logan" - ], - "genre": [ - "Sci-Fi" - ] + "main_characters": ["Robert Logan"], + "genre": ["Sci-Fi"] }, { "_id": "c47ec96a-4d6e-43ea-9bb5-00e4c8058b53", @@ -461,15 +325,8 @@ "publisher": "DC Comics", "release_date": "1971-04-21T00:00:00", "issues": 135, - "main_characters": [ - "Jeremy Rice", - "Elizabeth Robinson", - "James Sanchez" - ], - "genre": [ - "Action", - "Sci-Fi" - ] + "main_characters": ["Jeremy Rice", "Elizabeth Robinson", "James Sanchez"], + "genre": ["Action", "Sci-Fi"] }, { "_id": "d446a8ca-5d01-4be9-a061-027ef1f7bfc6", @@ -477,15 +334,8 @@ "publisher": "Dark Horse Comics", "release_date": "1984-06-24T00:00:00", "issues": 111, - "main_characters": [ - "Joshua Hicks", - "Jeremy Rice", - "Micheal Brown" - ], - "genre": [ - "Fantasy", - "Superhero" - ] + "main_characters": ["Joshua Hicks", "Jeremy Rice", "Micheal Brown"], + "genre": ["Fantasy", "Superhero"] }, { "_id": "09c734ff-2bf0-4cb6-bd42-4232209c00c9", @@ -493,14 +343,8 @@ "publisher": "DC Comics", "release_date": "2013-05-22T00:00:00", "issues": 13, - "main_characters": [ - "Luis Callahan", - "Tammy Bishop", - "Cynthia Brown" - ], - "genre": [ - "Action" - ] + "main_characters": ["Luis Callahan", "Tammy Bishop", "Cynthia Brown"], + "genre": ["Action"] }, { "_id": "691034fa-ad52-413e-96a2-a9a319fffe7b", @@ -508,12 +352,8 @@ "publisher": "DC Comics", "release_date": "2021-12-03T00:00:00", "issues": 129, - "main_characters": [ - "Margaret Hogan" - ], - "genre": [ - "Action" - ] + "main_characters": ["Margaret Hogan"], + "genre": ["Action"] }, { "_id": "07942b5a-f7c4-4fc1-bdeb-7eb46b0d57f8", @@ -521,13 +361,8 @@ "publisher": "Dark Horse Comics", "release_date": "2001-08-02T00:00:00", "issues": 38, - "main_characters": [ - "James Sanchez", - "Larry Hensley" - ], - "genre": [ - "Superhero" - ] + "main_characters": ["James 
Sanchez", "Larry Hensley"], + "genre": ["Superhero"] }, { "_id": "05d637ed-3942-4276-a885-7b3363dd48e2", @@ -535,13 +370,8 @@ "publisher": "Image Comics", "release_date": "2005-03-30T00:00:00", "issues": 150, - "main_characters": [ - "Carlos Burton" - ], - "genre": [ - "Superhero", - "Fantasy" - ] + "main_characters": ["Carlos Burton"], + "genre": ["Superhero", "Fantasy"] }, { "_id": "88904f06-50a6-44f1-bccc-f379a9788611", @@ -549,13 +379,8 @@ "publisher": "Image Comics", "release_date": "2021-06-27T00:00:00", "issues": 262, - "main_characters": [ - "Luis Callahan" - ], - "genre": [ - "Sci-Fi", - "Superhero" - ] + "main_characters": ["Luis Callahan"], + "genre": ["Sci-Fi", "Superhero"] }, { "_id": "fc961fd6-2ec6-43e5-beae-7f58a6c25d9c", @@ -563,14 +388,8 @@ "publisher": "IDW Publishing", "release_date": "1969-06-03T00:00:00", "issues": 264, - "main_characters": [ - "Scott Garcia", - "Joseph Bowman" - ], - "genre": [ - "Fantasy", - "Superhero" - ] + "main_characters": ["Scott Garcia", "Joseph Bowman"], + "genre": ["Fantasy", "Superhero"] }, { "_id": "481a3ea6-9629-4fe6-8a5a-eba846f0e62c", @@ -584,10 +403,7 @@ "Benjamin Morris", "Virginia Watts" ], - "genre": [ - "Adventure", - "Action" - ] + "genre": ["Adventure", "Action"] }, { "_id": "6bab6bcd-2f6b-4dfb-a030-d63b32fc6250", @@ -595,14 +411,7 @@ "publisher": "IDW Publishing", "release_date": "2007-12-27T00:00:00", "issues": 117, - "main_characters": [ - "Debbie Green", - "Christopher Elliott", - "Joshua Hicks" - ], - "genre": [ - "Sci-Fi", - "Action" - ] + "main_characters": ["Debbie Green", "Christopher Elliott", "Joshua Hicks"], + "genre": ["Sci-Fi", "Action"] } -] \ No newline at end of file +] diff --git a/tests/accuracy/test-data-dumps/comics.characters.json b/tests/accuracy/test-data-dumps/comics.characters.json index 944c33d5..4a255f48 100644 --- a/tests/accuracy/test-data-dumps/comics.characters.json +++ b/tests/accuracy/test-data-dumps/comics.characters.json @@ -3,16 +3,9 @@ "_id": 
"d7047787-abea-40fa-b78e-939925fd3589", "name": "Elizabeth Robinson", "alias": "ashley62", - "powers": [ - "Shapeshifting", - "Telepathy", - "Flight" - ], + "powers": ["Shapeshifting", "Telepathy", "Flight"], "first_appearance": "1961-06-23T00:00:00", - "affiliations": [ - "Fantastic Four", - "X-Men" - ], + "affiliations": ["Fantastic Four", "X-Men"], "origin": "Earth", "is_villain": false }, @@ -20,14 +13,9 @@ "_id": "06ac8173-51a6-404c-8f9a-628de889b1de", "name": "Joshua Wang", "alias": "paulasmith", - "powers": [ - "Telekinesis" - ], + "powers": ["Telekinesis"], "first_appearance": "1987-04-16T00:00:00", - "affiliations": [ - "Fantastic Four", - "Justice League" - ], + "affiliations": ["Fantastic Four", "Justice League"], "origin": "Earth", "is_villain": true }, @@ -35,10 +23,7 @@ "_id": "252c203a-0271-4ee7-a3d9-34c9f922b959", "name": "Stephen Shaw", "alias": "adamskenneth", - "powers": [ - "Super Speed", - "Flight" - ], + "powers": ["Super Speed", "Flight"], "first_appearance": "2004-07-26T00:00:00", "affiliations": [], "origin": "Atlantis", @@ -48,14 +33,9 @@ "_id": "bf5b7d04-fe71-4969-84a3-0eb9ed5d2197", "name": "Joseph Bowman", "alias": "amysalazar", - "powers": [ - "Time Manipulation" - ], + "powers": ["Time Manipulation"], "first_appearance": "1961-07-03T00:00:00", - "affiliations": [ - "Teen Titans", - "Avengers" - ], + "affiliations": ["Teen Titans", "Avengers"], "origin": "Atlantis", "is_villain": true }, @@ -63,10 +43,7 @@ "_id": "c6271161-bd78-4338-b6ca-88d91f7b853e", "name": "Debbie Green", "alias": "steventodd", - "powers": [ - "Energy Blasts", - "Regeneration" - ], + "powers": ["Energy Blasts", "Regeneration"], "first_appearance": "2021-12-05T00:00:00", "affiliations": [], "origin": "Asgard", @@ -76,11 +53,7 @@ "_id": "60223f4c-5908-4f82-a2a3-a5dad1771f7f", "name": "Christopher Elliott", "alias": "barajasmitchell", - "powers": [ - "Flight", - "Invisibility", - "Telekinesis" - ], + "powers": ["Flight", "Invisibility", "Telekinesis"], 
"first_appearance": "1947-03-23T00:00:00", "affiliations": [], "origin": "Earth", @@ -90,10 +63,7 @@ "_id": "f66a8f7a-9ca3-431a-9ece-aba96be18220", "name": "Tammy Murphy", "alias": "jessicagill", - "powers": [ - "Super Strength", - "Telekinesis" - ], + "powers": ["Super Strength", "Telekinesis"], "first_appearance": "2000-07-06T00:00:00", "affiliations": [], "origin": "Mutant", @@ -103,10 +73,7 @@ "_id": "817c0b11-3eac-4a3a-b55f-203126db060f", "name": "Scott Garcia", "alias": "whitechristie", - "powers": [ - "Telepathy", - "Energy Blasts" - ], + "powers": ["Telepathy", "Energy Blasts"], "first_appearance": "2000-11-22T00:00:00", "affiliations": [], "origin": "Asgard", @@ -116,14 +83,9 @@ "_id": "1ee6789f-d774-43b8-87e2-9f6dbac6230a", "name": "Julie Goodwin", "alias": "robertsmith", - "powers": [ - "Telepathy", - "Super Speed" - ], + "powers": ["Telepathy", "Super Speed"], "first_appearance": "1953-08-09T00:00:00", - "affiliations": [ - "Teen Titans" - ], + "affiliations": ["Teen Titans"], "origin": "Mutant", "is_villain": true }, @@ -131,11 +93,7 @@ "_id": "3ab9b55d-94ab-449e-bda9-63b2c633494a", "name": "Joshua Hicks", "alias": "cynthia32", - "powers": [ - "Super Strength", - "Invisibility", - "Telekinesis" - ], + "powers": ["Super Strength", "Invisibility", "Telekinesis"], "first_appearance": "1967-07-17T00:00:00", "affiliations": [], "origin": "Krypton", @@ -145,14 +103,9 @@ "_id": "51adf385-1f8e-4290-bcc6-ce2808dc461e", "name": "Justin Martinez", "alias": "janicebrown", - "powers": [ - "Super Speed", - "Super Strength" - ], + "powers": ["Super Speed", "Super Strength"], "first_appearance": "1973-09-19T00:00:00", - "affiliations": [ - "Avengers" - ], + "affiliations": ["Avengers"], "origin": "Mutant", "is_villain": true }, @@ -160,10 +113,7 @@ "_id": "3a3d934e-f5bb-4238-b8a5-74669a937a14", "name": "Holly Green", "alias": "ystanley", - "powers": [ - "Shapeshifting", - "Energy Blasts" - ], + "powers": ["Shapeshifting", "Energy Blasts"], "first_appearance": 
"2013-08-05T00:00:00", "affiliations": [], "origin": "Krypton", @@ -173,15 +123,9 @@ "_id": "f044b9fb-82c6-48b3-b8b2-806b0be66466", "name": "Margaret Hogan", "alias": "wendyconway", - "powers": [ - "Super Speed", - "Telepathy" - ], + "powers": ["Super Speed", "Telepathy"], "first_appearance": "1944-08-13T00:00:00", - "affiliations": [ - "Justice League", - "X-Men" - ], + "affiliations": ["Justice League", "X-Men"], "origin": "Earth", "is_villain": false }, @@ -189,14 +133,9 @@ "_id": "fd50880a-9d0e-43e1-8b20-2830eba8c7dc", "name": "Ashley Watkins", "alias": "cjohnson", - "powers": [ - "Shapeshifting" - ], + "powers": ["Shapeshifting"], "first_appearance": "1940-09-13T00:00:00", - "affiliations": [ - "Fantastic Four", - "Guardians of the Galaxy" - ], + "affiliations": ["Fantastic Four", "Guardians of the Galaxy"], "origin": "Mutant", "is_villain": true }, @@ -204,14 +143,9 @@ "_id": "68036d6b-1780-4352-98ea-2c68cb5c7bff", "name": "Tammy Bishop", "alias": "geoffreyryan", - "powers": [ - "Regeneration" - ], + "powers": ["Regeneration"], "first_appearance": "1984-11-04T00:00:00", - "affiliations": [ - "Fantastic Four", - "X-Men" - ], + "affiliations": ["Fantastic Four", "X-Men"], "origin": "Earth", "is_villain": true }, @@ -219,14 +153,9 @@ "_id": "dbfa84f2-e598-4e67-99a9-5e8c34e5606f", "name": "Michelle Valdez", "alias": "manuelcobb", - "powers": [ - "Regeneration", - "Energy Blasts" - ], + "powers": ["Regeneration", "Energy Blasts"], "first_appearance": "2014-08-04T00:00:00", - "affiliations": [ - "Teen Titans" - ], + "affiliations": ["Teen Titans"], "origin": "Mutant", "is_villain": false }, @@ -234,10 +163,7 @@ "_id": "ae85885c-13d0-4ae2-b82c-fa53859665d7", "name": "Joseph Cook", "alias": "scott40", - "powers": [ - "Telepathy", - "Telekinesis" - ], + "powers": ["Telepathy", "Telekinesis"], "first_appearance": "1976-04-01T00:00:00", "affiliations": [], "origin": "Earth", @@ -247,9 +173,7 @@ "_id": "0738b98f-4699-4609-9156-fb6a1085a503", "name": "Jeremy Rice", 
"alias": "james82", - "powers": [ - "Invisibility" - ], + "powers": ["Invisibility"], "first_appearance": "1977-09-22T00:00:00", "affiliations": [], "origin": "Asgard", @@ -259,13 +183,9 @@ "_id": "a072c5df-cc65-4044-ba24-fcc8eaa71b4a", "name": "Chad Pham", "alias": "smithjennifer", - "powers": [ - "Telepathy" - ], + "powers": ["Telepathy"], "first_appearance": "2001-05-26T00:00:00", - "affiliations": [ - "Teen Titans" - ], + "affiliations": ["Teen Titans"], "origin": "Mars", "is_villain": false }, @@ -273,11 +193,7 @@ "_id": "d545ec48-680c-4493-8650-d759bedabb7e", "name": "Diana Mata", "alias": "zwilliamson", - "powers": [ - "Super Speed", - "Energy Blasts", - "Invisibility" - ], + "powers": ["Super Speed", "Energy Blasts", "Invisibility"], "first_appearance": "2010-11-21T00:00:00", "affiliations": [], "origin": "Mars", @@ -287,15 +203,9 @@ "_id": "e6bfb576-d65c-40f8-a547-90719578e03c", "name": "Maria Wright", "alias": "yraymond", - "powers": [ - "Flight", - "Telepathy" - ], + "powers": ["Flight", "Telepathy"], "first_appearance": "1971-04-15T00:00:00", - "affiliations": [ - "Avengers", - "Teen Titans" - ], + "affiliations": ["Avengers", "Teen Titans"], "origin": "Asgard", "is_villain": true }, @@ -303,15 +213,9 @@ "_id": "a2e7b056-0c79-4a2e-83ff-1774b6e186ea", "name": "Carlos Burton", "alias": "rperkins", - "powers": [ - "Super Speed", - "Time Manipulation", - "Telekinesis" - ], + "powers": ["Super Speed", "Time Manipulation", "Telekinesis"], "first_appearance": "1970-01-20T00:00:00", - "affiliations": [ - "Teen Titans" - ], + "affiliations": ["Teen Titans"], "origin": "Mutant", "is_villain": true }, @@ -319,10 +223,7 @@ "_id": "ec7f8d60-3fef-4329-a7d2-6d89805d758c", "name": "Lindsay Anderson", "alias": "amycox", - "powers": [ - "Super Strength", - "Telekinesis" - ], + "powers": ["Super Strength", "Telekinesis"], "first_appearance": "1976-04-30T00:00:00", "affiliations": [], "origin": "Atlantis", @@ -332,16 +233,9 @@ "_id": "cdc66356-a438-4989-b4d1-315609ec6d91", 
"name": "Larry Hensley", "alias": "ylester", - "powers": [ - "Super Strength", - "Invisibility", - "Shapeshifting" - ], + "powers": ["Super Strength", "Invisibility", "Shapeshifting"], "first_appearance": "2019-01-21T00:00:00", - "affiliations": [ - "Guardians of the Galaxy", - "Avengers" - ], + "affiliations": ["Guardians of the Galaxy", "Avengers"], "origin": "Asgard", "is_villain": false }, @@ -349,10 +243,7 @@ "_id": "0952b684-f887-446f-afcb-71d2ace3fd32", "name": "Sandra Moss", "alias": "alexandra81", - "powers": [ - "Telekinesis", - "Super Speed" - ], + "powers": ["Telekinesis", "Super Speed"], "first_appearance": "1989-07-28T00:00:00", "affiliations": [], "origin": "Earth", @@ -362,14 +253,9 @@ "_id": "9a63c787-3b44-46c2-b927-ffdde6ee10bc", "name": "Cynthia Brown", "alias": "freed", - "powers": [ - "Super Strength", - "Energy Blasts" - ], + "powers": ["Super Strength", "Energy Blasts"], "first_appearance": "2015-06-19T00:00:00", - "affiliations": [ - "Fantastic Four" - ], + "affiliations": ["Fantastic Four"], "origin": "Mars", "is_villain": false }, @@ -377,11 +263,7 @@ "_id": "2b058c3e-e795-4ecd-b5d7-dba6f1a831f6", "name": "Brian Vincent", "alias": "ghowell", - "powers": [ - "Invisibility", - "Flight", - "Super Speed" - ], + "powers": ["Invisibility", "Flight", "Super Speed"], "first_appearance": "2012-05-12T00:00:00", "affiliations": [], "origin": "Asgard", @@ -391,16 +273,9 @@ "_id": "7a1e38ae-0bc6-41dd-ad61-e7542e6e9d4f", "name": "Kevin Humphrey", "alias": "mary44", - "powers": [ - "Super Strength", - "Super Speed", - "Telepathy" - ], + "powers": ["Super Strength", "Super Speed", "Telepathy"], "first_appearance": "1993-05-10T00:00:00", - "affiliations": [ - "Justice League", - "Teen Titans" - ], + "affiliations": ["Justice League", "Teen Titans"], "origin": "Mutant", "is_villain": true }, @@ -408,13 +283,9 @@ "_id": "c147036a-ab66-4023-a950-1fb81acf7dca", "name": "Luis Callahan", "alias": "ashleyreeves", - "powers": [ - "Telekinesis" - ], + "powers": 
["Telekinesis"], "first_appearance": "1943-11-02T00:00:00", - "affiliations": [ - "X-Men" - ], + "affiliations": ["X-Men"], "origin": "Krypton", "is_villain": false }, @@ -422,11 +293,7 @@ "_id": "c42cec2b-156d-481e-993b-aa93637ae76e", "name": "Micheal Brown", "alias": "lisa85", - "powers": [ - "Telepathy", - "Flight", - "Time Manipulation" - ], + "powers": ["Telepathy", "Flight", "Time Manipulation"], "first_appearance": "1983-11-04T00:00:00", "affiliations": [], "origin": "Krypton", @@ -436,14 +303,9 @@ "_id": "5bd85192-926b-42f3-bc18-afd40a53753e", "name": "James Sanchez", "alias": "mary95", - "powers": [ - "Energy Blasts", - "Telekinesis" - ], + "powers": ["Energy Blasts", "Telekinesis"], "first_appearance": "1999-05-20T00:00:00", - "affiliations": [ - "Justice League" - ], + "affiliations": ["Justice League"], "origin": "Atlantis", "is_villain": false }, @@ -451,16 +313,9 @@ "_id": "4b41e8f8-2cea-4d50-b7b0-ec59fca45367", "name": "Richard Cooper", "alias": "james85", - "powers": [ - "Telekinesis", - "Energy Blasts", - "Super Speed" - ], + "powers": ["Telekinesis", "Energy Blasts", "Super Speed"], "first_appearance": "2021-11-27T00:00:00", - "affiliations": [ - "Justice League", - "Fantastic Four" - ], + "affiliations": ["Justice League", "Fantastic Four"], "origin": "Mars", "is_villain": true }, @@ -468,9 +323,7 @@ "_id": "8fd8c7b5-fabd-4021-9aeb-114e64ad06e0", "name": "Charles Blair", "alias": "barbara60", - "powers": [ - "Super Strength" - ], + "powers": ["Super Strength"], "first_appearance": "2012-05-03T00:00:00", "affiliations": [], "origin": "Krypton", @@ -480,9 +333,7 @@ "_id": "830eaa54-4397-4344-8964-2abdd7e2d86d", "name": "Virginia Watts", "alias": "klane", - "powers": [ - "Telekinesis" - ], + "powers": ["Telekinesis"], "first_appearance": "2016-04-27T00:00:00", "affiliations": [], "origin": "Earth", @@ -492,9 +343,7 @@ "_id": "495f64a9-123e-46d4-9ddb-21692353a849", "name": "Robert Logan", "alias": "griffinsean", - "powers": [ - "Telepathy" - ], + 
"powers": ["Telepathy"], "first_appearance": "2003-07-16T00:00:00", "affiliations": [], "origin": "Krypton", @@ -504,10 +353,7 @@ "_id": "e3a96aac-bd9f-49f0-a9ea-efa7d6baf3e9", "name": "Cheyenne Powell", "alias": "laurenolsen", - "powers": [ - "Time Manipulation", - "Energy Blasts" - ], + "powers": ["Time Manipulation", "Energy Blasts"], "first_appearance": "1964-02-05T00:00:00", "affiliations": [], "origin": "Atlantis", @@ -517,16 +363,9 @@ "_id": "2688321c-f5b0-43c8-b95c-060e748ba73b", "name": "Benjamin Morris", "alias": "sierra18", - "powers": [ - "Telekinesis", - "Regeneration", - "Shapeshifting" - ], + "powers": ["Telekinesis", "Regeneration", "Shapeshifting"], "first_appearance": "1964-09-27T00:00:00", - "affiliations": [ - "X-Men", - "Avengers" - ], + "affiliations": ["X-Men", "Avengers"], "origin": "Mars", "is_villain": false }, @@ -534,9 +373,7 @@ "_id": "98c4ca66-c7a7-44ad-ad16-5395905a011e", "name": "Cristian Oneal", "alias": "harrellamy", - "powers": [ - "Super Speed" - ], + "powers": ["Super Speed"], "first_appearance": "1965-01-29T00:00:00", "affiliations": [], "origin": "Mutant", @@ -546,16 +383,9 @@ "_id": "e2999d26-1a93-4355-b04f-44f27a3c7f36", "name": "Jessica Vargas", "alias": "chadherrera", - "powers": [ - "Energy Blasts", - "Super Strength", - "Telekinesis" - ], + "powers": ["Energy Blasts", "Super Strength", "Telekinesis"], "first_appearance": "1974-03-29T00:00:00", - "affiliations": [ - "X-Men", - "Teen Titans" - ], + "affiliations": ["X-Men", "Teen Titans"], "origin": "Earth", "is_villain": true }, @@ -563,14 +393,10 @@ "_id": "f3fa712d-2124-433a-b405-c02757fa1503", "name": "Angelica Stein", "alias": "reedjason", - "powers": [ - "Invisibility" - ], + "powers": ["Invisibility"], "first_appearance": "1981-01-02T00:00:00", - "affiliations": [ - "Avengers" - ], + "affiliations": ["Avengers"], "origin": "Earth", "is_villain": true } -] \ No newline at end of file +] diff --git a/tests/accuracy/test-data-dumps/mflix.movies.json 
b/tests/accuracy/test-data-dumps/mflix.movies.json index cd35382e..3c492185 100644 --- a/tests/accuracy/test-data-dumps/mflix.movies.json +++ b/tests/accuracy/test-data-dumps/mflix.movies.json @@ -3,16 +3,9 @@ "_id": "bf96c9f7-17be-467c-9f5e-3f19dc2e9ed4", "title": "Human sell", "release_year": 1993, - "genres": [ - "Sci-Fi" - ], + "genres": ["Sci-Fi"], "director": "Christina Collins", - "cast": [ - "Jeremy Marks", - "Matthew Moore", - "Erica Miller", - "Beth Morales" - ], + "cast": ["Jeremy Marks", "Matthew Moore", "Erica Miller", "Beth Morales"], "runtime": 139, "rating": 9.3 }, @@ -20,10 +13,7 @@ "_id": "ab338dcb-c541-4d39-ba3d-58e4ebcac16c", "title": "Trial we much", "release_year": 2020, - "genres": [ - "Horror", - "Comedy" - ], + "genres": ["Horror", "Comedy"], "director": "Steven Miles", "cast": [ "Patrick Huynh", @@ -38,10 +28,7 @@ "_id": "2bd3ed9f-cbeb-4c44-bec7-01d51c3dd7db", "title": "Someone", "release_year": 1996, - "genres": [ - "Action", - "Horror" - ], + "genres": ["Action", "Horror"], "director": "Steven Miles", "cast": [ "Carrie Cummings", @@ -57,9 +44,7 @@ "_id": "fb35d6f3-bda5-450f-8873-56e035e76c42", "title": "Without our", "release_year": 2012, - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "director": "Christina Collins", "cast": [ "Rodney Gray", @@ -75,16 +60,9 @@ "_id": "4b0d5f7a-c551-4995-aece-a5a585d238a7", "title": "Cost anything", "release_year": 2002, - "genres": [ - "Romance", - "Action" - ], + "genres": ["Romance", "Action"], "director": "Bryan Andrews", - "cast": [ - "Gregory Mullins", - "Jillian Arroyo", - "Angela Reed" - ], + "cast": ["Gregory Mullins", "Jillian Arroyo", "Angela Reed"], "runtime": 112, "rating": 3.8 }, @@ -92,9 +70,7 @@ "_id": "797e4ee5-eff4-45f4-a0d7-40f62f7bd138", "title": "Hold green energy their", "release_year": 1989, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Christina Collins", "cast": [ "Eduardo Carey", @@ -109,10 +85,7 @@ "_id": "1b81c45b-1d09-47dc-871f-ace109107446", 
"title": "Choose ability start", "release_year": 1990, - "genres": [ - "Drama", - "Comedy" - ], + "genres": ["Drama", "Comedy"], "director": "Bryan Andrews", "cast": [ "Tyler Daniels", @@ -127,15 +100,9 @@ "_id": "400a08be-f07b-416a-8cdc-46c9886b812b", "title": "Cover perhaps", "release_year": 2022, - "genres": [ - "Drama" - ], + "genres": ["Drama"], "director": "Daniel Wallace", - "cast": [ - "Victoria Price", - "Holly Ross", - "Michele Jones" - ], + "cast": ["Victoria Price", "Holly Ross", "Michele Jones"], "runtime": 173, "rating": 4.3 }, @@ -143,15 +110,9 @@ "_id": "4d4b5420-83e1-4ecd-9c86-238394a1fd0f", "title": "Policy particularly", "release_year": 2003, - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "director": "Brittany Parker", - "cast": [ - "Emily Haynes", - "Crystal Johnson", - "Ernest Jones" - ], + "cast": ["Emily Haynes", "Crystal Johnson", "Ernest Jones"], "runtime": 154, "rating": 6.6 }, @@ -159,10 +120,7 @@ "_id": "9a489559-ab9d-4dbb-b3e7-d65895b27704", "title": "Store care", "release_year": 2017, - "genres": [ - "Romance", - "Sci-Fi" - ], + "genres": ["Romance", "Sci-Fi"], "director": "Sara Stewart", "cast": [ "Katherine Matthews", @@ -178,10 +136,7 @@ "_id": "99e75e60-6466-4314-92c3-00c433a06600", "title": "Section close bad", "release_year": 2024, - "genres": [ - "Drama", - "Comedy" - ], + "genres": ["Drama", "Comedy"], "director": "Bryan Andrews", "cast": [ "Heather Marshall", @@ -196,16 +151,9 @@ "_id": "726d0c12-4bab-4684-b8e4-5ba795c88273", "title": "Become stand", "release_year": 2001, - "genres": [ - "Sci-Fi", - "Thriller" - ], + "genres": ["Sci-Fi", "Thriller"], "director": "Brian Martinez", - "cast": [ - "Robert Ross", - "Kimberly Williamson", - "Pam Wyatt" - ], + "cast": ["Robert Ross", "Kimberly Williamson", "Pam Wyatt"], "runtime": 162, "rating": 1.5 }, @@ -213,10 +161,7 @@ "_id": "aad23b4b-ddb9-48bd-9b48-b63da1874bb0", "title": "I case", "release_year": 2012, - "genres": [ - "Drama", - "Comedy" - ], + "genres": ["Drama", 
"Comedy"], "director": "Brittany Parker", "cast": [ "Justin Davis", @@ -231,15 +176,9 @@ "_id": "0d1ce099-18f1-4608-9c5b-5eb8b5870760", "title": "No organization style", "release_year": 2013, - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "director": "Christina Collins", - "cast": [ - "Benjamin Whitney", - "Joseph Bush", - "Barbara Griffin" - ], + "cast": ["Benjamin Whitney", "Joseph Bush", "Barbara Griffin"], "runtime": 167, "rating": 9.6 }, @@ -247,15 +186,9 @@ "_id": "15855c7b-ece2-4238-b995-57f6207509ea", "title": "Computer garden", "release_year": 2012, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Steven Miles", - "cast": [ - "Darlene Lee", - "Tina Wang", - "Nathan Mayo" - ], + "cast": ["Darlene Lee", "Tina Wang", "Nathan Mayo"], "runtime": 146, "rating": 6.5 }, @@ -263,15 +196,9 @@ "_id": "e8a6ff98-1e7e-4481-a467-39ebbfc79f67", "title": "Trip information feel", "release_year": 2008, - "genres": [ - "Action", - "Thriller" - ], + "genres": ["Action", "Thriller"], "director": "Brittany Parker", - "cast": [ - "Kelly Walsh", - "Michael Rocha" - ], + "cast": ["Kelly Walsh", "Michael Rocha"], "runtime": 148, "rating": 9.8 }, @@ -279,9 +206,7 @@ "_id": "ef95e7a5-7f73-462e-bd03-c924a8876a7b", "title": "It project low part", "release_year": 1992, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Christina Collins", "cast": [ "Sheena Murphy", @@ -297,9 +222,7 @@ "_id": "efd2f4f4-1004-4b4e-8bc9-390466a6f77a", "title": "Near attorney discuss", "release_year": 1983, - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "director": "Christina Collins", "cast": [ "Chase Myers", @@ -314,16 +237,9 @@ "_id": "07f2cb6e-819e-4ff4-b3ba-134d3d9af549", "title": "Whether know", "release_year": 2009, - "genres": [ - "Comedy", - "Thriller" - ], + "genres": ["Comedy", "Thriller"], "director": "Bryan Andrews", - "cast": [ - "Amy Reed", - "William Williams", - "Steven Lawrence" - ], + "cast": ["Amy Reed", "William Williams", "Steven 
Lawrence"], "runtime": 134, "rating": 9.6 }, @@ -331,10 +247,7 @@ "_id": "ab5948c9-088b-42d6-89d9-42c4603c8b19", "title": "Against place", "release_year": 2017, - "genres": [ - "Drama", - "Romance" - ], + "genres": ["Drama", "Romance"], "director": "Daniel Wallace", "cast": [ "Brittany Thompson", @@ -350,16 +263,9 @@ "_id": "ef7f63fa-b25f-4aea-98e2-d7bdecc26ef5", "title": "Return yard", "release_year": 1994, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Christina Collins", - "cast": [ - "Mason Lara", - "Taylor Salinas", - "Tim Foster", - "Erin Sharp" - ], + "cast": ["Mason Lara", "Taylor Salinas", "Tim Foster", "Erin Sharp"], "runtime": 99, "rating": 8.8 }, @@ -367,9 +273,7 @@ "_id": "b532e3c8-6292-4f9d-879f-1f070b1a6992", "title": "Certain fish", "release_year": 2009, - "genres": [ - "Romance" - ], + "genres": ["Romance"], "director": "Steven Miles", "cast": [ "Jonathan King", @@ -384,9 +288,7 @@ "_id": "c95e74b0-e47e-4d10-b847-8caa20b94b32", "title": "Agreement like program", "release_year": 2004, - "genres": [ - "Sci-Fi" - ], + "genres": ["Sci-Fi"], "director": "Daniel Jackson", "cast": [ "Ashley Green", @@ -402,14 +304,9 @@ "_id": "791688be-4358-45ab-956e-71fe3fd35d19", "title": "Floor seven then", "release_year": 2009, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Daniel Wallace", - "cast": [ - "Dustin Wright", - "Crystal Young" - ], + "cast": ["Dustin Wright", "Crystal Young"], "runtime": 143, "rating": 4.8 }, @@ -417,16 +314,9 @@ "_id": "488fd79d-dde6-4462-9b90-339d1f3d7474", "title": "Like rather paper", "release_year": 2006, - "genres": [ - "Drama" - ], + "genres": ["Drama"], "director": "Spencer Gillespie", - "cast": [ - "Sean Moyer", - "James Edwards", - "Tara Lee", - "Robert Scott" - ], + "cast": ["Sean Moyer", "James Edwards", "Tara Lee", "Robert Scott"], "runtime": 175, "rating": 9.1 }, @@ -434,10 +324,7 @@ "_id": "3da68e4d-ef14-4fab-9243-19075262e5ca", "title": "Argue hospital", "release_year": 1994, - 
"genres": [ - "Romance", - "Sci-Fi" - ], + "genres": ["Romance", "Sci-Fi"], "director": "Amanda Young", "cast": [ "Carolyn Williams", @@ -453,15 +340,9 @@ "_id": "f5206a16-4dca-4c1e-b3aa-0d09f2082601", "title": "Become after card", "release_year": 1986, - "genres": [ - "Sci-Fi", - "Horror" - ], + "genres": ["Sci-Fi", "Horror"], "director": "Brian Martinez", - "cast": [ - "Rhonda Ochoa", - "Charlene Castillo" - ], + "cast": ["Rhonda Ochoa", "Charlene Castillo"], "runtime": 100, "rating": 8.5 }, @@ -469,14 +350,9 @@ "_id": "fbf30e42-ae6d-4775-bb3e-c5c127ddea06", "title": "Born authority attention", "release_year": 1994, - "genres": [ - "Romance" - ], + "genres": ["Romance"], "director": "Brian Martinez", - "cast": [ - "Matthew Thomas", - "Carly Perkins" - ], + "cast": ["Matthew Thomas", "Carly Perkins"], "runtime": 131, "rating": 4.9 }, @@ -484,15 +360,9 @@ "_id": "4b85a220-8a09-46a7-bea3-a2dad8130311", "title": "Local seven media", "release_year": 1998, - "genres": [ - "Sci-Fi", - "Drama" - ], + "genres": ["Sci-Fi", "Drama"], "director": "Amanda Young", - "cast": [ - "Jessica Perez", - "Larry Atkinson" - ], + "cast": ["Jessica Perez", "Larry Atkinson"], "runtime": 95, "rating": 2.0 }, @@ -500,14 +370,9 @@ "_id": "498597d2-3254-46ef-a800-f322a86fbd55", "title": "Keep employee", "release_year": 1981, - "genres": [ - "Horror" - ], + "genres": ["Horror"], "director": "Christina Collins", - "cast": [ - "Alexis Carlson", - "Andrew Stewart" - ], + "cast": ["Alexis Carlson", "Andrew Stewart"], "runtime": 161, "rating": 6.0 }, @@ -515,15 +380,9 @@ "_id": "788d9343-6908-4762-88ee-b04aba1e58b5", "title": "American question generation", "release_year": 1986, - "genres": [ - "Romance" - ], + "genres": ["Romance"], "director": "Daniel Jackson", - "cast": [ - "Troy Carter", - "Peter Hernandez", - "Christine Brown" - ], + "cast": ["Troy Carter", "Peter Hernandez", "Christine Brown"], "runtime": 176, "rating": 8.0 }, @@ -531,16 +390,9 @@ "_id": 
"74bcf255-df91-40c0-85c0-d7b85ff84f9a", "title": "Maintain out", "release_year": 2000, - "genres": [ - "Sci-Fi", - "Action" - ], + "genres": ["Sci-Fi", "Action"], "director": "Brian Martinez", - "cast": [ - "Nancy Evans", - "Michael Gill", - "Justin Carroll" - ], + "cast": ["Nancy Evans", "Michael Gill", "Justin Carroll"], "runtime": 179, "rating": 10.0 }, @@ -548,10 +400,7 @@ "_id": "61ddf1d4-17b7-4c63-9bf4-5315e740dc7f", "title": "Ten box study", "release_year": 2011, - "genres": [ - "Horror", - "Romance" - ], + "genres": ["Horror", "Romance"], "director": "Steven Miles", "cast": [ "Mark Hicks", @@ -566,17 +415,9 @@ "_id": "ab7d8067-f0ff-4955-bc0c-baca4e56e9a4", "title": "Production operation", "release_year": 2014, - "genres": [ - "Horror", - "Romance" - ], + "genres": ["Horror", "Romance"], "director": "Sara Stewart", - "cast": [ - "Ashley Mata", - "Mark Kelly", - "John West", - "Harold Day" - ], + "cast": ["Ashley Mata", "Mark Kelly", "John West", "Harold Day"], "runtime": 125, "rating": 4.1 }, @@ -584,9 +425,7 @@ "_id": "ccd27288-a496-447d-b01c-1f0b42edcc92", "title": "What language", "release_year": 2004, - "genres": [ - "Sci-Fi" - ], + "genres": ["Sci-Fi"], "director": "Sara Stewart", "cast": [ "Scott Mckenzie", @@ -602,16 +441,9 @@ "_id": "b32dd176-938b-4ded-823a-311423fdc2ea", "title": "Up usually central", "release_year": 2011, - "genres": [ - "Sci-Fi", - "Comedy" - ], + "genres": ["Sci-Fi", "Comedy"], "director": "Daniel Jackson", - "cast": [ - "Jennifer Carlson", - "Jonathan Stewart DDS", - "Amy Lester" - ], + "cast": ["Jennifer Carlson", "Jonathan Stewart DDS", "Amy Lester"], "runtime": 159, "rating": 5.6 }, @@ -619,17 +451,9 @@ "_id": "4aa5f384-3a05-49ff-aa9d-a0e4256c422f", "title": "For boy only", "release_year": 1987, - "genres": [ - "Thriller", - "Action" - ], + "genres": ["Thriller", "Action"], "director": "Sara Stewart", - "cast": [ - "Gene Smith", - "Robert Osborne Jr.", - "Laura Fox", - "Alexis Lowe" - ], + "cast": ["Gene Smith", "Robert 
Osborne Jr.", "Laura Fox", "Alexis Lowe"], "runtime": 95, "rating": 3.6 }, @@ -637,9 +461,7 @@ "_id": "1c858ca4-d6e9-435c-8e25-d8b05a4e825c", "title": "Site win including your", "release_year": 2008, - "genres": [ - "Sci-Fi" - ], + "genres": ["Sci-Fi"], "director": "Spencer Gillespie", "cast": [ "John Williams", @@ -655,15 +477,9 @@ "_id": "bc5e5766-e998-4ec2-a40c-62ce5d39b972", "title": "Sell huge hair", "release_year": 1997, - "genres": [ - "Thriller", - "Action" - ], + "genres": ["Thriller", "Action"], "director": "Bryan Andrews", - "cast": [ - "Thomas Johnson", - "Ryan Morrow" - ], + "cast": ["Thomas Johnson", "Ryan Morrow"], "runtime": 157, "rating": 4.4 }, @@ -671,17 +487,10 @@ "_id": "090215c8-29e8-4d38-ae9b-ceb78408b982", "title": "Guy rest", "release_year": 1997, - "genres": [ - "Sci-Fi", - "Horror" - ], + "genres": ["Sci-Fi", "Horror"], "director": "Steven Miles", - "cast": [ - "Michael Fox", - "Tyler Acosta", - "Tracy Adams" - ], + "cast": ["Michael Fox", "Tyler Acosta", "Tracy Adams"], "runtime": 122, "rating": 7.8 } -] \ No newline at end of file +] diff --git a/tests/accuracy/test-data-dumps/mflix.shows.json b/tests/accuracy/test-data-dumps/mflix.shows.json index e91c26bb..2edc7fa7 100644 --- a/tests/accuracy/test-data-dumps/mflix.shows.json +++ b/tests/accuracy/test-data-dumps/mflix.shows.json @@ -5,9 +5,7 @@ "seasons": 8, "episodes": 62, "platform": "Amazon Prime", - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "cast": [ "Roger Gomez", "Sandra Williams", @@ -25,14 +23,8 @@ "seasons": 4, "episodes": 108, "platform": "Hulu", - "genres": [ - "Thriller" - ], - "cast": [ - "Joseph Holmes", - "Patrick Smith", - "Charles Delacruz" - ], + "genres": ["Thriller"], + "cast": ["Joseph Holmes", "Patrick Smith", "Charles Delacruz"], "start_year": 2001, "end_year": null }, @@ -42,10 +34,7 @@ "seasons": 6, "episodes": 49, "platform": "HBO", - "genres": [ - "Comedy", - "Documentary" - ], + "genres": ["Comedy", "Documentary"], "cast": [ "Jason Castillo", 
"Jessica Burke", @@ -62,15 +51,8 @@ "seasons": 5, "episodes": 23, "platform": "Amazon Prime", - "genres": [ - "Comedy", - "Thriller" - ], - "cast": [ - "Mark Allen", - "Anthony Snyder", - "Kimberly Jones" - ], + "genres": ["Comedy", "Thriller"], + "cast": ["Mark Allen", "Anthony Snyder", "Kimberly Jones"], "start_year": 2002, "end_year": null }, @@ -80,16 +62,8 @@ "seasons": 1, "episodes": 12, "platform": "Amazon Prime", - "genres": [ - "Crime", - "Documentary" - ], - "cast": [ - "Matthew Green", - "Kelly Wright", - "Tonya Sullivan", - "Daniel Brown" - ], + "genres": ["Crime", "Documentary"], + "cast": ["Matthew Green", "Kelly Wright", "Tonya Sullivan", "Daniel Brown"], "start_year": 2009, "end_year": 2020 }, @@ -99,14 +73,8 @@ "seasons": 10, "episodes": 76, "platform": "Amazon Prime", - "genres": [ - "Drama" - ], - "cast": [ - "Stacey Shaw", - "Zachary Steele", - "Laurie Martinez" - ], + "genres": ["Drama"], + "cast": ["Stacey Shaw", "Zachary Steele", "Laurie Martinez"], "start_year": 2011, "end_year": 2020 }, @@ -116,15 +84,8 @@ "seasons": 5, "episodes": 73, "platform": "HBO", - "genres": [ - "Thriller" - ], - "cast": [ - "Diane Boyd", - "Anna Rubio", - "Cheryl Fisher", - "Tyler Villa" - ], + "genres": ["Thriller"], + "cast": ["Diane Boyd", "Anna Rubio", "Cheryl Fisher", "Tyler Villa"], "start_year": 2008, "end_year": 2020 }, @@ -134,9 +95,7 @@ "seasons": 2, "episodes": 114, "platform": "Amazon Prime", - "genres": [ - "Fantasy" - ], + "genres": ["Fantasy"], "cast": [ "Kathleen Marshall", "Kimberly Quinn", @@ -154,9 +113,7 @@ "seasons": 3, "episodes": 55, "platform": "Disney+", - "genres": [ - "Drama" - ], + "genres": ["Drama"], "cast": [ "Barbara Clark", "Carolyn Scott", @@ -173,16 +130,8 @@ "seasons": 4, "episodes": 61, "platform": "Amazon Prime", - "genres": [ - "Comedy", - "Fantasy" - ], - "cast": [ - "Adam Lin", - "Evan Smith", - "Christine Howard", - "Ruben Hopkins" - ], + "genres": ["Comedy", "Fantasy"], + "cast": ["Adam Lin", "Evan Smith", "Christine 
Howard", "Ruben Hopkins"], "start_year": 2006, "end_year": 2023 }, @@ -192,9 +141,7 @@ "seasons": 1, "episodes": 90, "platform": "HBO", - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "cast": [ "Eric Ryan", "Ashley Ball", @@ -211,10 +158,7 @@ "seasons": 10, "episodes": 69, "platform": "Hulu", - "genres": [ - "Documentary", - "Fantasy" - ], + "genres": ["Documentary", "Fantasy"], "cast": [ "Mrs. Olivia Booth", "William Murphy", @@ -232,14 +176,8 @@ "seasons": 3, "episodes": 89, "platform": "Disney+", - "genres": [ - "Crime" - ], - "cast": [ - "Elizabeth Lambert", - "Corey Hughes", - "Melissa Stephens" - ], + "genres": ["Crime"], + "cast": ["Elizabeth Lambert", "Corey Hughes", "Melissa Stephens"], "start_year": 2006, "end_year": null }, @@ -249,15 +187,8 @@ "seasons": 9, "episodes": 73, "platform": "Disney+", - "genres": [ - "Documentary", - "Drama" - ], - "cast": [ - "Shane Richardson", - "Lisa Cooper", - "Samantha Perkins" - ], + "genres": ["Documentary", "Drama"], + "cast": ["Shane Richardson", "Lisa Cooper", "Samantha Perkins"], "start_year": 2008, "end_year": null }, @@ -267,14 +198,8 @@ "seasons": 8, "episodes": 40, "platform": "Netflix", - "genres": [ - "Crime" - ], - "cast": [ - "Patricia Barrett", - "Scott Gonzalez", - "Michaela Johnson" - ], + "genres": ["Crime"], + "cast": ["Patricia Barrett", "Scott Gonzalez", "Michaela Johnson"], "start_year": 2006, "end_year": null }, @@ -284,14 +209,8 @@ "seasons": 8, "episodes": 61, "platform": "Hulu", - "genres": [ - "Drama" - ], - "cast": [ - "Christie Waters", - "Casey Allen", - "Nicole Frank" - ], + "genres": ["Drama"], + "cast": ["Christie Waters", "Casey Allen", "Nicole Frank"], "start_year": 2001, "end_year": 2005 }, @@ -301,9 +220,7 @@ "seasons": 10, "episodes": 89, "platform": "Hulu", - "genres": [ - "Drama" - ], + "genres": ["Drama"], "cast": [ "Pedro Butler", "Christian Hall", @@ -321,9 +238,7 @@ "seasons": 5, "episodes": 11, "platform": "Hulu", - "genres": [ - "Drama" - ], + "genres": ["Drama"], 
"cast": [ "Deborah Garcia", "Michelle Barajas", @@ -339,10 +254,7 @@ "seasons": 1, "episodes": 29, "platform": "Amazon Prime", - "genres": [ - "Fantasy", - "Documentary" - ], + "genres": ["Fantasy", "Documentary"], "cast": [ "Grace Rodriguez", "Alison Greene", @@ -358,9 +270,7 @@ "seasons": 9, "episodes": 111, "platform": "Disney+", - "genres": [ - "Documentary" - ], + "genres": ["Documentary"], "cast": [ "Emily Irwin", "Olivia Gibson", @@ -376,10 +286,7 @@ "seasons": 8, "episodes": 108, "platform": "Hulu", - "genres": [ - "Drama", - "Crime" - ], + "genres": ["Drama", "Crime"], "cast": [ "Karen Phillips", "Kelly Marsh", @@ -395,10 +302,7 @@ "seasons": 6, "episodes": 66, "platform": "Amazon Prime", - "genres": [ - "Crime", - "Documentary" - ], + "genres": ["Crime", "Documentary"], "cast": [ "Bradley Chavez", "Catherine Horn", @@ -414,15 +318,8 @@ "seasons": 9, "episodes": 22, "platform": "Hulu", - "genres": [ - "Drama" - ], - "cast": [ - "Eric Lee", - "Patrick Estrada", - "Kelsey Brown", - "Jeffrey Lewis" - ], + "genres": ["Drama"], + "cast": ["Eric Lee", "Patrick Estrada", "Kelsey Brown", "Jeffrey Lewis"], "start_year": 2001, "end_year": null }, @@ -432,9 +329,7 @@ "seasons": 5, "episodes": 35, "platform": "Hulu", - "genres": [ - "Crime" - ], + "genres": ["Crime"], "cast": [ "Chad Torres", "Mark Williams", @@ -451,10 +346,7 @@ "seasons": 2, "episodes": 94, "platform": "Netflix", - "genres": [ - "Thriller", - "Fantasy" - ], + "genres": ["Thriller", "Fantasy"], "cast": [ "Catherine Davila", "Jessica James", @@ -471,10 +363,7 @@ "seasons": 2, "episodes": 87, "platform": "Hulu", - "genres": [ - "Drama", - "Fantasy" - ], + "genres": ["Drama", "Fantasy"], "cast": [ "Tiffany Brown", "Christina Morales", @@ -491,14 +380,8 @@ "seasons": 5, "episodes": 56, "platform": "Netflix", - "genres": [ - "Comedy" - ], - "cast": [ - "James Durham", - "Jessica Myers", - "Rachel King" - ], + "genres": ["Comedy"], + "cast": ["James Durham", "Jessica Myers", "Rachel King"], "start_year": 
2005, "end_year": null }, @@ -508,10 +391,7 @@ "seasons": 4, "episodes": 99, "platform": "Disney+", - "genres": [ - "Crime", - "Fantasy" - ], + "genres": ["Crime", "Fantasy"], "cast": [ "Robert Foster", "Jill Barton", @@ -527,10 +407,7 @@ "seasons": 9, "episodes": 24, "platform": "Amazon Prime", - "genres": [ - "Drama", - "Crime" - ], + "genres": ["Drama", "Crime"], "cast": [ "Carl Johnson", "Douglas Beck", @@ -548,15 +425,8 @@ "seasons": 10, "episodes": 117, "platform": "HBO", - "genres": [ - "Crime", - "Fantasy" - ], - "cast": [ - "Carol Miller", - "Jennifer Bass", - "Melanie Leblanc" - ], + "genres": ["Crime", "Fantasy"], + "cast": ["Carol Miller", "Jennifer Bass", "Melanie Leblanc"], "start_year": 2002, "end_year": null }, @@ -566,10 +436,7 @@ "seasons": 1, "episodes": 58, "platform": "Hulu", - "genres": [ - "Crime", - "Drama" - ], + "genres": ["Crime", "Drama"], "cast": [ "James Warren", "Kelly Carter", @@ -586,9 +453,7 @@ "seasons": 6, "episodes": 71, "platform": "Netflix", - "genres": [ - "Documentary" - ], + "genres": ["Documentary"], "cast": [ "Sarah Brown", "Patrick Beck", @@ -604,14 +469,8 @@ "seasons": 4, "episodes": 16, "platform": "Hulu", - "genres": [ - "Fantasy" - ], - "cast": [ - "Gabrielle Meyer", - "Madison Matthews", - "Taylor Martinez" - ], + "genres": ["Fantasy"], + "cast": ["Gabrielle Meyer", "Madison Matthews", "Taylor Martinez"], "start_year": 2010, "end_year": null }, @@ -621,14 +480,8 @@ "seasons": 1, "episodes": 79, "platform": "Hulu", - "genres": [ - "Fantasy" - ], - "cast": [ - "Michael Lewis", - "Cassandra Hicks", - "Sydney Garcia" - ], + "genres": ["Fantasy"], + "cast": ["Michael Lewis", "Cassandra Hicks", "Sydney Garcia"], "start_year": 2015, "end_year": 2023 }, @@ -638,16 +491,8 @@ "seasons": 7, "episodes": 82, "platform": "Hulu", - "genres": [ - "Crime", - "Fantasy" - ], - "cast": [ - "Keith Brown", - "Annette Johnson", - "Joseph Carroll", - "Derek Lewis" - ], + "genres": ["Crime", "Fantasy"], + "cast": ["Keith Brown", "Annette 
Johnson", "Joseph Carroll", "Derek Lewis"], "start_year": 2006, "end_year": 2008 }, @@ -657,10 +502,7 @@ "seasons": 2, "episodes": 52, "platform": "Amazon Prime", - "genres": [ - "Fantasy", - "Drama" - ], + "genres": ["Fantasy", "Drama"], "cast": [ "Garrett Mcgrath", "Craig Jackson", @@ -676,16 +518,8 @@ "seasons": 1, "episodes": 113, "platform": "Netflix", - "genres": [ - "Thriller", - "Comedy" - ], - "cast": [ - "Matthew Hill", - "Andrew White", - "Grant Young", - "John Mathews" - ], + "genres": ["Thriller", "Comedy"], + "cast": ["Matthew Hill", "Andrew White", "Grant Young", "John Mathews"], "start_year": 2015, "end_year": 2020 }, @@ -695,9 +529,7 @@ "seasons": 3, "episodes": 40, "platform": "Netflix", - "genres": [ - "Comedy" - ], + "genres": ["Comedy"], "cast": [ "Matthew Gordon", "Mark Allen", @@ -715,10 +547,7 @@ "seasons": 10, "episodes": 106, "platform": "HBO", - "genres": [ - "Fantasy", - "Drama" - ], + "genres": ["Fantasy", "Drama"], "cast": [ "Elizabeth Taylor", "Melissa Mullins", @@ -735,16 +564,9 @@ "seasons": 3, "episodes": 88, "platform": "HBO", - "genres": [ - "Thriller", - "Drama" - ], - "cast": [ - "Amy Aguilar", - "James Williams", - "Kevin Kirby" - ], + "genres": ["Thriller", "Drama"], + "cast": ["Amy Aguilar", "James Williams", "Kevin Kirby"], "start_year": 2010, "end_year": 2025 } -] \ No newline at end of file +] From 746d7eb09d6d608e76e999c9dfea1740bdd59a0c Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 13:05:24 +0200 Subject: [PATCH 29/47] chore: lint fixes --- tests/accuracy/delete-many.test.ts | 1 - tests/accuracy/sdk/accuracy-scorers.ts | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts index 4d50169d..f9c03740 100644 --- a/tests/accuracy/delete-many.test.ts +++ b/tests/accuracy/delete-many.test.ts @@ -1,7 +1,6 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from 
"./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { deleteManyResponse } from "../../src/tools/mongodb/delete/deleteMany.js"; function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/sdk/accuracy-scorers.ts b/tests/accuracy/sdk/accuracy-scorers.ts index 7bd8b969..fd692ac9 100644 --- a/tests/accuracy/sdk/accuracy-scorers.ts +++ b/tests/accuracy/sdk/accuracy-scorers.ts @@ -129,6 +129,5 @@ function compareParams(expected: unknown, actual: unknown): number { return minScore; } - // eslint-disable-next-line eqeqeq return expected == actual ? 1 : 0; } From f84bf4309375eab35f651a34de74081c3d345c23 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 16:06:21 +0200 Subject: [PATCH 30/47] chore: simplified toolCallingAccuracy calculation --- package-lock.json | 7 + tests/accuracy/sdk/accuracy-scorers.ts | 155 +++++------------- .../mdb-snapshot-storage.ts | 4 +- .../snapshot-storage.ts | 20 ++- tests/accuracy/sdk/accuracy-testing-client.ts | 6 +- tests/accuracy/sdk/describe-accuracy-tests.ts | 18 +- tests/accuracy/sdk/models.ts | 33 +++- 7 files changed, 107 insertions(+), 136 deletions(-) diff --git a/package-lock.json b/package-lock.json index 865dcb14..9a4282e7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12428,6 +12428,13 @@ "node": ">= 0.6" } }, + "node_modules/microdiff": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/microdiff/-/microdiff-1.5.0.tgz", + "integrity": "sha512-Drq+/THMvDdzRYrK0oxJmOKiC24ayUV8ahrt8l3oRK51PWt6gdtrIGrlIH3pT/lFh1z93FbAcidtsHcWbnRz8Q==", + "dev": true, + "license": "MIT" + }, "node_modules/micromatch": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", diff --git a/tests/accuracy/sdk/accuracy-scorers.ts b/tests/accuracy/sdk/accuracy-scorers.ts index fd692ac9..612c3f80 100644 --- a/tests/accuracy/sdk/accuracy-scorers.ts +++ 
b/tests/accuracy/sdk/accuracy-scorers.ts @@ -1,133 +1,60 @@ -export type ToolCall = { - toolCallId: string; - toolName: string; - parameters: unknown; -}; -export type ExpectedToolCall = Omit; +import diff from "microdiff"; +import { ExpectedToolCall, ActualToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; -export function toolCallingAccuracyScorer(expectedToolCalls: ExpectedToolCall[], actualToolCalls: ToolCall[]): number { - if (actualToolCalls.length < expectedToolCalls.length) { - return 0; - } - - const possibleScore = actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1; - const checkedToolCallIds = new Set(); - for (const expectedToolCall of expectedToolCalls) { - const matchingActualToolCall = actualToolCalls.find( - (actualToolCall) => - actualToolCall.toolName === expectedToolCall.toolName && - !checkedToolCallIds.has(actualToolCall.toolCallId) - ); - - if (!matchingActualToolCall) { - return 0; - } - - checkedToolCallIds.add(matchingActualToolCall.toolCallId); - } - - return possibleScore; -} - -export function parameterMatchingAccuracyScorer( +export function calculateToolCallingAccuracy( expectedToolCalls: ExpectedToolCall[], - actualToolCalls: ToolCall[] + actualToolCalls: ActualToolCall[] ): number { if (expectedToolCalls.length === 0) { - return 1; + return actualToolCalls.length === 0 ? 1 : 0.75; } - const usedActualIndexes = new Set(); - const scores: number[] = []; + const maxAccuracy = actualToolCalls.length > expectedToolCalls.length ? 
0.75 : 1; + + const individualAccuracies: number[] = []; + const checkedActualToolCallIndexes = new Set(); for (const expectedCall of expectedToolCalls) { - // Find all unmatched actual tool calls with the same tool name const candidates = actualToolCalls .map((call, index) => ({ call, index })) - .filter(({ call, index }) => !usedActualIndexes.has(index) && call.toolName === expectedCall.toolName); - - if (candidates.length === 0) { - scores.push(0); - continue; - } - - // Pick the candidate with the best parameter match - let bestScore = -1; - let bestIndex = -1; - for (const { call, index } of candidates) { - const score = compareParams(expectedCall.parameters, call.parameters); - if (score > bestScore) { - bestScore = score; - bestIndex = index; - } - } - - usedActualIndexes.add(bestIndex); - scores.push(bestScore); - } - - const totalScore = scores.reduce((sum, score) => sum + score, 0); - return totalScore / scores.length; + .filter( + ({ call, index }) => !checkedActualToolCallIndexes.has(index) && call.toolName === expectedCall.toolName + ) + .map(({ call, index }) => ({ + call, + index, + score: compareParams(expectedCall.parameters, call.parameters), + })) + .filter(({ score }) => score >= 0.75) + .sort((a, b) => b.score - a.score); + + const bestMatch = candidates[0]; + if (!bestMatch) { + individualAccuracies.push(0); + } else { + checkedActualToolCallIndexes.add(bestMatch.index); + const individualAccuracy = Math.min(bestMatch.score, maxAccuracy); + individualAccuracies.push(individualAccuracy); + } + } + + return Math.min(...individualAccuracies); } -/** - * Recursively compares expected and actual parameters and returns a score. - * - 1: Perfect match. - * - 0.75: All expected parameters are present and match, but there are extra actual parameters. - * - 0: Missing parameters or mismatched values. 
- */ -function compareParams(expected: unknown, actual: unknown): number { - if (expected === null || expected === undefined) { - return actual === null || actual === undefined ? 1 : 0; - } - if (actual === null || actual === undefined) { - return 0; - } +function compareParams(expected: Record, actual: Record): number { + const differences = diff(expected, actual); - if (Array.isArray(expected)) { - if (!Array.isArray(actual) || actual.length < expected.length) { - return 0; - } - let minScore = 1; - for (let i = 0; i < expected.length; i++) { - minScore = Math.min(minScore, compareParams(expected[i], actual[i])); - } - if (minScore === 0) { - return 0; - } - if (actual.length > expected.length) { - minScore = Math.min(minScore, 0.75); - } - return minScore; + if (differences.length === 0) { + return 1; } - if (typeof expected === "object") { - if (typeof actual !== "object" || Array.isArray(actual)) { - return 0; - } - const expectedKeys = Object.keys(expected as Record); - const actualKeys = Object.keys(actual as Record); - - let minScore = 1; - for (const key of expectedKeys) { - if (!Object.prototype.hasOwnProperty.call(actual, key)) { - return 0; - } - minScore = Math.min( - minScore, - compareParams((expected as Record)[key], (actual as Record)[key]) - ); - } + const hasOnlyAdditions = differences.every((d) => d.type === "CREATE"); + const hasRemovals = differences.some((d) => d.type === "REMOVE"); + const hasChanges = differences.some((d) => d.type === "CHANGE"); - if (minScore === 0) { - return 0; - } - - if (actualKeys.length > expectedKeys.length) { - minScore = Math.min(minScore, 0.75); - } - return minScore; + if (hasOnlyAdditions && !hasRemovals && !hasChanges) { + return 0.75; } - return expected == actual ? 
1 : 0; + return 0; } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index c93abe12..48aac4e8 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -28,11 +28,13 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { async createSnapshotEntry( snapshotEntry: Pick< AccuracySnapshotEntry, + | "provider" | "requestedModel" | "test" | "prompt" | "toolCallingAccuracy" - | "parameterAccuracy" + | "expectedToolCalls" + | "actualToolCalls" | "llmResponseTime" | "tokensUsage" | "respondingModel" diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index eb0e453f..b254787c 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -1,16 +1,30 @@ import z from "zod"; +const ExpectedToolCallSchema = z.object({ + toolCallId: z.string(), + toolName: z.string(), + parameters: z.record(z.string(), z.unknown()), +}); + +const ActualToolCallSchema = ExpectedToolCallSchema.omit({ toolCallId: undefined }); + +export type ExpectedToolCall = z.infer; +export type ActualToolCall = z.infer; + export const AccuracySnapshotEntrySchema = z.object({ // Git and meta information for snapshot entries accuracyRunId: z.string(), createdOn: z.number(), commitSHA: z.string(), // Accuracy info + provider: z.string(), requestedModel: z.string(), test: z.string(), prompt: z.string(), toolCallingAccuracy: z.number(), - parameterAccuracy: z.number(), + // debug info for further investigations + expectedToolCalls: ExpectedToolCallSchema.array(), + actualToolCalls: ActualToolCallSchema.array(), llmResponseTime: z.number(), tokensUsage: z .object({ @@ -30,11 +44,13 @@ export interface 
AccuracySnapshotStorage { createSnapshotEntry( snapshotEntry: Pick< AccuracySnapshotEntry, + | "provider" | "requestedModel" | "test" | "prompt" | "toolCallingAccuracy" - | "parameterAccuracy" + | "expectedToolCalls" + | "actualToolCalls" | "llmResponseTime" | "tokensUsage" | "respondingModel" diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts index b12017d7..8c5f27ad 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -5,7 +5,7 @@ import { experimental_createMCPClient as createMCPClient, tool as createVercelTo import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; -import { ToolCall } from "./accuracy-scorers.js"; +import { ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; const __dirname = fileURLToPath(import.meta.url); const distPath = path.join(__dirname, "..", "..", "..", "..", "dist"); @@ -16,7 +16,7 @@ export type MockedTools = Record; export class AccuracyTestingClient { private mockedTools: MockedTools = {}; - private recordedToolCalls: ToolCall[] = []; + private recordedToolCalls: ExpectedToolCall[] = []; private constructor(private readonly vercelMCPClient: Awaited>) {} async close() { @@ -33,7 +33,7 @@ export class AccuracyTestingClient { this.recordedToolCalls.push({ toolCallId: uuid(), toolName: toolName, - parameters: args, + parameters: args as Record, }); try { const toolResultGeneratorFn = this.mockedTools[toolName]; diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 5670207a..f472c7f2 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,10 +1,10 @@ import { TestableModels } from "./models.js"; -import { ExpectedToolCall, parameterMatchingAccuracyScorer, toolCallingAccuracyScorer } from 
"./accuracy-scorers.js"; +import { calculateToolCallingAccuracy } from "./accuracy-scorers.js"; import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js"; -import { AccuracySnapshotStorage } from "./accuracy-snapshot-storage/snapshot-storage.js"; +import { AccuracySnapshotStorage, ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; export interface AccuracyTestConfig { systemPrompt?: string; @@ -33,7 +33,7 @@ export function describeAccuracyTests( const eachModel = describe.each(models); const eachSuite = describe.each(Object.keys(accuracyTestConfigs)); - eachModel(`$modelName`, function (model) { + eachModel(`$displayName`, function (model) { const mdbIntegration = setupMongoDBIntegrationTest(); const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); @@ -72,20 +72,18 @@ export function describeAccuracyTests( const result = await agent.prompt(promptForModel, model, toolsForModel); const timeAfterPrompt = Date.now(); const toolCalls = testMCPClient.getToolCalls(); - const toolCallingAccuracy = toolCallingAccuracyScorer(testConfig.expectedToolCalls, toolCalls); - const parameterMatchingAccuracy = parameterMatchingAccuracyScorer( - testConfig.expectedToolCalls, - toolCalls - ); + const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, toolCalls); const responseTime = timeAfterPrompt - timeBeforePrompt; await accuracySnapshotStorage.createSnapshotEntry({ + provider: model.provider, requestedModel: model.modelName, test: suiteName, prompt: testConfig.prompt, llmResponseTime: responseTime, - toolCallingAccuracy, - parameterAccuracy: parameterMatchingAccuracy, + toolCallingAccuracy: 
toolCallingAccuracy, + actualToolCalls: toolCalls, + expectedToolCalls: testConfig.expectedToolCalls, ...result, }); }); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index eb7f4b91..70b80435 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -6,13 +6,21 @@ import { ollama } from "ollama-ai-provider"; export interface Model

{ readonly modelName: string; + readonly provider: string; + readonly displayName: string; isAvailable(): boolean; getModel(): P; } export class OpenAIModel implements Model { + readonly provider = "OpenAI"; + constructor(readonly modelName: string) {} + get displayName(): string { + return `${this.provider} - ${this.modelName}`; + } + isAvailable(): boolean { return !!process.env.MDB_OPEN_AI_API_KEY; } @@ -25,8 +33,14 @@ export class OpenAIModel implements Model { } export class AzureOpenAIModel implements Model { + readonly provider = "Azure"; + constructor(readonly modelName: string) {} + get displayName(): string { + return `${this.provider} - ${this.modelName}`; + } + isAvailable(): boolean { return !!process.env.MDB_AZURE_OPEN_AI_API_KEY && !!process.env.MDB_AZURE_OPEN_AI_API_URL; } @@ -41,8 +55,14 @@ export class AzureOpenAIModel implements Model { } export class GeminiModel implements Model { + readonly provider = "Google"; + constructor(readonly modelName: string) {} + get displayName(): string { + return `${this.provider} - ${this.modelName}`; + } + isAvailable(): boolean { return !!process.env.MDB_GEMINI_API_KEY; } @@ -55,8 +75,14 @@ export class GeminiModel implements Model { } export class OllamaModel implements Model { + readonly provider = "Ollama"; + constructor(readonly modelName: string) {} + get displayName(): string { + return `${this.provider} - ${this.modelName}`; + } + isAvailable(): boolean { return true; } @@ -66,12 +92,7 @@ export class OllamaModel implements Model { } } -const ALL_TESTABLE_MODELS = [ - // new GeminiModel("gemini-2.0-flash"), - // new OpenAIModel("gpt-4o"), - new AzureOpenAIModel("gpt-4o"), - // new OllamaModel("qwen3:1.7b"), -]; +const ALL_TESTABLE_MODELS = [new AzureOpenAIModel("gpt-4o")]; export type TestableModels = ReturnType; From 496acc76fbb425aa2009c5b88d035e7e63df5ff6 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 16:13:58 +0200 Subject: [PATCH 31/47] chore: account for types moved around --- 
tests/accuracy/collection-storage-size.test.ts | 2 +- tests/accuracy/create-collection.test.ts | 2 +- tests/accuracy/drop-collection.test.ts | 2 +- tests/accuracy/drop-database.test.ts | 2 +- tests/accuracy/logs.test.ts | 2 +- .../sdk/accuracy-snapshot-storage/snapshot-storage.ts | 3 +-- tests/accuracy/sdk/accuracy-testing-client.ts | 4 ++-- 7 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/accuracy/collection-storage-size.test.ts b/tests/accuracy/collection-storage-size.test.ts index 751b84d6..dbb458e1 100644 --- a/tests/accuracy/collection-storage-size.test.ts +++ b/tests/accuracy/collection-storage-size.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; +import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function callsCollectionStorageSize(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { return { diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/create-collection.test.ts index ab468a62..d8a6266f 100644 --- a/tests/accuracy/create-collection.test.ts +++ b/tests/accuracy/create-collection.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; +import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/drop-collection.test.ts index e51494b7..89f9cb70 100644 --- 
a/tests/accuracy/drop-collection.test.ts +++ b/tests/accuracy/drop-collection.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; +import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/drop-database.test.ts index 08ffe640..0518d982 100644 --- a/tests/accuracy/drop-database.test.ts +++ b/tests/accuracy/drop-database.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; +import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { return { diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts index afd2a697..4ca148b9 100644 --- a/tests/accuracy/logs.test.ts +++ b/tests/accuracy/logs.test.ts @@ -1,7 +1,7 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-scorers.js"; +import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig { return { diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts 
b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index b254787c..2f9c432a 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -1,12 +1,11 @@ import z from "zod"; const ExpectedToolCallSchema = z.object({ - toolCallId: z.string(), toolName: z.string(), parameters: z.record(z.string(), z.unknown()), }); -const ActualToolCallSchema = ExpectedToolCallSchema.omit({ toolCallId: undefined }); +const ActualToolCallSchema = ExpectedToolCallSchema.extend({ toolCallId: z.string() }); export type ExpectedToolCall = z.infer; export type ActualToolCall = z.infer; diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts index 8c5f27ad..4a8ad279 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -5,7 +5,7 @@ import { experimental_createMCPClient as createMCPClient, tool as createVercelTo import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; -import { ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; +import { ActualToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; const __dirname = fileURLToPath(import.meta.url); const distPath = path.join(__dirname, "..", "..", "..", "..", "dist"); @@ -16,7 +16,7 @@ export type MockedTools = Record; export class AccuracyTestingClient { private mockedTools: MockedTools = {}; - private recordedToolCalls: ExpectedToolCall[] = []; + private recordedToolCalls: ActualToolCall[] = []; private constructor(private readonly vercelMCPClient: Awaited>) {} async close() { From c5ead9d52590fa64a5124e4bbbfa8d719703d97b Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 21:14:01 +0200 Subject: [PATCH 32/47] chore: adds accuracyRunStatus to snapshot entries The new field 
`accuracyRunStatus` is supposed to help guard against cases where jest might fail in between, maybe due to LLM rate limit errors or something else, and we then have a partially saved state of an accuracy run. With the new field `accuracyRunStatus` we should be able to safely look for last runs where `accuracyRunStatus` is done and have complete state of accuracy snapshot. --- scripts/mark-accuracy-run-finished.ts | 7 ++++++ scripts/run-accuracy-tests.sh | 23 ++++++++++++++++++- .../mdb-snapshot-storage.ts | 15 +++++++++++- .../snapshot-storage.ts | 13 +++++++++-- 4 files changed, 54 insertions(+), 4 deletions(-) create mode 100644 scripts/mark-accuracy-run-finished.ts diff --git a/scripts/mark-accuracy-run-finished.ts b/scripts/mark-accuracy-run-finished.ts new file mode 100644 index 00000000..ad3e3530 --- /dev/null +++ b/scripts/mark-accuracy-run-finished.ts @@ -0,0 +1,7 @@ +import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; + +console.time(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as finished in`); +const storage = await getAccuracySnapshotStorage(); +await storage.accuracyRunFinished(); +await storage.close(); +console.timeEnd(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as finished in`); diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index 979f49e1..20a16591 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -2,6 +2,27 @@ # Variables necessary for the accuracy test runs export MDB_ACCURACY_RUN_ID=$(npx uuid v4) +# For providing access tokens for different LLM providers +# export MDB_OPEN_AI_API_KEY="" +# export MDB_GEMINI_API_KEY="" +# export MDB_AZURE_OPEN_AI_API_KEY="" +# export MDB_AZURE_OPEN_AI_API_URL="" + +# For providing a mongodb based storage to store accuracy snapshots +# export MDB_ACCURACY_MDB_URL="" +# export MDB_ACCURACY_MDB_DB="" +# export MDB_ACCURACY_MDB_COLLECTION="" + +# By default we 
run all the tests under tests/accuracy folder unless a path is +# specified in the command line. Such as: +# npm run test:accuracy -- tests/accuracy/some-test.test.ts TEST_PATH_PATTERN="${1:-tests/accuracy}" shift || true -node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern "$TEST_PATH_PATTERN" "$@" \ No newline at end of file +node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern "$TEST_PATH_PATTERN" "$@" + +# Each test run submits an accuracy snapshot entry for each prompt with the +# accuracyRunStatus: "in-progress". When all the tests are done and jest exits +# with an exit code of 0, we can safely mark accuracy run as finished. +if [ $? -eq 0 ]; then + npx tsx scripts/mark-accuracy-run-finished.ts +fi \ No newline at end of file diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index 48aac4e8..c1e9ec5a 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -1,5 +1,10 @@ import { Collection, MongoClient } from "mongodb"; -import { AccuracySnapshotEntry, AccuracySnapshotEntrySchema, AccuracySnapshotStorage } from "./snapshot-storage.js"; +import { + AccuracyRunStatus, + AccuracySnapshotEntry, + AccuracySnapshotEntrySchema, + AccuracySnapshotStorage, +} from "./snapshot-storage.js"; export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { private readonly client: MongoClient; @@ -46,6 +51,7 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { ...snapshotEntry, commitSHA: this.commitSHA, accuracyRunId: this.accuracyRunId, + accuracyRunStatus: AccuracyRunStatus.InProgress, createdOn: Date.now(), }; await this.snapshotCollection.insertOne(snapshotWithMeta); @@ -70,6 +76,13 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { return 
AccuracySnapshotEntrySchema.array().parse(snapshotEntries); } + async accuracyRunFinished(): Promise { + await this.snapshotCollection.updateMany( + { accuracyRunId: this.accuracyRunId }, + { $set: { accuracyRunStatus: AccuracyRunStatus.Done } } + ); + } + static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage { const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL; const database = process.env.MDB_ACCURACY_MDB_DB; diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index 2f9c432a..f77c4d79 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -4,15 +4,22 @@ const ExpectedToolCallSchema = z.object({ toolName: z.string(), parameters: z.record(z.string(), z.unknown()), }); +export type ExpectedToolCall = z.infer; const ActualToolCallSchema = ExpectedToolCallSchema.extend({ toolCallId: z.string() }); - -export type ExpectedToolCall = z.infer; export type ActualToolCall = z.infer; +export const AccuracyRunStatus = { + Done: "done", + InProgress: "in-progress", +} as const; + export const AccuracySnapshotEntrySchema = z.object({ // Git and meta information for snapshot entries accuracyRunId: z.string(), + accuracyRunStatus: z + .enum([AccuracyRunStatus.Done, AccuracyRunStatus.InProgress]) + .default(AccuracyRunStatus.InProgress), createdOn: z.number(), commitSHA: z.string(), // Accuracy info @@ -60,5 +67,7 @@ export interface AccuracySnapshotStorage { getLatestSnapshotsForCommit(commit: string): Promise; + accuracyRunFinished(): Promise; + close(): Promise; } From b54cf14d5d5f4e1fdfdbe3683137267d412ecad7 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 23:35:23 +0200 Subject: [PATCH 33/47] chore: add disk based accuracy storage for local runs --- .gitignore | 1 + .../disk-snapshot-storage.ts | 122 ++++++++++++++++++ 
.../get-snapshot-storage.ts | 6 +- .../mdb-snapshot-storage.ts | 18 +-- 4 files changed, 137 insertions(+), 10 deletions(-) create mode 100644 tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts diff --git a/.gitignore b/.gitignore index 4e3f7a54..2ac1f762 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ state.json tests/tmp coverage +.accuracy-snapshots diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts new file mode 100644 index 00000000..668e130a --- /dev/null +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts @@ -0,0 +1,122 @@ +import path from "path"; +import fs from "fs/promises"; +import { fileURLToPath } from "url"; +import { + AccuracyRunStatus, + AccuracySnapshotEntry, + AccuracySnapshotEntrySchema, + AccuracySnapshotStorage, +} from "./snapshot-storage.js"; +const __dirname = fileURLToPath(import.meta.url); +const rootDir = path.resolve(__dirname, "..", "..", "..", "..", ".."); +const snapshotsDir = path.resolve(rootDir, ".accuracy-snapshots"); +export const snapshotFilePath = path.resolve(snapshotsDir, "snapshots.json"); + +export class DiskSnapshotStorage implements AccuracySnapshotStorage { + private constructor( + private readonly accuracyRunId: string, + private readonly commitSHA: string + ) {} + + async createSnapshotEntry( + snapshotEntry: Pick< + AccuracySnapshotEntry, + | "provider" + | "requestedModel" + | "test" + | "prompt" + | "toolCallingAccuracy" + | "expectedToolCalls" + | "actualToolCalls" + | "llmResponseTime" + | "tokensUsage" + | "respondingModel" + | "text" + | "messages" + > + ): Promise { + const snapshotWithMeta: AccuracySnapshotEntry = { + ...snapshotEntry, + commitSHA: this.commitSHA, + accuracyRunId: this.accuracyRunId, + accuracyRunStatus: AccuracyRunStatus.InProgress, + createdOn: Date.now(), + }; + + await this.appendAccuracySnapshot(snapshotWithMeta); + } + + async 
getLatestSnapshotsForCommit(commit: string): Promise { + const snapshot = await this.readSnapshot(); + const entries = snapshot + .filter((entry) => { + return entry.commitSHA === commit && entry.accuracyRunStatus === AccuracyRunStatus.Done; + }) + .sort((a, b) => b.createdOn - a.createdOn); + const latestRunId = entries[0]?.accuracyRunId; + return latestRunId ? snapshot.filter((entry) => entry.accuracyRunId === latestRunId) : []; + } + + async accuracyRunFinished(): Promise { + const snapshot = await this.readSnapshot(); + const updatedSnapshot = snapshot.map((entry) => { + if (entry.accuracyRunId === this.accuracyRunId) { + return { + ...entry, + accuracyRunStatus: AccuracyRunStatus.Done, + }; + } + + return entry; + }); + await this.writeSnapshot(updatedSnapshot); + } + + close(): Promise { + return Promise.resolve(); + } + + private async appendAccuracySnapshot(entry: AccuracySnapshotEntry): Promise { + for (let attempt = 0; attempt < 5; attempt++) { + try { + const snapshot = await this.readSnapshot(); + snapshot.unshift(entry); + await this.writeSnapshot(snapshot); + return; + } catch (e) { + if (attempt < 4) { + await this.waitFor(100 + Math.random() * 200); + } else { + throw e; + } + } + } + } + + private async writeSnapshot(snapshot: AccuracySnapshotEntry[]): Promise { + const tmp = `${snapshotFilePath}~${Date.now()}`; + await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2)); + await fs.rename(tmp, snapshotFilePath); + } + + private async readSnapshot(): Promise { + try { + const raw = await fs.readFile(snapshotFilePath, "utf8"); + return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw)); + } catch (e: unknown) { + if ((e as { code: string }).code === "ENOENT") { + return []; + } + throw e; + } + } + + private waitFor(ms: number) { + return new Promise((resolve) => setTimeout(resolve, ms)); + } + + static async getStorage(commitSHA: string, accuracyRunId: string) { + await fs.mkdir(snapshotsDir, { recursive: true }); + return new 
DiskSnapshotStorage(commitSHA, accuracyRunId); + } +} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts index 44c8ae3d..020afc79 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts @@ -1,4 +1,5 @@ import { getCommitSHA } from "../git-info.js"; +import { DiskSnapshotStorage } from "./disk-snapshot-storage.js"; import { MongoDBSnapshotStorage } from "./mdb-snapshot-storage.js"; import { AccuracySnapshotStorage } from "./snapshot-storage.js"; @@ -15,5 +16,8 @@ export async function getAccuracySnapshotStorage(): Promise { - const latestRunId = await this.getLastRunIdForCommit(commit); + const latestRunId = await this.getLatestAccuracyRunForCommit(commit); return latestRunId ? this.getSnapshotEntriesForRunId(latestRunId) : []; } - private async getLastRunIdForCommit(commit: string): Promise { + private async getLatestAccuracyRunForCommit(commit: string): Promise { const document = await this.snapshotCollection.findOne( - { commit: commit }, + { commit: commit, accuracyRunStatus: AccuracyRunStatus.Done }, { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } } ); @@ -83,12 +83,16 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { ); } - static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage { + async close(): Promise { + await this.client.close(); + } + + static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage | null { const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL; const database = process.env.MDB_ACCURACY_MDB_DB; const collection = process.env.MDB_ACCURACY_MDB_COLLECTION; if (!mongodbUrl || !database || !collection) { - throw new Error("Cannot create MongoDBAccuracySnapshot storage without relevant configuration provided"); + return null; } return new 
MongoDBSnapshotStorage({ @@ -99,8 +103,4 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { accuracyRunId, }); } - - async close(): Promise { - await this.client.close(); - } } From 188aebcf5b99dfa5999aed386ad81989f3b17ac6 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Tue, 8 Jul 2025 23:40:07 +0200 Subject: [PATCH 34/47] chore: revert changes done to any of the src files --- src/tools/mongodb/create/insertMany.ts | 28 +++---- src/tools/mongodb/delete/deleteMany.ts | 20 ++--- .../mongodb/metadata/collectionSchema.ts | 59 +++++++-------- src/tools/mongodb/metadata/listCollections.ts | 45 +++++------ src/tools/mongodb/metadata/listDatabases.ts | 23 ++---- src/tools/mongodb/read/collectionIndexes.ts | 74 ++++++------------- src/tools/mongodb/read/find.ts | 34 ++++----- 7 files changed, 110 insertions(+), 173 deletions(-) diff --git a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index c92ee4c3..f28d79d5 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -3,21 +3,6 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; -export function insertManyResponse(collection: string, insertedCount: number, insertedIds: unknown[]): CallToolResult { - return { - content: [ - { - text: `Inserted \`${insertedCount}\` document(s) into collection "${collection}"`, - type: "text", - }, - { - text: `Inserted IDs: ${insertedIds.join(", ")}`, - type: "text", - }, - ], - }; -} - export class InsertManyTool extends MongoDBToolBase { protected name = "insert-many"; protected description = "Insert an array of documents into a MongoDB collection"; @@ -39,6 +24,17 @@ export class InsertManyTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const result = await provider.insertMany(database, collection, documents); - 
return insertManyResponse(collection, result.insertedCount, Object.values(result.insertedIds)); + return { + content: [ + { + text: `Inserted \`${result.insertedCount}\` document(s) into collection "${collection}"`, + type: "text", + }, + { + text: `Inserted IDs: ${Object.values(result.insertedIds).join(", ")}`, + type: "text", + }, + ], + }; } } diff --git a/src/tools/mongodb/delete/deleteMany.ts b/src/tools/mongodb/delete/deleteMany.ts index 4bc8eba0..0257d167 100644 --- a/src/tools/mongodb/delete/deleteMany.ts +++ b/src/tools/mongodb/delete/deleteMany.ts @@ -4,17 +4,6 @@ import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; import { checkIndexUsage } from "../../../helpers/indexCheck.js"; -export function deleteManyResponse(collection: string, delectedCount: number): CallToolResult { - return { - content: [ - { - text: `Deleted \`${delectedCount}\` document(s) from collection "${collection}"`, - type: "text", - }, - ], - }; -} - export class DeleteManyTool extends MongoDBToolBase { protected name = "delete-many"; protected description = "Removes all documents that match the filter from a MongoDB collection"; @@ -56,6 +45,13 @@ export class DeleteManyTool extends MongoDBToolBase { const result = await provider.deleteMany(database, collection, filter); - return deleteManyResponse(collection, result.deletedCount); + return { + content: [ + { + text: `Deleted \`${result.deletedCount}\` document(s) from collection "${collection}"`, + type: "text", + }, + ], + }; } } diff --git a/src/tools/mongodb/metadata/collectionSchema.ts b/src/tools/mongodb/metadata/collectionSchema.ts index 71ed5256..f0145323 100644 --- a/src/tools/mongodb/metadata/collectionSchema.ts +++ b/src/tools/mongodb/metadata/collectionSchema.ts @@ -1,38 +1,7 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } 
from "../../tool.js"; -import { getSimplifiedSchema, SimplifiedSchema } from "mongodb-schema"; - -export function collectionSchemaResponse( - database: string, - collection: string, - schema: SimplifiedSchema -): CallToolResult { - const fieldsCount = Object.entries(schema).length; - if (fieldsCount === 0) { - return { - content: [ - { - text: `Could not deduce the schema for "${database}.${collection}". This may be because it doesn't exist or is empty.`, - type: "text", - }, - ], - }; - } - - return { - content: [ - { - text: `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`, - type: "text", - }, - { - text: JSON.stringify(schema), - type: "text", - }, - ], - }; -} +import { getSimplifiedSchema } from "mongodb-schema"; export class CollectionSchemaTool extends MongoDBToolBase { protected name = "collection-schema"; @@ -45,6 +14,30 @@ export class CollectionSchemaTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const documents = await provider.find(database, collection, {}, { limit: 5 }).toArray(); const schema = await getSimplifiedSchema(documents); - return collectionSchemaResponse(database, collection, schema); + + const fieldsCount = Object.entries(schema).length; + if (fieldsCount === 0) { + return { + content: [ + { + text: `Could not deduce the schema for "${database}.${collection}". 
This may be because it doesn't exist or is empty.`, + type: "text", + }, + ], + }; + } + + return { + content: [ + { + text: `Found ${fieldsCount} fields in the schema for "${database}.${collection}"`, + type: "text", + }, + { + text: JSON.stringify(schema), + type: "text", + }, + ], + }; } } diff --git a/src/tools/mongodb/metadata/listCollections.ts b/src/tools/mongodb/metadata/listCollections.ts index f676964f..193d0465 100644 --- a/src/tools/mongodb/metadata/listCollections.ts +++ b/src/tools/mongodb/metadata/listCollections.ts @@ -2,28 +2,6 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; -export function listCollectionsResponse(database: string, collections: string[]): CallToolResult { - if (collections.length === 0) { - return { - content: [ - { - type: "text", - text: `No collections found for database "${database}". To create a collection, use the "create-collection" tool.`, - }, - ], - }; - } - - return { - content: collections.map((collection) => { - return { - text: `Name: "${collection}"`, - type: "text", - }; - }), - }; -} - export class ListCollectionsTool extends MongoDBToolBase { protected name = "list-collections"; protected description = "List all collections for a given database"; @@ -37,9 +15,24 @@ export class ListCollectionsTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const collections = await provider.listCollections(database); - return listCollectionsResponse( - database, - collections.map((collection) => `${collection.name}`) - ); + if (collections.length === 0) { + return { + content: [ + { + type: "text", + text: `No collections found for database "${database}". 
To create a collection, use the "create-collection" tool.`, + }, + ], + }; + } + + return { + content: collections.map((collection) => { + return { + text: `Name: "${collection.name}"`, + type: "text", + }; + }), + }; } } diff --git a/src/tools/mongodb/metadata/listDatabases.ts b/src/tools/mongodb/metadata/listDatabases.ts index 1d1ae4d2..fe324f07 100644 --- a/src/tools/mongodb/metadata/listDatabases.ts +++ b/src/tools/mongodb/metadata/listDatabases.ts @@ -3,17 +3,6 @@ import { MongoDBToolBase } from "../mongodbTool.js"; import * as bson from "bson"; import { OperationType } from "../../tool.js"; -export function listDatabasesResponse(databases: { name: string; sizeOnDisk: string }[]): CallToolResult { - return { - content: databases.map((db) => { - return { - text: `Name: ${db.name}, Size: ${db.sizeOnDisk} bytes`, - type: "text", - }; - }), - }; -} - export class ListDatabasesTool extends MongoDBToolBase { protected name = "list-databases"; protected description = "List all databases for a MongoDB connection"; @@ -24,13 +13,13 @@ export class ListDatabasesTool extends MongoDBToolBase { const provider = await this.ensureConnected(); const dbs = (await provider.listDatabases("")).databases as { name: string; sizeOnDisk: bson.Long }[]; - return listDatabasesResponse( - dbs.map((db) => { + return { + content: dbs.map((db) => { return { - name: db.name, - sizeOnDisk: db.sizeOnDisk.toString(), + text: `Name: ${db.name}, Size: ${db.sizeOnDisk.toString()} bytes`, + type: "text", }; - }) - ); + }), + }; } } diff --git a/src/tools/mongodb/read/collectionIndexes.ts b/src/tools/mongodb/read/collectionIndexes.ts index 71ade728..cc0a141b 100644 --- a/src/tools/mongodb/read/collectionIndexes.ts +++ b/src/tools/mongodb/read/collectionIndexes.ts @@ -2,44 +2,6 @@ import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { DbOperationArgs, MongoDBToolBase } from "../mongodbTool.js"; import { ToolArgs, OperationType } from "../../tool.js"; -export function 
collectionIndexesResponse({ - database, - collection, - indexes = [], - namespaceNotFound, -}: { - database: string; - collection: string; - indexes?: { name: string; key: string }[]; - namespaceNotFound?: boolean; -}): CallToolResult { - if (namespaceNotFound) { - return { - content: [ - { - text: `The indexes for "${database}.${collection}" cannot be determined because the collection does not exist.`, - type: "text", - }, - ], - }; - } - - return { - content: [ - { - text: `Found ${indexes.length} indexes in the collection "${collection}":`, - type: "text", - }, - ...(indexes.map((indexDefinition) => { - return { - text: `Name "${indexDefinition.name}", definition: ${JSON.stringify(indexDefinition.key)}`, - type: "text", - }; - }) as { text: string; type: "text" }[]), - ], - }; -} - export class CollectionIndexesTool extends MongoDBToolBase { protected name = "collection-indexes"; protected description = "Describe the indexes for a collection"; @@ -49,14 +11,21 @@ export class CollectionIndexesTool extends MongoDBToolBase { protected async execute({ database, collection }: ToolArgs): Promise { const provider = await this.ensureConnected(); const indexes = await provider.getIndexes(database, collection); - return collectionIndexesResponse({ - database, - collection, - indexes: indexes.map((index) => ({ - name: `${index.name}`, - key: JSON.stringify(index.key), - })), - }); + + return { + content: [ + { + text: `Found ${indexes.length} indexes in the collection "${collection}":`, + type: "text", + }, + ...(indexes.map((indexDefinition) => { + return { + text: `Name "${indexDefinition.name}", definition: ${JSON.stringify(indexDefinition.key)}`, + type: "text", + }; + }) as { text: string; type: "text" }[]), + ], + }; } protected handleError( @@ -64,11 +33,14 @@ export class CollectionIndexesTool extends MongoDBToolBase { args: ToolArgs ): Promise | CallToolResult { if (error instanceof Error && "codeName" in error && error.codeName === "NamespaceNotFound") { - 
return collectionIndexesResponse({ - database: args.database, - collection: args.collection, - namespaceNotFound: true, - }); + return { + content: [ + { + text: `The indexes for "${args.database}.${args.collection}" cannot be determined because the collection does not exist.`, + type: "text", + }, + ], + }; } return super.handleError(error, args); diff --git a/src/tools/mongodb/read/find.ts b/src/tools/mongodb/read/find.ts index ac864b0a..97c90e08 100644 --- a/src/tools/mongodb/read/find.ts +++ b/src/tools/mongodb/read/find.ts @@ -22,23 +22,6 @@ export const FindArgs = { .describe("A document, describing the sort order, matching the syntax of the sort argument of cursor.sort()"), }; -export function findResponse(collection: string, documents: unknown[]): CallToolResult { - return { - content: [ - { - text: `Found ${documents.length} documents in the collection "${collection}":`, - type: "text", - }, - ...documents.map<{ type: "text"; text: string }>((doc) => { - return { - text: EJSON.stringify(doc), - type: "text", - }; - }), - ], - }; -} - export class FindTool extends MongoDBToolBase { protected name = "find"; protected description = "Run a find query against a MongoDB collection"; @@ -67,6 +50,21 @@ export class FindTool extends MongoDBToolBase { const documents = await provider.find(database, collection, filter, { projection, limit, sort }).toArray(); - return findResponse(collection, documents); + const content: Array<{ text: string; type: "text" }> = [ + { + text: `Found ${documents.length} documents in the collection "${collection}":`, + type: "text", + }, + ...documents.map((doc) => { + return { + text: EJSON.stringify(doc), + type: "text", + } as { text: string; type: "text" }; + }), + ]; + + return { + content, + }; } } From b309fb417b475da8a097cc41a3e06817bc869dfb Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Wed, 9 Jul 2025 00:02:41 +0200 Subject: [PATCH 35/47] chore: handle test failures and appropriately mark them as failed in 
accuracyRunStatus --- scripts/mark-accuracy-run-finished.ts | 20 +++++++++++++--- scripts/run-accuracy-tests.sh | 7 ++++-- tests/accuracy/collection-schema.test.ts | 24 +------------------ .../disk-snapshot-storage.ts | 5 ++-- .../mdb-snapshot-storage.ts | 5 ++-- .../snapshot-storage.ts | 7 ++++-- 6 files changed, 34 insertions(+), 34 deletions(-) diff --git a/scripts/mark-accuracy-run-finished.ts b/scripts/mark-accuracy-run-finished.ts index ad3e3530..8c1a397c 100644 --- a/scripts/mark-accuracy-run-finished.ts +++ b/scripts/mark-accuracy-run-finished.ts @@ -1,7 +1,21 @@ import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; +import { + AccuracyRunStatus, + AccuracyRunStatuses, +} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; -console.time(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as finished in`); +let status: AccuracyRunStatuses | undefined; +if (process.env.MDB_ACCURACY_RUN_STATUS === "done") { + status = AccuracyRunStatus.Done; +} else if (process.env.MDB_ACCURACY_RUN_STATUS === "failed") { + status = AccuracyRunStatus.Failed; +} else { + console.info(`Unknown status - ${process.env.MDB_ACCURACY_RUN_STATUS}, will not update accuracy run.`); + process.exit(1); +} + +console.time(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as ${status} in`); const storage = await getAccuracySnapshotStorage(); -await storage.accuracyRunFinished(); +await storage.updateAccuracyRunStatus(status); await storage.close(); -console.timeEnd(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as finished in`); +console.timeEnd(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as ${status} in`); diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index 20a16591..38d11a99 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -22,7 +22,10 @@ node --experimental-vm-modules 
node_modules/jest/bin/jest.js --testPathPattern " # Each test run submits an accuracy snapshot entry for each prompt with the # accuracyRunStatus: "in-progress". When all the tests are done and jest exits -# with an exit code of 0, we can safely mark accuracy run as finished. +# with an exit code of 0, we can safely mark accuracy run as finished otherwise +# failed. if [ $? -eq 0 ]; then - npx tsx scripts/mark-accuracy-run-finished.ts + MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/mark-accuracy-run-finished.ts +else + MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/mark-accuracy-run-finished.ts fi \ No newline at end of file diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts index f81273ea..2866e709 100644 --- a/tests/accuracy/collection-schema.test.ts +++ b/tests/accuracy/collection-schema.test.ts @@ -1,34 +1,12 @@ import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { collectionSchemaResponse } from "../../src/tools/mongodb/metadata/collectionSchema.js"; -import { getSimplifiedSchema } from "mongodb-schema"; function callsCollectionSchema(prompt: string): AccuracyTestConfig { return { injectConnectedAssumption: true, prompt: prompt, - mockedTools: { - "collection-schema": async function collectionSchema() { - return collectionSchemaResponse( - "db1", - "coll1", - await getSimplifiedSchema([ - { - name: "Sample name1", - dob: "28.11.2001", - location: "NY", - }, - { - name: "Sample name1", - dob: "28.11.2001", - location: "NY", - title: "Dr.", - }, - ]) - ); - }, - }, + mockedTools: {}, expectedToolCalls: [ { toolName: "collection-schema", diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts index 668e130a..58bc396b 100644 --- 
a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts @@ -3,6 +3,7 @@ import fs from "fs/promises"; import { fileURLToPath } from "url"; import { AccuracyRunStatus, + AccuracyRunStatuses, AccuracySnapshotEntry, AccuracySnapshotEntrySchema, AccuracySnapshotStorage, @@ -57,13 +58,13 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { return latestRunId ? snapshot.filter((entry) => entry.accuracyRunId === latestRunId) : []; } - async accuracyRunFinished(): Promise { + async updateAccuracyRunStatus(status: AccuracyRunStatuses) { const snapshot = await this.readSnapshot(); const updatedSnapshot = snapshot.map((entry) => { if (entry.accuracyRunId === this.accuracyRunId) { return { ...entry, - accuracyRunStatus: AccuracyRunStatus.Done, + accuracyRunStatus: status, }; } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index bb506ab2..193ba9f9 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -1,6 +1,7 @@ import { Collection, MongoClient } from "mongodb"; import { AccuracyRunStatus, + AccuracyRunStatuses, AccuracySnapshotEntry, AccuracySnapshotEntrySchema, AccuracySnapshotStorage, @@ -76,10 +77,10 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); } - async accuracyRunFinished(): Promise { + async updateAccuracyRunStatus(status: AccuracyRunStatuses) { await this.snapshotCollection.updateMany( { accuracyRunId: this.accuracyRunId }, - { $set: { accuracyRunStatus: AccuracyRunStatus.Done } } + { $set: { accuracyRunStatus: status } } ); } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts 
b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index f77c4d79..4daf1476 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -11,14 +11,17 @@ export type ActualToolCall = z.infer; export const AccuracyRunStatus = { Done: "done", + Failed: "failed", InProgress: "in-progress", } as const; +export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus]; + export const AccuracySnapshotEntrySchema = z.object({ // Git and meta information for snapshot entries accuracyRunId: z.string(), accuracyRunStatus: z - .enum([AccuracyRunStatus.Done, AccuracyRunStatus.InProgress]) + .enum([AccuracyRunStatus.Done, AccuracyRunStatus.Failed, AccuracyRunStatus.InProgress]) .default(AccuracyRunStatus.InProgress), createdOn: z.number(), commitSHA: z.string(), @@ -67,7 +70,7 @@ export interface AccuracySnapshotStorage { getLatestSnapshotsForCommit(commit: string): Promise; - accuracyRunFinished(): Promise; + updateAccuracyRunStatus(status: AccuracyRunStatuses): Promise; close(): Promise; } From 43493f3b3aa31e4c7a98abf6220cc4b55586ab3a Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Wed, 9 Jul 2025 17:55:05 +0200 Subject: [PATCH 36/47] chore: make snapshot storage independent of accuracyRunId and commitSHA --- scripts/mark-accuracy-run-finished.ts | 21 ------------ scripts/run-accuracy-tests.sh | 12 ++++--- scripts/update-accuracy-run-status.ts | 22 ++++++++++++ .../disk-snapshot-storage.ts | 24 ++++++------- .../get-snapshot-storage.ts | 5 +-- .../mdb-snapshot-storage.ts | 34 +++++++------------ .../snapshot-storage.ts | 8 +++-- tests/accuracy/sdk/describe-accuracy-tests.ts | 17 +++++++++- 8 files changed, 77 insertions(+), 66 deletions(-) delete mode 100644 scripts/mark-accuracy-run-finished.ts create mode 100644 scripts/update-accuracy-run-status.ts diff --git a/scripts/mark-accuracy-run-finished.ts 
b/scripts/mark-accuracy-run-finished.ts deleted file mode 100644 index 8c1a397c..00000000 --- a/scripts/mark-accuracy-run-finished.ts +++ /dev/null @@ -1,21 +0,0 @@ -import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; -import { - AccuracyRunStatus, - AccuracyRunStatuses, -} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; - -let status: AccuracyRunStatuses | undefined; -if (process.env.MDB_ACCURACY_RUN_STATUS === "done") { - status = AccuracyRunStatus.Done; -} else if (process.env.MDB_ACCURACY_RUN_STATUS === "failed") { - status = AccuracyRunStatus.Failed; -} else { - console.info(`Unknown status - ${process.env.MDB_ACCURACY_RUN_STATUS}, will not update accuracy run.`); - process.exit(1); -} - -console.time(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as ${status} in`); -const storage = await getAccuracySnapshotStorage(); -await storage.updateAccuracyRunStatus(status); -await storage.close(); -console.timeEnd(`Marked accuracy run id - ${process.env.MDB_ACCURACY_RUN_ID} as ${status} in`); diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index 38d11a99..a9a255f2 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -24,8 +24,12 @@ node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern " # accuracyRunStatus: "in-progress". When all the tests are done and jest exits # with an exit code of 0, we can safely mark accuracy run as finished otherwise # failed. -if [ $? -eq 0 ]; then - MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/mark-accuracy-run-finished.ts +JEST_EXIT_CODE=$? 
+if [ $JEST_EXIT_CODE -eq 0 ]; then + MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'done'" else - MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/mark-accuracy-run-finished.ts -fi \ No newline at end of file + MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'failed'" +fi + +# Preserve the original Jest exit code for CI +exit $JEST_EXIT_CODE \ No newline at end of file diff --git a/scripts/update-accuracy-run-status.ts b/scripts/update-accuracy-run-status.ts new file mode 100644 index 00000000..6d1a8bb8 --- /dev/null +++ b/scripts/update-accuracy-run-status.ts @@ -0,0 +1,22 @@ +import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; +import { + AccuracyRunStatus, + AccuracyRunStatuses, +} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; + +const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID; +const envAccuracyRunStatus = process.env.MDB_ACCURACY_RUN_STATUS; + +let status: AccuracyRunStatuses | undefined; +if ( + !envAccuracyRunId || + (envAccuracyRunStatus !== AccuracyRunStatus.Done && envAccuracyRunStatus !== AccuracyRunStatus.Failed) +) { + process.exit(1); +} + +console.time(`Marked accuracy run id - ${envAccuracyRunId} as ${status} in`); +const storage = await getAccuracySnapshotStorage(); +await storage.updateAccuracyRunStatus(envAccuracyRunId, envAccuracyRunStatus); +await storage.close(); +console.timeEnd(`Marked accuracy run id - ${envAccuracyRunId} as ${status} in`); diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts index 58bc396b..a4d2bea0 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts +++ 
b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts @@ -14,14 +14,11 @@ const snapshotsDir = path.resolve(rootDir, ".accuracy-snapshots"); export const snapshotFilePath = path.resolve(snapshotsDir, "snapshots.json"); export class DiskSnapshotStorage implements AccuracySnapshotStorage { - private constructor( - private readonly accuracyRunId: string, - private readonly commitSHA: string - ) {} - async createSnapshotEntry( snapshotEntry: Pick< AccuracySnapshotEntry, + | "accuracyRunId" + | "commitSHA" | "provider" | "requestedModel" | "test" @@ -38,8 +35,6 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { ): Promise { const snapshotWithMeta: AccuracySnapshotEntry = { ...snapshotEntry, - commitSHA: this.commitSHA, - accuracyRunId: this.accuracyRunId, accuracyRunStatus: AccuracyRunStatus.InProgress, createdOn: Date.now(), }; @@ -47,7 +42,7 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { await this.appendAccuracySnapshot(snapshotWithMeta); } - async getLatestSnapshotsForCommit(commit: string): Promise { + async getLatestSnapshotForCommit(commit: string): Promise { const snapshot = await this.readSnapshot(); const entries = snapshot .filter((entry) => { @@ -58,10 +53,15 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { return latestRunId ? 
snapshot.filter((entry) => entry.accuracyRunId === latestRunId) : []; } - async updateAccuracyRunStatus(status: AccuracyRunStatuses) { + async getSnapshotForAccuracyRun(accuracyRunId: string): Promise { + const snapshot = await this.readSnapshot(); + return snapshot.filter((entry) => entry.accuracyRunId === accuracyRunId); + } + + async updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses) { const snapshot = await this.readSnapshot(); const updatedSnapshot = snapshot.map((entry) => { - if (entry.accuracyRunId === this.accuracyRunId) { + if (entry.accuracyRunId === accuracyRunId) { return { ...entry, accuracyRunStatus: status, @@ -116,8 +116,8 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { return new Promise((resolve) => setTimeout(resolve, ms)); } - static async getStorage(commitSHA: string, accuracyRunId: string) { + static async getStorage() { await fs.mkdir(snapshotsDir, { recursive: true }); - return new DiskSnapshotStorage(commitSHA, accuracyRunId); + return new DiskSnapshotStorage(); } } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts index 020afc79..3bec4c53 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts @@ -16,8 +16,5 @@ export async function getAccuracySnapshotStorage(): Promise { const snapshotWithMeta: AccuracySnapshotEntry = { ...snapshotEntry, - commitSHA: this.commitSHA, - accuracyRunId: this.accuracyRunId, accuracyRunStatus: AccuracyRunStatus.InProgress, createdOn: Date.now(), }; await this.snapshotCollection.insertOne(snapshotWithMeta); } - async getLatestSnapshotsForCommit(commit: string): Promise { + async getLatestSnapshotForCommit(commit: string): Promise { const latestRunId = await this.getLatestAccuracyRunForCommit(commit); - return latestRunId ? 
this.getSnapshotEntriesForRunId(latestRunId) : []; + return latestRunId ? this.getSnapshotForAccuracyRun(latestRunId) : []; + } + + async getSnapshotForAccuracyRun(accuracyRunId: string): Promise { + const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray(); + return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); } private async getLatestAccuracyRunForCommit(commit: string): Promise { @@ -72,14 +69,9 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { return document?.accuracyRunId ? `${document?.accuracyRunId}` : undefined; } - private async getSnapshotEntriesForRunId(accuracyRunId: string): Promise { - const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray(); - return AccuracySnapshotEntrySchema.array().parse(snapshotEntries); - } - - async updateAccuracyRunStatus(status: AccuracyRunStatuses) { + async updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses) { await this.snapshotCollection.updateMany( - { accuracyRunId: this.accuracyRunId }, + { accuracyRunId: accuracyRunId }, { $set: { accuracyRunStatus: status } } ); } @@ -88,7 +80,7 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { await this.client.close(); } - static getStorage(commitSHA: string, accuracyRunId: string): MongoDBSnapshotStorage | null { + static getStorage(): MongoDBSnapshotStorage | null { const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL; const database = process.env.MDB_ACCURACY_MDB_DB; const collection = process.env.MDB_ACCURACY_MDB_COLLECTION; @@ -100,8 +92,6 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { mongodbUrl, database, collection, - commitSHA, - accuracyRunId, }); } } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index 4daf1476..e7833456 100644 --- 
a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -53,6 +53,8 @@ export interface AccuracySnapshotStorage { createSnapshotEntry( snapshotEntry: Pick< AccuracySnapshotEntry, + | "accuracyRunId" + | "commitSHA" | "provider" | "requestedModel" | "test" @@ -68,9 +70,11 @@ export interface AccuracySnapshotStorage { > ): Promise; - getLatestSnapshotsForCommit(commit: string): Promise; + getLatestSnapshotForCommit(commit: string): Promise; - updateAccuracyRunStatus(status: AccuracyRunStatuses): Promise; + getSnapshotForAccuracyRun(accuracyRunId: string): Promise; + + updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses): Promise; close(): Promise; } diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index f472c7f2..1dd6d971 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -5,6 +5,7 @@ import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/ import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js"; import { AccuracySnapshotStorage, ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; +import { getCommitSHA } from "./git-info.js"; export interface AccuracyTestConfig { systemPrompt?: string; @@ -26,8 +27,12 @@ export function describeAccuracyTests( [suiteName: string]: AccuracyTestConfig[]; } ) { + if (!process.env.MDB_ACCURACY_RUN_ID) { + throw new Error("MDB_ACCURACY_RUN_ID env variable is required for accuracy test runs!"); + } + if (!models.length) { - throw new Error("No models available to test!"); + throw new Error("No models available to test. 
Ensure that the API keys are properly setup!"); } const eachModel = describe.each(models); @@ -37,11 +42,19 @@ export function describeAccuracyTests( const mdbIntegration = setupMongoDBIntegrationTest(); const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); + const accuracyRunId: string = `${process.env.MDB_ACCURACY_RUN_ID}`; + let commitSHA: string; let accuracySnapshotStorage: AccuracySnapshotStorage; let testMCPClient: AccuracyTestingClient; let agent: VercelAgent; beforeAll(async () => { + const retrievedCommitSHA = await getCommitSHA(); + if (!retrievedCommitSHA) { + throw new Error("Could not derive commitSHA, exiting accuracy tests!"); + } + + commitSHA = retrievedCommitSHA; accuracySnapshotStorage = await getAccuracySnapshotStorage(); testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); agent = getVercelToolCallingAgent(); @@ -76,6 +89,8 @@ export function describeAccuracyTests( const responseTime = timeAfterPrompt - timeBeforePrompt; await accuracySnapshotStorage.createSnapshotEntry({ + accuracyRunId, + commitSHA, provider: model.provider, requestedModel: model.modelName, test: suiteName, From cb46c43cdf5d73b0a11123eb3cc06c22b19858fe Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Wed, 9 Jul 2025 18:18:53 +0200 Subject: [PATCH 37/47] chore: bail on first failure and add some explanation for update-accuracy-status script --- scripts/run-accuracy-tests.sh | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index a9a255f2..e009661f 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -18,18 +18,30 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # npm run test:accuracy -- tests/accuracy/some-test.test.ts TEST_PATH_PATTERN="${1:-tests/accuracy}" shift || true -node --experimental-vm-modules node_modules/jest/bin/jest.js --testPathPattern "$TEST_PATH_PATTERN" 
"$@" +node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPattern "$TEST_PATH_PATTERN" "$@" -# Each test run submits an accuracy snapshot entry for each prompt with the -# accuracyRunStatus: "in-progress". When all the tests are done and jest exits -# with an exit code of 0, we can safely mark accuracy run as finished otherwise -# failed. +# Preserving the exit code from test run to correctly notify in the CI +# environments when the tests fail. JEST_EXIT_CODE=$? + +# Each test run submits an accuracy snapshot entry with the accuracyRunStatus: +# "in-progress". When all the tests are done and jest exits with an exit code of +# 0, we can safely mark accuracy run as finished otherwise failed. + +# This "outside-the-tests-status-update" is arising out of the fact that each +# test suite stores their own accuracy run data in the storage and this setup +# might lead to data inconsistency when the tests fail. To overcome that each +# accuracy snapshot entry has a status which by default is "in-progress" and is +# updated when the tests either pass (all our accuracy tests are supposed to +# pass unless some errors occurs during the test runs), or fail. + +# This is necessary when comparing one accuracy run with another as we wouldn't +# want to compare against an incomplete run. if [ $JEST_EXIT_CODE -eq 0 ]; then MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'done'" else MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'failed'" fi -# Preserve the original Jest exit code for CI + exit $JEST_EXIT_CODE \ No newline at end of file From 9db296ee45d7b97e4982d3be6b367d563ad45a96 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 00:17:08 +0200 Subject: [PATCH 38/47] chore: refactor to make tests writing simpler and other QOL improvements. 1. 
Removes unnecessary suite description from tests 2. Removes the test suite name from the storage as well 3. Centralize the constants used everywhere in the SDK 4. Adds clarifying comments and docs wherever necessary 5. Write tests for accuracy-scorer --- .gitignore | 3 +- scripts/update-accuracy-run-status.ts | 10 +- tests/accuracy/aggregate.test.ts | 26 +-- tests/accuracy/collection-indexes.test.ts | 20 +- tests/accuracy/collection-schema.test.ts | 14 +- .../accuracy/collection-storage-size.test.ts | 35 ++- tests/accuracy/count.test.ts | 36 ++-- tests/accuracy/create-collection.test.ts | 48 ++--- tests/accuracy/create-index.test.ts | 28 ++- tests/accuracy/db-stats.test.ts | 10 +- tests/accuracy/delete-many.test.ts | 20 +- tests/accuracy/drop-collection.test.ts | 90 ++++---- tests/accuracy/drop-database.test.ts | 38 ++-- tests/accuracy/explain.test.ts | 32 ++- tests/accuracy/find.test.ts | 58 ++--- tests/accuracy/insert-many.test.ts | 28 +-- tests/accuracy/list-collections.test.ts | 20 +- tests/accuracy/list-databases.test.ts | 16 +- tests/accuracy/logs.test.ts | 36 ++-- tests/accuracy/rename-collection.test.ts | 20 +- tests/accuracy/sdk/accuracy-scorer.ts | 114 ++++++++++ tests/accuracy/sdk/accuracy-scorers.ts | 60 ------ .../disk-snapshot-storage.ts | 16 +- .../get-snapshot-storage.ts | 13 -- .../mdb-snapshot-storage.ts | 1 - .../snapshot-storage.ts | 67 +++++- tests/accuracy/sdk/accuracy-testing-client.ts | 39 ++-- tests/accuracy/sdk/agent.ts | 14 +- tests/accuracy/sdk/constants.ts | 18 ++ tests/accuracy/sdk/describe-accuracy-tests.ts | 112 +++++----- tests/accuracy/sdk/models.ts | 28 ++- tests/accuracy/update-many.test.ts | 26 +-- tests/unit/accuracy-scorer.test.ts | 199 ++++++++++++++++++ 33 files changed, 739 insertions(+), 556 deletions(-) create mode 100644 tests/accuracy/sdk/accuracy-scorer.ts delete mode 100644 tests/accuracy/sdk/accuracy-scorers.ts create mode 100644 tests/accuracy/sdk/constants.ts create mode 100644 tests/unit/accuracy-scorer.test.ts 
diff --git a/.gitignore b/.gitignore index 2ac1f762..49550e27 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ state.json tests/tmp coverage -.accuracy-snapshots +# Generated assets by accuracy runs +.accuracy diff --git a/scripts/update-accuracy-run-status.ts b/scripts/update-accuracy-run-status.ts index 6d1a8bb8..6d8e3895 100644 --- a/scripts/update-accuracy-run-status.ts +++ b/scripts/update-accuracy-run-status.ts @@ -1,13 +1,9 @@ import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; -import { - AccuracyRunStatus, - AccuracyRunStatuses, -} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; +import { AccuracyRunStatus } from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID; const envAccuracyRunStatus = process.env.MDB_ACCURACY_RUN_STATUS; -let status: AccuracyRunStatuses | undefined; if ( !envAccuracyRunId || (envAccuracyRunStatus !== AccuracyRunStatus.Done && envAccuracyRunStatus !== AccuracyRunStatus.Failed) @@ -15,8 +11,8 @@ if ( process.exit(1); } -console.time(`Marked accuracy run id - ${envAccuracyRunId} as ${status} in`); +console.time(`Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`); const storage = await getAccuracySnapshotStorage(); await storage.updateAccuracyRunStatus(envAccuracyRunId, envAccuracyRunStatus); await storage.close(); -console.timeEnd(`Marked accuracy run id - ${envAccuracyRunId} as ${status} in`); +console.timeEnd(`Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`); diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts index 3da1ca32..30a5a0e3 100644 --- a/tests/accuracy/aggregate.test.ts +++ b/tests/accuracy/aggregate.test.ts @@ -1,28 +1,16 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from 
"./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -function callsAggregate(prompt: string, pipeline: Record[]): AccuracyTestConfig { - return { - injectConnectedAssumption: true, - prompt: prompt, - mockedTools: {}, +describeAccuracyTests(getAvailableModels(), [ + { + prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them", expectedToolCalls: [ { toolName: "aggregate", parameters: { - pipeline: pipeline, + pipeline: { $group: { _id: "$release_year", count: { $sum: 1 } } }, }, }, ], - }; -} - -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'aggregate' tool", [ - callsAggregate( - "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them", - [{ $group: { _id: "$release_year", count: { $sum: 1 } } }] - ), - ]), -}); + }, +]); diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts index e53ddb43..dab7d317 100644 --- a/tests/accuracy/collection-indexes.test.ts +++ b/tests/accuracy/collection-indexes.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCollectionIndexes(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "collection-indexes", @@ -19,12 +17,10 @@ function callsCollectionIndexes(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'collection-indexes' tool", [ - callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"), - 
callsCollectionIndexes("List all the indexes in movies collection in mflix database"), - callsCollectionIndexes( - `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?` - ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"), + callsCollectionIndexes("List all the indexes in movies collection in mflix database"), + callsCollectionIndexes( + `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?` + ), +]); diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts index 2866e709..f2f22a88 100644 --- a/tests/accuracy/collection-schema.test.ts +++ b/tests/accuracy/collection-schema.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCollectionSchema(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "collection-schema", @@ -19,9 +17,7 @@ function callsCollectionSchema(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'collection-schema' tool", [ - callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), - callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"), + callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"), +]); diff 
--git a/tests/accuracy/collection-storage-size.test.ts b/tests/accuracy/collection-storage-size.test.ts index dbb458e1..2bd2f021 100644 --- a/tests/accuracy/collection-storage-size.test.ts +++ b/tests/accuracy/collection-storage-size.test.ts @@ -1,20 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; -import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; -import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; -function callsCollectionStorageSize(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { - return { - injectConnectedAssumption: true, - prompt: prompt, - mockedTools: {}, - expectedToolCalls: expectedToolCalls, - }; -} - -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'collection-storage-size' tool", [ - callsCollectionStorageSize("What is the size of 'mflix.movies' namespace", [ +describeAccuracyTests(getAvailableModels(), [ + { + prompt: "What is the size of 'mflix.movies' namespace", + expectedToolCalls: [ { toolName: "collection-storage-size", parameters: { @@ -22,10 +12,11 @@ describeAccuracyTests(getAvailableModels(), { collection: "movies", }, }, - ]), - ]), - ...describeSuite("should call 'collection-storage-size' tool after another tool/s", [ - callsCollectionStorageSize("How much size is each collection in comics database", [ + ], + }, + { + prompt: "How much size is each collection in comics database", + expectedToolCalls: [ { toolName: "list-collections", parameters: { @@ -46,6 +37,6 @@ describeAccuracyTests(getAvailableModels(), { collection: "characters", }, }, - ]), - ]), -}); + ], + }, +]); diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts index 0543af76..09db4678 100644 --- a/tests/accuracy/count.test.ts +++ 
b/tests/accuracy/count.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCountToolWithEmptyQuery(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "count", @@ -26,9 +24,7 @@ function callsCountToolWithQuery( query: Record = {} ): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "count", @@ -42,19 +38,17 @@ function callsCountToolWithQuery( }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'count' tool", [ - callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."), - callsCountToolWithEmptyQuery( - "How many documents are there in 'characters' collection in 'comics' database?", - "comics", - "characters" - ), - callsCountToolWithQuery( - "Count all the documents in 'mflix.movies' namespace with runtime less than 100?", - "mflix", - "movies", - { runtime: { $lt: 100 } } - ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."), + callsCountToolWithEmptyQuery( + "How many documents are there in 'characters' collection in 'comics' database?", + "comics", + "characters" + ), + callsCountToolWithQuery( + "Count all the documents in 'mflix.movies' namespace with runtime less than 100?", + "mflix", + "movies", + { runtime: { $lt: 100 } } + ), +]); diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/create-collection.test.ts index d8a6266f..db7f888c 100644 --- 
a/tests/accuracy/create-collection.test.ts +++ b/tests/accuracy/create-collection.test.ts @@ -1,13 +1,11 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "create-collection", @@ -29,29 +27,25 @@ function callsCreateCollectionWithListCollections(prompt: string, expectedToolCa }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'create-collection' tool", [ - callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"), - callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"), - ]), - ...describeSuite("should call 'create-collection' alongside other required tools", [ - callsCreateCollectionWithListCollections( - "If and only if, the namespace 'mflix.documentaries' does not exist, then create it", - [ - { - toolName: "list-collections", - parameters: { - database: "mflix", - }, +describeAccuracyTests(getAvailableModels(), [ + callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"), + callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"), + callsCreateCollectionWithListCollections( + "If and only if, the namespace 'mflix.documentaries' does not exist, then create it", + [ + { + toolName: "list-collections", + parameters: { + database: "mflix", }, - { - toolName: "create-collection", - parameters: { - database: "mflix", - 
collection: "documentaries", - }, + }, + { + toolName: "create-collection", + parameters: { + database: "mflix", + collection: "documentaries", }, - ] - ), - ]), -}); + }, + ] + ), +]); diff --git a/tests/accuracy/create-index.test.ts b/tests/accuracy/create-index.test.ts index 82e98e92..6dae12e5 100644 --- a/tests/accuracy/create-index.test.ts +++ b/tests/accuracy/create-index.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsCreateIndex(prompt: string, indexKeys: Record): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "create-index", @@ -20,16 +18,14 @@ function callsCreateIndex(prompt: string, indexKeys: Record): A }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'create-index' tool", [ - callsCreateIndex( - "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }", - { - release_year: 1, - } - ), - callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", { - title: "text", - }), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsCreateIndex( + "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }", + { + release_year: 1, + } + ), + callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", { + title: "text", + }), +]); diff --git a/tests/accuracy/db-stats.test.ts b/tests/accuracy/db-stats.test.ts index b88fbb3c..656eccc2 100644 --- a/tests/accuracy/db-stats.test.ts +++ b/tests/accuracy/db-stats.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from 
"./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "db-stats", @@ -18,8 +16,4 @@ function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestCon }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'db-stats' tool", [ - callsListDatabases("What is the size occupied by database mflix?"), - ]), -}); +describeAccuracyTests(getAvailableModels(), [callsListDatabases("What is the size occupied by database mflix?")]); diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts index f9c03740..c0dd4d51 100644 --- a/tests/accuracy/delete-many.test.ts +++ b/tests/accuracy/delete-many.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "delete-many", @@ -21,9 +19,7 @@ function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig { function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "delete-many", @@ -37,12 +33,8 @@ function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig { }; } 
-describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'delete-many' tool", [ - callsDeleteManyWithEmptyFilters("Delete all the documents from 'mflix.movies' namespace"), - callsDeleteManyWithEmptyFilters("Purge the collection 'movies' in database 'mflix'"), - callsDeleteManyWithFilters( - "Remove all the documents from namespace 'mflix.movies' where runtime is less than 100" - ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsDeleteManyWithEmptyFilters("Delete all the documents from 'mflix.movies' namespace"), + callsDeleteManyWithEmptyFilters("Purge the collection 'movies' in database 'mflix'"), + callsDeleteManyWithFilters("Remove all the documents from namespace 'mflix.movies' where runtime is less than 100"), +]); diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/drop-collection.test.ts index 89f9cb70..98ba3348 100644 --- a/tests/accuracy/drop-collection.test.ts +++ b/tests/accuracy/drop-collection.test.ts @@ -1,13 +1,11 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "drop-collection", @@ -22,61 +20,55 @@ function onlyCallsDropCollection(prompt: string): AccuracyTestConfig { function callsDropCollection(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls, }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'drop-collection' tool", [ - 
onlyCallsDropCollection("Remove mflix.movies namespace from my cluster."), - onlyCallsDropCollection("Drop movies collection from mflix database."), - ]), - ...describeSuite("should call 'drop-collection' after calling other necessary tools", [ - callsDropCollection("Remove books collection from which ever database contains it.", [ - { - toolName: "list-databases", - parameters: {}, +describeAccuracyTests(getAvailableModels(), [ + onlyCallsDropCollection("Remove mflix.movies namespace from my cluster."), + onlyCallsDropCollection("Drop movies collection from mflix database."), + callsDropCollection("Remove books collection from which ever database contains it.", [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "list-collections", + parameters: { + database: "admin", }, - { - toolName: "list-collections", - parameters: { - database: "admin", - }, + }, + { + toolName: "list-collections", + parameters: { + database: "comics", }, - { - toolName: "list-collections", - parameters: { - database: "comics", - }, + }, + { + toolName: "list-collections", + parameters: { + database: "config", }, - { - toolName: "list-collections", - parameters: { - database: "config", - }, + }, + { + toolName: "list-collections", + parameters: { + database: "local", }, - { - toolName: "list-collections", - parameters: { - database: "local", - }, + }, + { + toolName: "list-collections", + parameters: { + database: "mflix", }, - { - toolName: "list-collections", - parameters: { - database: "mflix", - }, - }, - { - toolName: "drop-collection", - parameters: { - database: "comics", - collection: "books", - }, + }, + { + toolName: "drop-collection", + parameters: { + database: "comics", + collection: "books", }, - ]), + }, ]), -}); +]); diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/drop-database.test.ts index 0518d982..53fc7fd5 100644 --- a/tests/accuracy/drop-database.test.ts +++ b/tests/accuracy/drop-database.test.ts @@ -1,13 +1,11 @@ -import { 
describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "drop-database", @@ -21,30 +19,24 @@ function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig { function callsDropDatabase(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls, }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'drop-database' tool", [ - onlyCallsDropDatabase("Remove mflix database from my cluster."), - onlyCallsDropDatabase("Drop database named mflix."), - ]), - ...describeSuite("should call 'drop-database' after calling other necessary tools", [ - callsDropDatabase("If there is a mflix database in my cluster then drop it.", [ - { - toolName: "list-databases", - parameters: {}, - }, - { - toolName: "drop-database", - parameters: { - database: "mflix", - }, +describeAccuracyTests(getAvailableModels(), [ + onlyCallsDropDatabase("Remove mflix database from my cluster."), + onlyCallsDropDatabase("Drop database named mflix."), + callsDropDatabase("If there is a mflix database in my cluster then drop it.", [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "drop-database", + parameters: { + database: "mflix", }, - ]), + }, ]), -}); +]); diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts index 6e767981..4a539c48 100644 --- a/tests/accuracy/explain.test.ts +++ b/tests/accuracy/explain.test.ts @@ -1,12 
+1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsExplain(prompt: string, method: Record): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "explain", @@ -53,20 +51,14 @@ const callsExplainWithCount = (prompt: string) => * because we are using Zod.union, when we probably should've used * Zod.discriminatedUnion */ -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'explain' tool for a find query", [ - callsExplainWithFind( - `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` - ), - ]), - ...describeSuite("should call 'explain' tool for an aggregation", [ - callsExplainWithAggregate( - `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` - ), - ]), - ...describeSuite("should call 'explain' tool for count", [ - callsExplainWithCount( - `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` - ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsExplainWithFind( + `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), + callsExplainWithAggregate( + `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), + callsExplainWithCount( + `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?` + ), +]); diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts index ecfbe4f3..02c02cd1 100644 --- a/tests/accuracy/find.test.ts +++ 
b/tests/accuracy/find.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsFindNoFilter(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "find", @@ -21,9 +19,7 @@ function callsFindNoFilter(prompt: string, database = "mflix", collection = "mov function callsFindWithFilter(prompt: string, filter: Record): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "find", @@ -39,9 +35,7 @@ function callsFindWithFilter(prompt: string, filter: Record): A function callsFindWithProjection(prompt: string, projection: Record): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "find", @@ -61,9 +55,7 @@ function callsFindWithProjectionAndFilters( projection: Record ): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "find", @@ -85,9 +77,7 @@ function callsFindWithFilterSortAndLimit( limit: number ): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "find", @@ -103,27 +93,25 @@ function callsFindWithFilterSortAndLimit( }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call find tool", [ - callsFindNoFilter("List all the movies in 'mflix.movies' namespace."), - callsFindNoFilter("List all the documents in 'comics.books' namespace.", "comics", "books"), - callsFindWithFilter("Find all the movies in 'mflix.movies' 
namespace with runtime less than 100.", { - runtime: { $lt: 100 }, - }), - callsFindWithFilter("Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", { - director: "Christina Collins", - }), - callsFindWithProjection("Give me all the movie titles available in 'mflix.movies' namespace", { title: 1 }), - callsFindWithProjectionAndFilters( - "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", - { title: "Certain Fish" }, - { cast: 1 } - ), - callsFindWithFilterSortAndLimit( - "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime", - { genres: "Horror" }, - { runtime: 1 }, - 2 - ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsFindNoFilter("List all the movies in 'mflix.movies' namespace."), + callsFindNoFilter("List all the documents in 'comics.books' namespace.", "comics", "books"), + callsFindWithFilter("Find all the movies in 'mflix.movies' namespace with runtime less than 100.", { + runtime: { $lt: 100 }, + }), + callsFindWithFilter("Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", { + director: "Christina Collins", + }), + callsFindWithProjection("Give me all the movie titles available in 'mflix.movies' namespace", { title: 1 }), + callsFindWithProjectionAndFilters( + "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", + { title: "Certain Fish" }, + { cast: 1 } + ), + callsFindWithFilterSortAndLimit( + "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime", + { genres: "Horror" }, + { runtime: 1 }, + 2 + ), +]); diff --git a/tests/accuracy/insert-many.test.ts b/tests/accuracy/insert-many.test.ts index 25d60017..4ce15bb8 100644 --- a/tests/accuracy/insert-many.test.ts +++ b/tests/accuracy/insert-many.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from 
"./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsInsertMany(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "insert-many", @@ -35,9 +33,7 @@ function callsInsertMany(prompt: string): AccuracyTestConfig { function callsEmptyInsertMany(prompt: string) { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "insert-many", @@ -51,15 +47,13 @@ function callsEmptyInsertMany(prompt: string) { }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'insert-many' tool", [ - callsInsertMany( - [ - "In my namespace 'mflix.movies', insert 3 documents each with the following fields:", - "- id: an incremental number starting from 1", - "- name: a string of format 'name'", - ].join("\n") - ), - callsEmptyInsertMany("Add three empty documents in collection 'movies' inside database 'mflix'"), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsInsertMany( + [ + "In my namespace 'mflix.movies', insert 3 documents each with the following fields:", + "- id: an incremental number starting from 1", + "- name: a string of format 'name'", + ].join("\n") + ), + callsEmptyInsertMany("Add three empty documents in collection 'movies' inside database 'mflix'"), +]); diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts index a8455418..78a14f34 100644 --- a/tests/accuracy/list-collections.test.ts +++ b/tests/accuracy/list-collections.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from 
"./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsListCollections(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "list-collections", @@ -50,13 +48,9 @@ function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfi }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call list-collections tool", [ - callsListCollections("How many collections do I have in database mflix?"), - callsListCollections("List all the collections in my MongoDB database mflix."), - callsListCollections("Is there a shows collection in my MongoDB database mflix?"), - ]), - ...describeSuite("should call list-databases and list-collections tool", [ - callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsListCollections("How many collections do I have in database mflix?"), + callsListCollections("List all the collections in my MongoDB database mflix."), + callsListCollections("Is there a shows collection in my MongoDB database mflix?"), + callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"), +]); diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts index 0ef88712..97a8ce27 100644 --- a/tests/accuracy/list-databases.test.ts +++ b/tests/accuracy/list-databases.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsListDatabases(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - 
mockedTools: {}, expectedToolCalls: [ { toolName: "list-databases", @@ -16,10 +14,8 @@ function callsListDatabases(prompt: string): AccuracyTestConfig { }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call list-databases tool", [ - callsListDatabases("How many databases do I have?"), - callsListDatabases("List all the databases that I have in my clusters"), - callsListDatabases("Is there a mflix database in my cluster?"), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsListDatabases("How many databases do I have?"), + callsListDatabases("List all the databases that I have in my clusters"), + callsListDatabases("Is there a mflix database in my cluster?"), +]); diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts index 4ca148b9..8b9d2193 100644 --- a/tests/accuracy/logs.test.ts +++ b/tests/accuracy/logs.test.ts @@ -1,31 +1,27 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js"; function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [toolCall], }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should call 'logs' tool", [ - callsLogsTool("Were there any startup warnings for my MongoDB server?", { - toolName: "mongodb-logs", - parameters: { - type: "startupWarnings", - }, - }), - callsLogsTool("Retrieve first 10 logs for my MongoDB server?", { - toolName: "mongodb-logs", - parameters: { - type: "global", - limit: 10, - }, - }), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsLogsTool("Were there any startup 
warnings for my MongoDB server?", { + toolName: "mongodb-logs", + parameters: { + type: "startupWarnings", + }, + }), + callsLogsTool("Retrieve first 10 logs for my MongoDB server?", { + toolName: "mongodb-logs", + parameters: { + type: "global", + limit: 10, + }, + }), +]); diff --git a/tests/accuracy/rename-collection.test.ts b/tests/accuracy/rename-collection.test.ts index d8d46025..549a02b9 100644 --- a/tests/accuracy/rename-collection.test.ts +++ b/tests/accuracy/rename-collection.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsRenameCollection(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "rename-collection", @@ -22,9 +20,7 @@ function callsRenameCollection(prompt: string): AccuracyTestConfig { function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "rename-collection", @@ -39,11 +35,9 @@ function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig }; } -describeAccuracyTests(getAvailableModels(), { - ...describeSuite("should only call 'rename-collection' tool", [ - callsRenameCollection("Rename my 'mflix.movies' namespace to 'mflix.new_movies'"), - callsRenameCollectionWithDropTarget( - "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace." 
- ), - ]), -}); +describeAccuracyTests(getAvailableModels(), [ + callsRenameCollection("Rename my 'mflix.movies' namespace to 'mflix.new_movies'"), + callsRenameCollectionWithDropTarget( + "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace." + ), +]); diff --git a/tests/accuracy/sdk/accuracy-scorer.ts b/tests/accuracy/sdk/accuracy-scorer.ts new file mode 100644 index 00000000..2ae13e6c --- /dev/null +++ b/tests/accuracy/sdk/accuracy-scorer.ts @@ -0,0 +1,114 @@ +import diff from "microdiff"; +import { ExpectedToolCall, LLMToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; + +/** + * Tool calling accuracy is a single number calculated based on two dimensions. + * 1. Did LLM call the right tool? + * 2. Did LLM call the tool with correct and required parameters? + * + * The number can be one of: + * - 0: When LLM: + * - did not call the right tool + * - did not call the tool with correct parameters + * - 0.75: When LLM: + * - called the right tool but hallucinated and called some extra tools as + * well or called the same tool but with different parameters + * - called the right tool but hallucinated and called it with some + * non-required parameters + * - 1: When LLM: + * - called exactly the tools that were expected + * - called the expected tools exactly with the expected parameters + * + * To calculate this number we must have: + * 1. a list of expected tool calls with their expected parameters + * 2. a list of LLM tool calls with their parameters + * + * For each expected tool call we find the best matching LLM tool call. Best + * matching LLM tool call will have: + * 1. the same name as that of the expected tool call + * 2. highest parameter similarity score, with at-least 0.75 to ensure an actual + * match. And in case of competing scores, we take the first one that appears + * in the LLM tool calls. + * + * Using the above logic we establish pairs between expected and actual tool + * calls. + * + * 1. 
If we could not pair some LLM tool calls with expected tool calls that + * means the LLM hallucinated over the extra tool calls. For that reason we + * will cap the maximum achievable accuracy to 0.75. + * + * 2. If we could not pair some expected tool calls with LLM tool calls that + * means the LLM did not call one of the expected tool required to solve the + * problem. For that reason we will mark the accuracy as 0 and exit early. + * + * 3. Now for each of the established tool call pairs, we will determine how + * correctly the parameters were called using the parameter similarity score. + * The parameter similarity score follow the same accuracy number pattern + * described above: + * - 0 : for missing parameters, incorrect parameter values + * - 0.75 : for additional parameters + * - 1 : for a perfect match + * + * The final accuracy score is then calculated as the least of: + * - Maximum achievable accuracy from #1 + * - The least of parameter similarity score from the established pairs in #3 + * + * For examples: see the test cases in - tests/unit/accuracy-scorer.test.ts + */ +export function calculateToolCallingAccuracy( + expectedToolCalls: ExpectedToolCall[], + actualToolCalls: LLMToolCall[] +): number { + if (expectedToolCalls.length === 0) { + return actualToolCalls.length === 0 ? 1 : 0.75; + } + + const maxAccuracy = actualToolCalls.length > expectedToolCalls.length ? 
0.75 : 1; + + const individualAccuracies: number[] = []; + const checkedActualToolCallIndexes = new Set(); + + for (const expectedCall of expectedToolCalls) { + const candidates = actualToolCalls + .map((call, index) => ({ call, index })) + .filter( + ({ call, index }) => !checkedActualToolCallIndexes.has(index) && call.toolName === expectedCall.toolName + ) + .map(({ call, index }) => ({ + call, + index, + score: compareParams(expectedCall.parameters, call.parameters), + })) + .filter(({ score }) => score >= 0.75) + .sort((a, b) => b.score - a.score || a.index - b.index); + + const bestMatch = candidates[0]; + if (!bestMatch) { + individualAccuracies.push(0); + } else { + checkedActualToolCallIndexes.add(bestMatch.index); + const individualAccuracy = Math.min(bestMatch.score, maxAccuracy); + individualAccuracies.push(individualAccuracy); + } + } + + return Math.min(...individualAccuracies); +} + +function compareParams(expected: Record, actual: Record): number { + const differences = diff(expected, actual); + + if (differences.length === 0) { + return 1; + } + + const hasOnlyAdditions = differences.every((d) => d.type === "CREATE"); + const hasRemovals = differences.some((d) => d.type === "REMOVE"); + const hasChanges = differences.some((d) => d.type === "CHANGE"); + + if (hasOnlyAdditions && !hasRemovals && !hasChanges) { + return 0.75; + } + + return 0; +} diff --git a/tests/accuracy/sdk/accuracy-scorers.ts b/tests/accuracy/sdk/accuracy-scorers.ts deleted file mode 100644 index 612c3f80..00000000 --- a/tests/accuracy/sdk/accuracy-scorers.ts +++ /dev/null @@ -1,60 +0,0 @@ -import diff from "microdiff"; -import { ExpectedToolCall, ActualToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; - -export function calculateToolCallingAccuracy( - expectedToolCalls: ExpectedToolCall[], - actualToolCalls: ActualToolCall[] -): number { - if (expectedToolCalls.length === 0) { - return actualToolCalls.length === 0 ? 
1 : 0.75; - } - - const maxAccuracy = actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1; - - const individualAccuracies: number[] = []; - const checkedActualToolCallIndexes = new Set(); - - for (const expectedCall of expectedToolCalls) { - const candidates = actualToolCalls - .map((call, index) => ({ call, index })) - .filter( - ({ call, index }) => !checkedActualToolCallIndexes.has(index) && call.toolName === expectedCall.toolName - ) - .map(({ call, index }) => ({ - call, - index, - score: compareParams(expectedCall.parameters, call.parameters), - })) - .filter(({ score }) => score >= 0.75) - .sort((a, b) => b.score - a.score); - - const bestMatch = candidates[0]; - if (!bestMatch) { - individualAccuracies.push(0); - } else { - checkedActualToolCallIndexes.add(bestMatch.index); - const individualAccuracy = Math.min(bestMatch.score, maxAccuracy); - individualAccuracies.push(individualAccuracy); - } - } - - return Math.min(...individualAccuracies); -} - -function compareParams(expected: Record, actual: Record): number { - const differences = diff(expected, actual); - - if (differences.length === 0) { - return 1; - } - - const hasOnlyAdditions = differences.every((d) => d.type === "CREATE"); - const hasRemovals = differences.some((d) => d.type === "REMOVE"); - const hasChanges = differences.some((d) => d.type === "CHANGE"); - - if (hasOnlyAdditions && !hasRemovals && !hasChanges) { - return 0.75; - } - - return 0; -} diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts index a4d2bea0..a919e8f0 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts @@ -1,6 +1,4 @@ -import path from "path"; import fs from "fs/promises"; -import { fileURLToPath } from "url"; import { AccuracyRunStatus, AccuracyRunStatuses, @@ -8,10 +6,7 @@ import { 
AccuracySnapshotEntrySchema, AccuracySnapshotStorage, } from "./snapshot-storage.js"; -const __dirname = fileURLToPath(import.meta.url); -const rootDir = path.resolve(__dirname, "..", "..", "..", "..", ".."); -const snapshotsDir = path.resolve(rootDir, ".accuracy-snapshots"); -export const snapshotFilePath = path.resolve(snapshotsDir, "snapshots.json"); +import { GENERATED_ASSETS_DIR, LOCAL_SNAPSHOTS_FILE } from "../constants.js"; export class DiskSnapshotStorage implements AccuracySnapshotStorage { async createSnapshotEntry( @@ -21,7 +16,6 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { | "commitSHA" | "provider" | "requestedModel" - | "test" | "prompt" | "toolCallingAccuracy" | "expectedToolCalls" @@ -95,14 +89,14 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { } private async writeSnapshot(snapshot: AccuracySnapshotEntry[]): Promise { - const tmp = `${snapshotFilePath}~${Date.now()}`; + const tmp = `${LOCAL_SNAPSHOTS_FILE}~${Date.now()}`; await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2)); - await fs.rename(tmp, snapshotFilePath); + await fs.rename(tmp, LOCAL_SNAPSHOTS_FILE); } private async readSnapshot(): Promise { try { - const raw = await fs.readFile(snapshotFilePath, "utf8"); + const raw = await fs.readFile(LOCAL_SNAPSHOTS_FILE, "utf8"); return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw)); } catch (e: unknown) { if ((e as { code: string }).code === "ENOENT") { @@ -117,7 +111,7 @@ export class DiskSnapshotStorage implements AccuracySnapshotStorage { } static async getStorage() { - await fs.mkdir(snapshotsDir, { recursive: true }); + await fs.mkdir(GENERATED_ASSETS_DIR, { recursive: true }); return new DiskSnapshotStorage(); } } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts index 3bec4c53..da67aa60 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts +++ 
b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts @@ -1,20 +1,7 @@ -import { getCommitSHA } from "../git-info.js"; import { DiskSnapshotStorage } from "./disk-snapshot-storage.js"; import { MongoDBSnapshotStorage } from "./mdb-snapshot-storage.js"; import { AccuracySnapshotStorage } from "./snapshot-storage.js"; export async function getAccuracySnapshotStorage(): Promise { - const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID; - if (!accuracyRunId) { - throw new Error( - "Cannot create AccuracySnapshotStorage without an accuracyRunId - ensure that the relevant env variable is present." - ); - } - - const commitSHA = await getCommitSHA(); - if (!commitSHA) { - throw new Error("Cannot create AccuracySnapshotStorage without a commitSHA."); - } - return MongoDBSnapshotStorage.getStorage() ?? (await DiskSnapshotStorage.getStorage()); } diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index 2138b4f0..d3b1b56a 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -30,7 +30,6 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { | "commitSHA" | "provider" | "requestedModel" - | "test" | "prompt" | "toolCallingAccuracy" | "expectedToolCalls" diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts index e7833456..e0a6966d 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts @@ -1,13 +1,14 @@ import z from "zod"; -const ExpectedToolCallSchema = z.object({ +const LLMToolCallSchema = z.object({ + toolCallId: z.string(), toolName: z.string(), parameters: z.record(z.string(), z.unknown()), }); -export type ExpectedToolCall = z.infer; +export type 
LLMToolCall = z.infer; -const ActualToolCallSchema = ExpectedToolCallSchema.extend({ toolCallId: z.string() }); -export type ActualToolCall = z.infer; +const ExpectedToolCallSchema = LLMToolCallSchema.omit({ toolCallId: true }); +export type ExpectedToolCall = z.infer; export const AccuracyRunStatus = { Done: "done", @@ -18,23 +19,58 @@ export const AccuracyRunStatus = { export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus]; export const AccuracySnapshotEntrySchema = z.object({ - // Git and meta information for snapshot entries + /** + * A unique id for each accuracy run. Should either be generated by the + * script triggering the accuracy run or provided via environment variables. + * */ accuracyRunId: z.string(), + + /** + * Represents the status of accuracy run. Each test completion, during an + * accuracy run, is supposed to submit an accuracy snapshot entry with + * InProgress status which then later, after completion of accuracy run, is + * updated to either Done or Failed, depending on whether there were errors + * during the run or not. */ accuracyRunStatus: z .enum([AccuracyRunStatus.Done, AccuracyRunStatus.Failed, AccuracyRunStatus.InProgress]) .default(AccuracyRunStatus.InProgress), + + /** Timestamp of when this snapshot entry was generated. */ createdOn: z.number(), + + /** The commit SHA for which the accuracy run was triggered. */ commitSHA: z.string(), - // Accuracy info + + /** The LLM provider providing the LLM APIs */ provider: z.string(), + + /** The LLM which was requested to respond to our test prompts */ requestedModel: z.string(), - test: z.string(), + + /** The actual prompt that was provided to LLM as test */ prompt: z.string(), + + /** A number between 0 and 1, representing how accurately the expected tools + * were called by LLM when responding to the provided prompts. 
To know more + * about how this number is generated, check - toolCallingAccuracy.ts */ toolCallingAccuracy: z.number(), - // debug info for further investigations + + /** + * A list of tools, along with their parameters, that are expected to be + * called by the LLM in test. */ expectedToolCalls: ExpectedToolCallSchema.array(), - actualToolCalls: ActualToolCallSchema.array(), + + /** + * A list of tools, along with their parameters, that were actually called + * by the LLM in test. */ + actualToolCalls: LLMToolCallSchema.array(), + + /** + * The total time taken by LLM to respond to our prompt. */ llmResponseTime: z.number(), + + /** + * Token usage data, returned as part of LLM prompt response. */ tokensUsage: z .object({ promptTokens: z.number().optional(), @@ -42,8 +78,20 @@ export const AccuracySnapshotEntrySchema = z.object({ totalTokens: z.number().optional(), }) .optional(), + + /** + * The ID of the model that actually responded to our prompt request. */ respondingModel: z.string(), + + /** + * The final response text generated by the LLM, in response to our prompt + * request. */ text: z.string(), + + /** + * A list of messages, exchanged between LLM and our testing agent, in + * response to our prompt request. This is particularly helpful for + * debugging. 
*/ messages: z.array(z.record(z.string(), z.unknown())), }); @@ -57,7 +105,6 @@ export interface AccuracySnapshotStorage { | "commitSHA" | "provider" | "requestedModel" - | "test" | "prompt" | "toolCallingAccuracy" | "expectedToolCalls" diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts index 4a8ad279..d2486942 100644 --- a/tests/accuracy/sdk/accuracy-testing-client.ts +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -1,22 +1,27 @@ -import path from "path"; import { v4 as uuid } from "uuid"; -import { fileURLToPath } from "url"; import { experimental_createMCPClient as createMCPClient, tool as createVercelTool } from "ai"; import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; -import { ActualToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; - -const __dirname = fileURLToPath(import.meta.url); -const distPath = path.join(__dirname, "..", "..", "..", "..", "dist"); -const cliScriptPath = path.join(distPath, "index.js"); +import { MCP_SERVER_CLI_SCRIPT } from "./constants.js"; +import { LLMToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; export type MockedTools = Record; +/** + * AccuracyTestingClient is a bridge between actual MCP client connected to our + * MCP server and our Tool calling agent. Its serves the following purposes: + * 1. Captures actual tools provided by our MCP server + * 2. Translates captured MCP tools to tool definitions that can be consumed by + * Tool Calling agent (Ref: `vercelTools`) + * 3. Allow dynamic mocking and resetting of mocks of individual tool calls. + * 4. Records and provides tool calls made by LLMs with their parameters. 
+ */ export class AccuracyTestingClient { private mockedTools: MockedTools = {}; - private recordedToolCalls: ActualToolCall[] = []; + private llmToolCalls: LLMToolCall[] = []; + private constructor(private readonly vercelMCPClient: Awaited>) {} async close() { @@ -30,7 +35,7 @@ export class AccuracyTestingClient { rewrappedVercelTools[toolName] = createVercelTool({ ...tool, execute: async (args, options) => { - this.recordedToolCalls.push({ + this.llmToolCalls.push({ toolCallId: uuid(), toolName: toolName, parameters: args as Record, @@ -44,10 +49,10 @@ export class AccuracyTestingClient { return await tool.execute(args, options); } catch (error) { // There are cases when LLM calls the tools incorrectly - // and the schema definition check fails. Normally a - // tool calling agent will handle the error case but - // because we are wrapping the tool definition ourselves - // we have to handle this ourselves as well. + // and the schema definition check fails. In production, + // the tool calling agents are deployed with this fail + // safe to allow LLM to course correct themselves. That + // is exactly what we do here as well. 
return { isError: true, content: JSON.stringify(error), @@ -60,8 +65,8 @@ export class AccuracyTestingClient { return rewrappedVercelTools; } - getToolCalls() { - return this.recordedToolCalls; + getLLMToolCalls() { + return this.llmToolCalls; } mockTools(mockedTools: MockedTools) { @@ -70,13 +75,13 @@ export class AccuracyTestingClient { resetForTests() { this.mockTools({}); - this.recordedToolCalls = []; + this.llmToolCalls = []; } static async initializeClient(mdbConnectionString: string) { const clientTransport = new StdioClientTransport({ command: process.execPath, - args: [cliScriptPath, "--connectionString", mdbConnectionString], + args: [MCP_SERVER_CLI_SCRIPT, "--connectionString", mdbConnectionString], }); const client = await createMCPClient({ diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts index 4b5d2621..ee0b5f7f 100644 --- a/tests/accuracy/sdk/agent.ts +++ b/tests/accuracy/sdk/agent.ts @@ -10,14 +10,13 @@ const systemPrompt = [ 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', ]; -// Some necessary types from Vercel SDK +// These types are not exported by Vercel SDK so we derive them here to be +// re-used again. 
export type VercelMCPClient = Awaited>; export type VercelMCPClientTools = Awaited>; export type VercelAgent = ReturnType; -// Generic interface for Agent, in case we need to switch to some other agent -// development SDK -export interface AgentPromptResult { +export interface VercelAgentPromptResult { respondingModel: string; tokensUsage?: { promptTokens?: number; @@ -27,18 +26,21 @@ export interface AgentPromptResult { text: string; messages: Record[]; } + +// Generic interface for Agent, in case we need to switch to some other agent +// development SDK export interface Agent { prompt(prompt: string, model: Model, tools: Tools): Promise; } export function getVercelToolCallingAgent( requestedSystemPrompt?: string -): Agent, VercelMCPClientTools, AgentPromptResult> { +): Agent, VercelMCPClientTools, VercelAgentPromptResult> { return { async prompt(prompt: string, model: Model, tools: VercelMCPClientTools) { const result = await generateText({ model: model.getModel(), - system: [...systemPrompt, requestedSystemPrompt].join("\n"), + system: [...systemPrompt, requestedSystemPrompt].filter(Boolean).join("\n"), prompt, tools, maxSteps: 100, diff --git a/tests/accuracy/sdk/constants.ts b/tests/accuracy/sdk/constants.ts new file mode 100644 index 00000000..cd46a306 --- /dev/null +++ b/tests/accuracy/sdk/constants.ts @@ -0,0 +1,18 @@ +import path from "path"; +import { fileURLToPath } from "url"; + +const __dirname = fileURLToPath(import.meta.url); + +export const ROOT_DIR = path.join(__dirname, "..", "..", "..", ".."); + +export const DIST_DIR = path.join(ROOT_DIR, "dist"); + +export const MCP_SERVER_CLI_SCRIPT = path.join(DIST_DIR, "index.js"); + +export const TEST_DATA_DUMPS_DIR = path.join(__dirname, "test-data-dumps"); + +export const GENERATED_ASSETS_DIR = path.join(ROOT_DIR, ".accuracy"); + +export const LOCAL_SNAPSHOTS_FILE = path.join(GENERATED_ASSETS_DIR, "snapshots.json"); + +export const HTML_REPORT_FILE = path.join(GENERATED_ASSETS_DIR, "report.html"); diff 
--git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts index 1dd6d971..2a358ce1 100644 --- a/tests/accuracy/sdk/describe-accuracy-tests.ts +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -1,5 +1,5 @@ import { TestableModels } from "./models.js"; -import { calculateToolCallingAccuracy } from "./accuracy-scorers.js"; +import { calculateToolCallingAccuracy } from "./accuracy-scorer.js"; import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; @@ -8,25 +8,39 @@ import { AccuracySnapshotStorage, ExpectedToolCall } from "./accuracy-snapshot-s import { getCommitSHA } from "./git-info.js"; export interface AccuracyTestConfig { - systemPrompt?: string; - injectConnectedAssumption?: boolean; + /** The prompt to be provided to LLM for evaluation. */ prompt: string; + + /** + * A list of tools and their parameters that we expect LLM to call based on + * how vague or detailed the prompt is. Ideally this should be a list of + * bare minimum and critical tool calls that are required to solve the + * problem mentioned in the prompt but because, for even a slightly vague + * prompt, LLM might decide to do additional confirmation by calling other + * tools, its fine to include those other tool calls as well to get a + * perfect 1 on the tool calling accuracy score. */ expectedToolCalls: ExpectedToolCall[]; - mockedTools: MockedTools; -} -export function describeSuite(suiteName: string, testConfigs: AccuracyTestConfig[]) { - return { - [suiteName]: testConfigs, - }; + /** + * The additional system prompt to be appended to already injected system + * prompt. 
*/ + systemPrompt?: string; + + /** + * A small hint appended to the actual prompt in test, which is supposed to + * hint LLM to assume that the MCP server is already connected so that it + * does not call the connect tool. + * By default it is assumed to be true */ + injectConnectedAssumption?: boolean; + + /** + * A map of tool names to their mocked implementation. When the mocked + * implementations are available, the testing client will prefer those over + * actual MCP tool calls. */ + mockedTools?: MockedTools; } -export function describeAccuracyTests( - models: TestableModels, - accuracyTestConfigs: { - [suiteName: string]: AccuracyTestConfig[]; - } -) { +export function describeAccuracyTests(models: TestableModels, accuracyTestConfigs: AccuracyTestConfig[]) { if (!process.env.MDB_ACCURACY_RUN_ID) { throw new Error("MDB_ACCURACY_RUN_ID env variable is required for accuracy test runs!"); } @@ -36,13 +50,12 @@ export function describeAccuracyTests( } const eachModel = describe.each(models); - const eachSuite = describe.each(Object.keys(accuracyTestConfigs)); eachModel(`$displayName`, function (model) { + const accuracyRunId = `${process.env.MDB_ACCURACY_RUN_ID}`; const mdbIntegration = setupMongoDBIntegrationTest(); const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); - const accuracyRunId: string = `${process.env.MDB_ACCURACY_RUN_ID}`; let commitSHA: string; let accuracySnapshotStorage: AccuracySnapshotStorage; let testMCPClient: AccuracyTestingClient; @@ -53,8 +66,8 @@ export function describeAccuracyTests( if (!retrievedCommitSHA) { throw new Error("Could not derive commitSHA, exiting accuracy tests!"); } - commitSHA = retrievedCommitSHA; + accuracySnapshotStorage = await getAccuracySnapshotStorage(); testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); agent = getVercelToolCallingAgent(); @@ -67,40 +80,39 @@ export function describeAccuracyTests( }); afterAll(async () => { - await 
accuracySnapshotStorage.close(); - await testMCPClient.close(); + await accuracySnapshotStorage?.close(); + await testMCPClient?.close(); }); - eachSuite("%s", function (suiteName) { - const eachTest = it.each(accuracyTestConfigs[suiteName] ?? []); - - eachTest("$prompt", async function (testConfig) { - testMCPClient.mockTools(testConfig.mockedTools); - const toolsForModel = await testMCPClient.vercelTools(); - const promptForModel = testConfig.injectConnectedAssumption - ? [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" ") - : testConfig.prompt; - - const timeBeforePrompt = Date.now(); - const result = await agent.prompt(promptForModel, model, toolsForModel); - const timeAfterPrompt = Date.now(); - const toolCalls = testMCPClient.getToolCalls(); - const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, toolCalls); - - const responseTime = timeAfterPrompt - timeBeforePrompt; - await accuracySnapshotStorage.createSnapshotEntry({ - accuracyRunId, - commitSHA, - provider: model.provider, - requestedModel: model.modelName, - test: suiteName, - prompt: testConfig.prompt, - llmResponseTime: responseTime, - toolCallingAccuracy: toolCallingAccuracy, - actualToolCalls: toolCalls, - expectedToolCalls: testConfig.expectedToolCalls, - ...result, - }); + const eachTest = it.each(accuracyTestConfigs); + + eachTest("$prompt", async function (testConfig) { + testMCPClient.mockTools(testConfig.mockedTools ?? {}); + const toolsForModel = await testMCPClient.vercelTools(); + const promptForModel = + testConfig.injectConnectedAssumption === false + ? 
testConfig.prompt + : [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" "); + + const timeBeforePrompt = Date.now(); + const result = await agent.prompt(promptForModel, model, toolsForModel); + const timeAfterPrompt = Date.now(); + + const llmToolCalls = testMCPClient.getLLMToolCalls(); + const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, llmToolCalls); + + const responseTime = timeAfterPrompt - timeBeforePrompt; + await accuracySnapshotStorage.createSnapshotEntry({ + accuracyRunId, + commitSHA, + provider: model.provider, + requestedModel: model.modelName, + prompt: testConfig.prompt, + llmResponseTime: responseTime, + toolCallingAccuracy: toolCallingAccuracy, + actualToolCalls: llmToolCalls, + expectedToolCalls: testConfig.expectedToolCalls, + ...result, }); }); }); diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts index 70b80435..9f47028f 100644 --- a/tests/accuracy/sdk/models.ts +++ b/tests/accuracy/sdk/models.ts @@ -14,11 +14,10 @@ export interface Model

{ export class OpenAIModel implements Model { readonly provider = "OpenAI"; + readonly displayName: string; - constructor(readonly modelName: string) {} - - get displayName(): string { - return `${this.provider} - ${this.modelName}`; + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; } isAvailable(): boolean { @@ -34,11 +33,10 @@ export class OpenAIModel implements Model { export class AzureOpenAIModel implements Model { readonly provider = "Azure"; + readonly displayName: string; - constructor(readonly modelName: string) {} - - get displayName(): string { - return `${this.provider} - ${this.modelName}`; + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; } isAvailable(): boolean { @@ -56,11 +54,10 @@ export class AzureOpenAIModel implements Model { export class GeminiModel implements Model { readonly provider = "Google"; + readonly displayName: string; - constructor(readonly modelName: string) {} - - get displayName(): string { - return `${this.provider} - ${this.modelName}`; + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; } isAvailable(): boolean { @@ -76,11 +73,10 @@ export class GeminiModel implements Model { export class OllamaModel implements Model { readonly provider = "Ollama"; + readonly displayName: string; - constructor(readonly modelName: string) {} - - get displayName(): string { - return `${this.provider} - ${this.modelName}`; + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; } isAvailable(): boolean { diff --git a/tests/accuracy/update-many.test.ts b/tests/accuracy/update-many.test.ts index 4b82fbfb..86f96705 100644 --- a/tests/accuracy/update-many.test.ts +++ b/tests/accuracy/update-many.test.ts @@ -1,12 +1,10 @@ -import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js"; +import { describeAccuracyTests } from 
"./sdk/describe-accuracy-tests.js"; import { getAvailableModels } from "./sdk/models.js"; import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "update-many", @@ -26,9 +24,7 @@ function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { function callsUpdateManyWithFilters(prompt: string, filter: Record): AccuracyTestConfig { return { - injectConnectedAssumption: true, prompt: prompt, - mockedTools: {}, expectedToolCalls: [ { toolName: "update-many", @@ -47,14 +43,12 @@ function callsUpdateManyWithFilters(prompt: string, filter: Record { + describe("edge cases", () => { + it("should return 1 when both expected and actual are empty", () => { + const result = calculateToolCallingAccuracy([], []); + expect(result).toBe(1); + }); + + it("should return 0.75 when expected is empty but actual has tool calls", () => { + const actualToolCalls: LLMToolCall[] = [{ toolCallId: "1", toolName: "find", parameters: { db: "test" } }]; + const result = calculateToolCallingAccuracy([], actualToolCalls); + expect(result).toBe(0.75); + }); + + it("should return 0 when expected has tool calls but actual is empty", () => { + const expectedToolCalls: ExpectedToolCall[] = [{ toolName: "find", parameters: { db: "test" } }]; + const result = calculateToolCallingAccuracy(expectedToolCalls, []); + expect(result).toBe(0); + }); + }); + + describe("perfect matches", () => { + it("should return 1 for exact match with nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { db: "test", collection: "users", filter: { age: { $gte: 18 }, status: "active" } }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { age: { $gte: 18 }, 
status: "active" } }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + + it("should return 1 for exact match with multiple diverse tool calls", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + { toolName: "count", parameters: { db: "test", collection: "products" } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + { + toolCallId: "2", + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + { toolCallId: "3", toolName: "count", parameters: { db: "test", collection: "products" } }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + }); + + describe("additional parameters", () => { + it("should return 0.75 when tool call has additional nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + db: "test", + collection: "users", + filter: { status: "active", age: { $gte: 18 } }, + limit: 10, + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + }); + + describe("missing or incorrect parameters", () => { + it("should return 0 when tool call has missing nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active", age: { $gte: 18 } } }, + }, + ]; + 
const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + + it("should return 0 when aggregate tool call has incorrect pipeline", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $lt: 50 } } }] }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + }); + + describe("additional tool calls", () => { + it("should cap accuracy at 0.75 when LLM calls extra tools", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + { toolCallId: "2", toolName: "count", parameters: { db: "test", collection: "orders" } }, + { + toolCallId: "3", + toolName: "aggregate", + parameters: { + db: "test", + collection: "products", + pipeline: [{ $group: { _id: "$category", total: { $sum: 1 } } }], + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + + it("should cap accuracy at 0.75 when LLM calls same tool multiple times with variations", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: 
"users", filter: { status: "active" } }, + }, + { + toolCallId: "2", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active", age: { $gte: 18 } } }, + }, + { toolCallId: "3", toolName: "find", parameters: { db: "test", collection: "users", limit: 10 } }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + }); + + describe("missing tool calls", () => { + it("should return 0 if any expected tool call was not called", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + // Missing the aggregate tool call + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); // One expected tool call was not called + }); + }); +}); From d7b1c57b71e2b5fcf206fd909739f00a8ea024d7 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 10:52:36 +0200 Subject: [PATCH 39/47] chore: generate accuracy test summary post test --- resources/test-summary-template.html | 337 +++++++++++++++++++++++++++ scripts/generate-test-summary.ts | 156 +++++++++++++ scripts/run-accuracy-tests.sh | 1 + tests/accuracy/sdk/constants.ts | 6 +- 4 files changed, 499 insertions(+), 1 deletion(-) create mode 100644 resources/test-summary-template.html create mode 100644 scripts/generate-test-summary.ts diff --git a/resources/test-summary-template.html b/resources/test-summary-template.html new file mode 100644 index 00000000..318e9550 --- /dev/null +++ b/resources/test-summary-template.html @@ -0,0 +1,337 @@ + + + + + + MongoDB MCP Server - Accuracy Test Summary + + + +

+

📊 MongoDB MCP Server - Accuracy Test Summary

+
+

Run Information & Summary

+
+
+
Accuracy Run ID
+
{{accuracyRunId}}
+
+
+
Accuracy Run Status
+
{{runStatusUpper}}
+
+
+
Commit SHA
+
{{commitSHA}}
+
+
+
Report Generated On
+
{{reportGeneratedOn}}
+
+
+
Snapshots Captured On
+
{{createdOn}}
+
+
+
Total Prompts Evaluated
+
{{totalTests}}
+
+
+
Models Tested
+
{{modelsCount}}
+
+
+
Evals with 0% Accuracy
+
{{testsWithZeroAccuracy}}
+
+
+
+ + + + + + + + + + + + + + {{tableRows}} + +
PromptModelExpected Tool CallsLLM Tool CallsAccuracyLLM Response Time (ms)Total Tokens Used
+
+ + + diff --git a/scripts/generate-test-summary.ts b/scripts/generate-test-summary.ts new file mode 100644 index 00000000..efeacbdc --- /dev/null +++ b/scripts/generate-test-summary.ts @@ -0,0 +1,156 @@ +import { readFile, writeFile } from "fs/promises"; +import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; +import { HTML_TESTS_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE } from "../tests/accuracy/sdk/constants.js"; +import type { + AccuracySnapshotEntry, + ExpectedToolCall, + LLMToolCall, +} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; + +function populateTemplate(template: string, data: Record): string { + return template.replace(/\{\{(\w+)\}\}/g, (_, key: string) => data[key] ?? ""); +} + +function formatAccuracy(accuracy: number): string { + return (accuracy * 100).toFixed(1) + "%"; +} + +function getAccuracyClass(accuracy: number): string { + if (accuracy === 1) return "accuracy-perfect"; + if (accuracy >= 0.75) return "accuracy-good"; + return "accuracy-poor"; +} + +function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[]): string { + return toolCalls + .map((call) => { + const params = JSON.stringify(call.parameters, null, 2); + return `${call.toolName}`; + }) + .join(", "); +} + +function formatTokenUsage(tokensUsage: { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; +}): string { + const total = tokensUsage.totalTokens || 0; + const prompt = tokensUsage.promptTokens || 0; + const completion = tokensUsage.completionTokens || 0; + + const tooltip = `Prompt: ${prompt}\nCompletion: ${completion}\nTotal: ${total}`; + return `${total}`; +} + +function formatMessages(messages: Array>): string { + return messages.map((msg) => JSON.stringify(msg, null, 2)).join("\n\n"); +} + +async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accuracyRunId: string): Promise { + const totalPrompts = 
snapshotEntries.length; + const modelsCount = new Set(snapshotEntries.map((s) => `${s.provider} ${s.requestedModel}`)).size; + const testsWithZeroAccuracy = snapshotEntries.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0); + + const firstSnapshotEntry = snapshotEntries[0]; + const runStatus = firstSnapshotEntry?.accuracyRunStatus || "unknown"; + const commitSHA = firstSnapshotEntry?.commitSHA || "unknown"; + const createdOn = firstSnapshotEntry?.createdOn + ? new Date(firstSnapshotEntry.createdOn).toLocaleString() + : "unknown"; + const reportGeneratedOn = new Date().toLocaleString(); + + const tableRows = snapshotEntries + .map( + (snapshotEntry, index) => ` + + + ā–¶ + ${snapshotEntry.prompt} + + ${snapshotEntry.provider} - ${snapshotEntry.requestedModel} + ${formatToolCallsWithTooltip(snapshotEntry.expectedToolCalls)} + ${formatToolCallsWithTooltip(snapshotEntry.actualToolCalls)} + + + ${formatAccuracy(snapshotEntry.toolCallingAccuracy)} + + + ${snapshotEntry.llmResponseTime.toFixed(2)} + ${formatTokenUsage(snapshotEntry.tokensUsage || {})} + + + +
+
+

🤖 LLM Response

+
${snapshotEntry.text}
+
+
+

💬 Conversation Messages

+
${formatMessages(snapshotEntry.messages)}
+
+
+ + + ` + ) + .join(""); + + // Read template file + const template = await readFile(HTML_TESTS_SUMMARY_TEMPLATE, "utf8"); + // Fill template + return populateTemplate(template, { + accuracyRunId, + runStatus, + runStatusUpper: runStatus.toUpperCase(), + commitSHA, + reportGeneratedOn, + createdOn, + totalTests: String(totalPrompts), + modelsCount: String(modelsCount), + testsWithZeroAccuracy: String(testsWithZeroAccuracy.length), + tableRows, + }); +} + +async function generateTestSummary(): Promise { + try { + const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID; + if (!accuracyRunId) { + throw new Error("Cannot generate test summary, accuracy run id is unknown"); + } + console.log(`\nšŸ“Š Generating test summary for accuracy run: ${accuracyRunId}\n`); + + const storage = await getAccuracySnapshotStorage(); + const snapshot = await storage.getSnapshotForAccuracyRun(accuracyRunId); + await storage.close(); + + if (snapshot.length === 0) { + console.log("No snapshots found for the current run."); + return; + } + + const htmlReport = await generateHtmlReport(snapshot, accuracyRunId); + + const reportPath = HTML_TESTS_SUMMARY_FILE; + await writeFile(reportPath, htmlReport, "utf8"); + + console.log(`āœ… HTML report generated: ${reportPath}`); + + const totalPrompts = snapshot.length; + const modelsCount = new Set(snapshot.map((s) => `${s.provider} ${s.requestedModel}`)).size; + const testsWithZeroAccuracy = snapshot.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0); + + console.log(`\nšŸ“ˆ Summary:`); + console.log(` Total prompts evaluated: ${totalPrompts}`); + console.log(` Models tested: ${modelsCount}`); + console.log(` Evals with 0% accuracy: ${testsWithZeroAccuracy.length}`); + console.log(` Report saved to: ${reportPath}\n`); + } catch (error) { + console.error("Error generating test summary:", error); + process.exit(1); + } +} + +void generateTestSummary(); diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index 
e009661f..d6df473f 100644 --- a/scripts/run-accuracy-tests.sh +++ b/scripts/run-accuracy-tests.sh @@ -39,6 +39,7 @@ JEST_EXIT_CODE=$? # want to compare against an incomplete run. if [ $JEST_EXIT_CODE -eq 0 ]; then MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'done'" + npx tsx scripts/generate-test-summary.ts || echo "Warning: Failed to generate test summary HTML report" else MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'failed'" fi diff --git a/tests/accuracy/sdk/constants.ts b/tests/accuracy/sdk/constants.ts index cd46a306..0598b1a7 100644 --- a/tests/accuracy/sdk/constants.ts +++ b/tests/accuracy/sdk/constants.ts @@ -7,6 +7,8 @@ export const ROOT_DIR = path.join(__dirname, "..", "..", "..", ".."); export const DIST_DIR = path.join(ROOT_DIR, "dist"); +export const RESOURCES_DIR = path.join(ROOT_DIR, "resources"); + export const MCP_SERVER_CLI_SCRIPT = path.join(DIST_DIR, "index.js"); export const TEST_DATA_DUMPS_DIR = path.join(__dirname, "test-data-dumps"); @@ -15,4 +17,6 @@ export const GENERATED_ASSETS_DIR = path.join(ROOT_DIR, ".accuracy"); export const LOCAL_SNAPSHOTS_FILE = path.join(GENERATED_ASSETS_DIR, "snapshots.json"); -export const HTML_REPORT_FILE = path.join(GENERATED_ASSETS_DIR, "report.html"); +export const HTML_TESTS_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "tests-summary.html"); + +export const HTML_TESTS_SUMMARY_TEMPLATE = path.join(RESOURCES_DIR, "test-summary-template.html"); From 6c25f1b30ac8f17c7c55f03c7c27b780d7eabab2 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 11:55:15 +0200 Subject: [PATCH 40/47] chore: add Github workflow to trigger test runs --- .github/workflows/accuracy-tests.yml | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .github/workflows/accuracy-tests.yml diff --git 
a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml new file mode 100644 index 00000000..89188c16 --- /dev/null +++ b/.github/workflows/accuracy-tests.yml @@ -0,0 +1,44 @@ +name: Accuracy Tests + +on: + workflow_dispatch: + pull_request: + types: [labeled] + +jobs: + run-accuracy-tests: + name: Run Accuracy Tests + runs-on: ubuntu-latest + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests') + env: + MDB_OPEN_AI_API_KEY: ${{ secrets.MDB_OPEN_AI_API_KEY }} + MDB_GEMINI_API_KEY: ${{ secrets.MDB_GEMINI_API_KEY }} + MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }} + MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} + MDB_ACCURACY_MDB_URL: ${{ secrets.MDB_ACCURACY_MDB_URL }} + MDB_ACCURACY_MDB_DB: ${{ secrets.MDB_ACCURACY_MDB_DB }} + MDB_ACCURACY_MDB_COLLECTION: ${{ secrets.MDB_ACCURACY_MDB_COLLECTION }} + steps: + - uses: GitHubSecurityLab/actions-permissions/monitor@v1 + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version-file: package.json + cache: 'npm' + - name: Install dependencies + run: npm ci + - name: Run accuracy tests + run: ./scripts/run-accuracy-tests.sh + - name: Upload accuracy test summary + if: always() + uses: actions/upload-artifact@v4 + with: + name: accuracy-test-summary + path: .accuracy/tests-summary.html + - name: Comment summary on PR + if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests' + uses: marocchino/sticky-pull-request-comment@v2 + with: + path: .accuracy/tests-summary.html \ No newline at end of file From 6da95382d42063173409c81d3aa3a9407a997c29 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 13:21:42 +0200 Subject: [PATCH 41/47] chore: fix permissions issue --- .github/workflows/accuracy-tests.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 89188c16..955b792a 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -9,6 +9,9 @@ jobs: run-accuracy-tests: name: Run Accuracy Tests runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write if: | github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests') @@ -26,7 +29,7 @@ jobs: - uses: actions/setup-node@v4 with: node-version-file: package.json - cache: 'npm' + cache: "npm" - name: Install dependencies run: npm ci - name: Run accuracy tests @@ -41,4 +44,4 @@ jobs: if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests' uses: marocchino/sticky-pull-request-comment@v2 with: - path: .accuracy/tests-summary.html \ No newline at end of file + path: .accuracy/tests-summary.html From 6ccaa11590f56b888e532870689628805181b4e5 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 13:35:27 +0200 Subject: [PATCH 42/47] chore: bring back packages post merge --- package-lock.json | 9 +++++++++ package.json | 10 ++++++++++ scripts/run-accuracy-tests.sh | 2 +- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/package-lock.json b/package-lock.json index 9a4282e7..a3bf47c4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -30,7 +30,11 @@ "mongodb-mcp-server": "dist/index.js" }, "devDependencies": { + "@ai-sdk/anthropic": "^1.2.12", + "@ai-sdk/azure": "^1.3.23", + "@ai-sdk/openai": "^1.3.22", "@eslint/js": "^9.30.1", + "@himanshusinghs/google": "^1.2.11", "@jest/globals": "^30.0.4", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", @@ -38,6 +42,7 @@ "@types/node": "^24.0.12", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", + "ai": "^4.3.16", "eslint": "^9.30.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-jest": "^29.0.1", @@ -46,14 +51,18 @@ "jest": 
"^30.0.4", "jest-environment-node": "^30.0.4", "jest-extended": "^6.0.0", + "microdiff": "^1.5.0", "mongodb-runner": "^5.9.2", + "ollama-ai-provider": "^1.2.0", "openapi-types": "^12.1.3", "openapi-typescript": "^7.8.0", "prettier": "^3.6.2", + "simple-git": "^3.28.0", "ts-jest": "^29.4.0", "tsx": "^4.20.3", "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", + "uuid": "^11.1.0", "yaml": "^2.8.0" }, "engines": { diff --git a/package.json b/package.json index e978f7bf..53639aec 100644 --- a/package.json +++ b/package.json @@ -30,11 +30,16 @@ "reformat": "prettier --write .", "generate": "./scripts/generate.sh", "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathIgnorePatterns=/tests/accuracy/", + "pre:test:accuracy": "npm run build:compile", "test:accuracy": "sh ./scripts/run-accuracy-tests.sh" }, "license": "Apache-2.0", "devDependencies": { + "@ai-sdk/anthropic": "^1.2.12", + "@ai-sdk/azure": "^1.3.23", + "@ai-sdk/openai": "^1.3.22", "@eslint/js": "^9.30.1", + "@himanshusinghs/google": "^1.2.11", "@jest/globals": "^30.0.4", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", @@ -42,6 +47,7 @@ "@types/node": "^24.0.12", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", + "ai": "^4.3.16", "eslint": "^9.30.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-jest": "^29.0.1", @@ -50,14 +56,18 @@ "jest": "^30.0.4", "jest-environment-node": "^30.0.4", "jest-extended": "^6.0.0", + "microdiff": "^1.5.0", "mongodb-runner": "^5.9.2", + "ollama-ai-provider": "^1.2.0", "openapi-types": "^12.1.3", "openapi-typescript": "^7.8.0", "prettier": "^3.6.2", + "simple-git": "^3.28.0", "ts-jest": "^29.4.0", "tsx": "^4.20.3", "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", + "uuid": "^11.1.0", "yaml": "^2.8.0" }, "dependencies": { diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh index d6df473f..ae02dd06 100644 --- a/scripts/run-accuracy-tests.sh +++ 
b/scripts/run-accuracy-tests.sh @@ -18,7 +18,7 @@ export MDB_ACCURACY_RUN_ID=$(npx uuid v4) # npm run test:accuracy -- tests/accuracy/some-test.test.ts TEST_PATH_PATTERN="${1:-tests/accuracy}" shift || true -node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPattern "$TEST_PATH_PATTERN" "$@" +node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPatterns "$TEST_PATH_PATTERN" "$@" # Preserving the exit code from test run to correctly notify in the CI # environments when the tests fail. From 865dbfe2667c7b6456fbaf88490b6b4f6985124b Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Thu, 10 Jul 2025 22:53:50 +0200 Subject: [PATCH 43/47] chore: update report generation to include comparison with baseline as well --- .github/workflows/accuracy-tests.yml | 1 + resources/test-summary-template.html | 104 ++++++++++-- scripts/generate-test-summary.ts | 158 ++++++++++++++++-- .../mdb-snapshot-storage.ts | 2 +- 4 files changed, 234 insertions(+), 31 deletions(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 955b792a..640fdd1a 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -23,6 +23,7 @@ jobs: MDB_ACCURACY_MDB_URL: ${{ secrets.MDB_ACCURACY_MDB_URL }} MDB_ACCURACY_MDB_DB: ${{ secrets.MDB_ACCURACY_MDB_DB }} MDB_ACCURACY_MDB_COLLECTION: ${{ secrets.MDB_ACCURACY_MDB_COLLECTION }} + MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} steps: - uses: GitHubSecurityLab/actions-permissions/monitor@v1 - uses: actions/checkout@v4 diff --git a/resources/test-summary-template.html b/resources/test-summary-template.html index 318e9550..903457f8 100644 --- a/resources/test-summary-template.html +++ b/resources/test-summary-template.html @@ -31,16 +31,30 @@ background: #f8f9fa; padding: 20px; border-radius: 6px; - margin-bottom: 30px; + margin-bottom: 20px; border-left: 4px solid #00684a; } + .header-info:nth-child(3) { 
+ border-left-color: #007bff; + } + .header-info:nth-child(4) { + border-left-color: #28a745; + } .header-info h2 { margin-top: 0; + margin-bottom: 15px; color: #00684a; + font-size: 1.2em; + } + .header-info:nth-child(3) h2 { + color: #007bff; + } + .header-info:nth-child(4) h2 { + color: #28a745; } .info-grid { display: grid; - grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-top: 15px; } @@ -158,22 +172,22 @@ .accuracy-perfect { background-color: #d4edda; color: #155724; - padding: 4px 8px; - border-radius: 4px; + padding: 2px 6px; + border-radius: 3px; font-weight: bold; } .accuracy-good { background-color: #fff3cd; color: #856404; - padding: 4px 8px; - border-radius: 4px; + padding: 2px 6px; + border-radius: 3px; font-weight: bold; } .accuracy-poor { background-color: #f8d7da; color: #721c24; - padding: 4px 8px; - border-radius: 4px; + padding: 2px 6px; + border-radius: 3px; font-weight: bold; } .tool-call { @@ -215,6 +229,29 @@ min-width: 80px; text-align: center; } + .baseline-accuracy-cell { + width: 8%; + min-width: 80px; + text-align: center; + } + .accuracy-comparison { + background: #e9ecef; + padding: 2px 6px; + border-radius: 3px; + font-weight: bold; + } + .accuracy-improved { + background: #d4edda; + color: #155724; + } + .accuracy-regressed { + background: #f8d7da; + color: #721c24; + } + .accuracy-same { + background: #e2e3e5; + color: #495057; + } .response-time-cell { width: 10%; min-width: 100px; @@ -264,28 +301,30 @@

šŸ“Š MongoDB MCP Server - Accuracy Test Summary

-

Run Information & Summary

+

šŸ“Š Current Run Information

Accuracy Run ID
{{accuracyRunId}}
-
-
Accuracy Run Status
-
{{runStatusUpper}}
-
Commit SHA
{{commitSHA}}
-
Report Generated On
-
{{reportGeneratedOn}}
+
Run Created On
+
{{createdOn}}
-
Snapshots Captured On
-
{{createdOn}}
+
Report Generated On
+
{{reportGeneratedOn}}
+
+
+ +
+

šŸ“ˆ Test Results Summary

+
Total Prompts Evaluated
{{totalTests}}
@@ -298,6 +337,36 @@

Run Information & Summary

Evals with 0% Accuracy
{{testsWithZeroAccuracy}}
+
+
Average Accuracy
+
{{averageAccuracy}}
+
+
+
+ +
+

šŸ”„ Baseline Comparison

+
+
+
Baseline Accuracy Run ID
+
{{baselineAccuracyRunId}}
+
+
+
Baseline Commit SHA
+
{{baselineCommitSHA}}
+
+
+
Baseline Run Created On
+
{{baselineCreatedOn}}
+
+
+
Evals Improved vs Baseline
+
{{evalsImproved}}
+
+
+
Evals Regressed vs Baseline
+
{{evalsRegressed}}
+
@@ -308,6 +377,7 @@

Run Information & Summary

+ diff --git a/scripts/generate-test-summary.ts b/scripts/generate-test-summary.ts index efeacbdc..fba40610 100644 --- a/scripts/generate-test-summary.ts +++ b/scripts/generate-test-summary.ts @@ -7,6 +7,15 @@ import type { LLMToolCall, } from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; +interface BaselineComparison { + baselineAccuracy?: number; + comparisonResult?: "improved" | "regressed" | "same"; +} + +interface SnapshotEntryWithBaseline extends AccuracySnapshotEntry { + baseline?: BaselineComparison; +} + function populateTemplate(template: string, data: Record): string { return template.replace(/\{\{(\w+)\}\}/g, (_, key: string) => data[key] ?? ""); } @@ -47,11 +56,91 @@ function formatMessages(messages: Array>): string { return messages.map((msg) => JSON.stringify(msg, null, 2)).join("\n\n"); } -async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accuracyRunId: string): Promise { +function formatBaselineAccuracy(snapshot: SnapshotEntryWithBaseline): string { + if (!snapshot.baseline || snapshot.baseline.baselineAccuracy === undefined) { + return 'N/A'; + } + + const baselineAccuracyText = formatAccuracy(snapshot.baseline.baselineAccuracy); + let comparisonClass = "accuracy-comparison"; + let comparisonIcon = ""; + + if (snapshot.baseline.comparisonResult) { + switch (snapshot.baseline.comparisonResult) { + case "improved": + comparisonClass += " accuracy-improved"; + comparisonIcon = " ↗"; + break; + case "regressed": + comparisonClass += " accuracy-regressed"; + comparisonIcon = " ā†˜"; + break; + case "same": + comparisonClass += " accuracy-same"; + comparisonIcon = " →"; + break; + } + } + + return `${baselineAccuracyText}${comparisonIcon}`; +} + +function compareSnapshotEntries( + currentSnapshotEntries: AccuracySnapshotEntry[], + baselineSnapshotEntries: AccuracySnapshotEntry[] +): SnapshotEntryWithBaseline[] { + const baselineMap = new Map(); + baselineSnapshotEntries.forEach((entry) => { + const key 
= `${entry.provider}|${entry.requestedModel}|${entry.prompt}`; + baselineMap.set(key, entry); + }); + + return currentSnapshotEntries.map((entry) => { + const key = `${entry.provider}|${entry.requestedModel}|${entry.prompt}`; + const baselineEntry = baselineMap.get(key); + + if (!baselineEntry) { + return entry; + } + + let comparisonResult: "improved" | "regressed" | "same"; + if (entry.toolCallingAccuracy > baselineEntry.toolCallingAccuracy) { + comparisonResult = "improved"; + } else if (entry.toolCallingAccuracy < baselineEntry.toolCallingAccuracy) { + comparisonResult = "regressed"; + } else { + comparisonResult = "same"; + } + + return { + ...entry, + baseline: { + baselineAccuracy: baselineEntry.toolCallingAccuracy, + comparisonResult, + }, + }; + }); +} + +async function generateHtmlReport( + snapshotEntries: SnapshotEntryWithBaseline[], + accuracyRunId: string, + baselineInfo?: { + commitSHA: string; + accuracyRunId: string; + createdOn: string; + } +): Promise { const totalPrompts = snapshotEntries.length; const modelsCount = new Set(snapshotEntries.map((s) => `${s.provider} ${s.requestedModel}`)).size; const testsWithZeroAccuracy = snapshotEntries.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0); + const totalAccuracy = snapshotEntries.reduce((sum, entry) => sum + entry.toolCallingAccuracy, 0); + const averageAccuracy = totalPrompts > 0 ? totalAccuracy / totalPrompts : 0; + + const evalsImproved = snapshotEntries.filter((s) => s.baseline?.comparisonResult === "improved").length; + const evalsRegressed = snapshotEntries.filter((s) => s.baseline?.comparisonResult === "regressed").length; + const firstSnapshotEntry = snapshotEntries[0]; const runStatus = firstSnapshotEntry?.accuracyRunStatus || "unknown"; const commitSHA = firstSnapshotEntry?.commitSHA || "unknown"; @@ -76,11 +165,12 @@ async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accu ${formatAccuracy(snapshotEntry.toolCallingAccuracy)} + -
Expected Tool Calls LLM Tool Calls AccuracyBaseline Accuracy LLM Response Time (ms) Total Tokens Used
${formatBaselineAccuracy(snapshotEntry)} ${snapshotEntry.llmResponseTime.toFixed(2)} ${formatTokenUsage(snapshotEntry.tokensUsage || {})}
+

šŸ¤– LLM Response

@@ -97,9 +187,7 @@ async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accu ) .join(""); - // Read template file const template = await readFile(HTML_TESTS_SUMMARY_TEMPLATE, "utf8"); - // Fill template return populateTemplate(template, { accuracyRunId, runStatus, @@ -110,6 +198,12 @@ async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accu totalTests: String(totalPrompts), modelsCount: String(modelsCount), testsWithZeroAccuracy: String(testsWithZeroAccuracy.length), + averageAccuracy: formatAccuracy(averageAccuracy), + baselineCommitSHA: baselineInfo?.commitSHA || "N/A", + baselineAccuracyRunId: baselineInfo?.accuracyRunId || "N/A", + baselineCreatedOn: baselineInfo?.createdOn || "N/A", + evalsImproved: String(evalsImproved), + evalsRegressed: String(evalsRegressed), tableRows, }); } @@ -117,36 +211,74 @@ async function generateHtmlReport(snapshotEntries: AccuracySnapshotEntry[], accu async function generateTestSummary(): Promise { try { const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID; + const baselineCommitSHA = process.env.MDB_ACCURACY_BASELINE_COMMIT; + if (!accuracyRunId) { throw new Error("Cannot generate test summary, accuracy run id is unknown"); } console.log(`\nšŸ“Š Generating test summary for accuracy run: ${accuracyRunId}\n`); const storage = await getAccuracySnapshotStorage(); - const snapshot = await storage.getSnapshotForAccuracyRun(accuracyRunId); - await storage.close(); + const currentSnapshot = await storage.getSnapshotForAccuracyRun(accuracyRunId); - if (snapshot.length === 0) { - console.log("No snapshots found for the current run."); + if (currentSnapshot.length === 0) { + console.log("No snapshot entries found for the current run."); + await storage.close(); return; } - const htmlReport = await generateHtmlReport(snapshot, accuracyRunId); + let snapshotWithBaseline: SnapshotEntryWithBaseline[] = currentSnapshot; + let baselineInfo: { commitSHA: string; accuracyRunId: string; createdOn: 
string } | undefined; + + if (baselineCommitSHA) { + console.log(`šŸ” Fetching baseline snapshot entries for commit: ${baselineCommitSHA}`); + const baselineSnapshot = await storage.getLatestSnapshotForCommit(baselineCommitSHA); + + if (baselineSnapshot.length > 0) { + console.log(`āœ… Found ${baselineSnapshot.length} baseline snapshot entries.`); + snapshotWithBaseline = compareSnapshotEntries(currentSnapshot, baselineSnapshot); + + const firstBaselineSnapshot = baselineSnapshot[0]; + if (firstBaselineSnapshot) { + baselineInfo = { + commitSHA: firstBaselineSnapshot.commitSHA, + accuracyRunId: firstBaselineSnapshot.accuracyRunId, + createdOn: firstBaselineSnapshot.createdOn + ? new Date(firstBaselineSnapshot.createdOn).toLocaleString() + : "unknown", + }; + } + } else { + console.log(`āš ļø No baseline snapshots found for commit: ${baselineCommitSHA}`); + } + } + + const htmlReport = await generateHtmlReport(snapshotWithBaseline, accuracyRunId, baselineInfo); + await storage.close(); const reportPath = HTML_TESTS_SUMMARY_FILE; await writeFile(reportPath, htmlReport, "utf8"); console.log(`āœ… HTML report generated: ${reportPath}`); - const totalPrompts = snapshot.length; - const modelsCount = new Set(snapshot.map((s) => `${s.provider} ${s.requestedModel}`)).size; - const testsWithZeroAccuracy = snapshot.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0); + const totalPrompts = snapshotWithBaseline.length; + const modelsCount = new Set(snapshotWithBaseline.map((s) => `${s.provider} ${s.requestedModel}`)).size; + const testsWithZeroAccuracy = snapshotWithBaseline.filter( + (snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0 + ); + const evalsImproved = snapshotWithBaseline.filter((s) => s.baseline?.comparisonResult === "improved").length; + const evalsRegressed = snapshotWithBaseline.filter((s) => s.baseline?.comparisonResult === "regressed").length; console.log(`\nšŸ“ˆ Summary:`); console.log(` Total prompts evaluated: ${totalPrompts}`); 
console.log(` Models tested: ${modelsCount}`); console.log(` Evals with 0% accuracy: ${testsWithZeroAccuracy.length}`); - console.log(` Report saved to: ${reportPath}\n`); + + if (baselineCommitSHA) { + console.log(` Baseline commit: ${baselineCommitSHA}`); + console.log(` Evals improved vs baseline: ${evalsImproved}`); + console.log(` Evals regressed vs baseline: ${evalsRegressed}`); + } } catch (error) { console.error("Error generating test summary:", error); process.exit(1); diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts index d3b1b56a..960daffc 100644 --- a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts +++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts @@ -61,7 +61,7 @@ export class MongoDBSnapshotStorage implements AccuracySnapshotStorage { private async getLatestAccuracyRunForCommit(commit: string): Promise { const document = await this.snapshotCollection.findOne( - { commit: commit, accuracyRunStatus: AccuracyRunStatus.Done }, + { commitSHA: commit, accuracyRunStatus: AccuracyRunStatus.Done }, { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } } ); From 055628d30960523278f6641573fe7a82516a8661 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 11 Jul 2025 15:05:59 +0200 Subject: [PATCH 44/47] Update .github/workflows/accuracy-tests.yml Co-authored-by: Nikola Irinchev --- .github/workflows/accuracy-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 640fdd1a..f60f416f 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -16,7 +16,7 @@ jobs: github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests') env: - MDB_OPEN_AI_API_KEY: ${{ secrets.MDB_OPEN_AI_API_KEY }} + 
MDB_OPEN_AI_API_KEY: ${{ secrets.ACCURACY_OPEN_AI_API_KEY }} MDB_GEMINI_API_KEY: ${{ secrets.MDB_GEMINI_API_KEY }} MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }} MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} From 1933f1f66c9bf31f2dd5677b49a3e75443923dfd Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 11 Jul 2025 15:06:10 +0200 Subject: [PATCH 45/47] Update .github/workflows/accuracy-tests.yml Co-authored-by: Nikola Irinchev --- .github/workflows/accuracy-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index f60f416f..b7b296f0 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -22,7 +22,7 @@ jobs: MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} MDB_ACCURACY_MDB_URL: ${{ secrets.MDB_ACCURACY_MDB_URL }} MDB_ACCURACY_MDB_DB: ${{ secrets.MDB_ACCURACY_MDB_DB }} - MDB_ACCURACY_MDB_COLLECTION: ${{ secrets.MDB_ACCURACY_MDB_COLLECTION }} + MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }} MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} steps: - uses: GitHubSecurityLab/actions-permissions/monitor@v1 From 5c97ca8aeb709078d4a47b8bed32bc432996ac4f Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 11 Jul 2025 15:06:17 +0200 Subject: [PATCH 46/47] Update .github/workflows/accuracy-tests.yml Co-authored-by: Nikola Irinchev --- .github/workflows/accuracy-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index b7b296f0..6f99eab7 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -21,7 +21,7 @@ jobs: MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }} MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} MDB_ACCURACY_MDB_URL: ${{ 
secrets.MDB_ACCURACY_MDB_URL }} - MDB_ACCURACY_MDB_DB: ${{ secrets.MDB_ACCURACY_MDB_DB }} + MDB_ACCURACY_MDB_DB: ${{ vars.ACCURACY_MDB_DB }} MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }} MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} steps: From f66601400d3ab7c767b3341ed1d31b63d333fc56 Mon Sep 17 00:00:00 2001 From: Himanshu Singh Date: Fri, 11 Jul 2025 15:06:35 +0200 Subject: [PATCH 47/47] Update .github/workflows/accuracy-tests.yml Co-authored-by: Nikola Irinchev --- .github/workflows/accuracy-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml index 6f99eab7..75dac32c 100644 --- a/.github/workflows/accuracy-tests.yml +++ b/.github/workflows/accuracy-tests.yml @@ -20,7 +20,7 @@ jobs: MDB_GEMINI_API_KEY: ${{ secrets.MDB_GEMINI_API_KEY }} MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }} MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} - MDB_ACCURACY_MDB_URL: ${{ secrets.MDB_ACCURACY_MDB_URL }} + MDB_ACCURACY_MDB_URL: ${{ secrets.ACCURACY_MDB_CONNECTION_STRING }} MDB_ACCURACY_MDB_DB: ${{ vars.ACCURACY_MDB_DB }} MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }} MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }}