diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml new file mode 100644 index 00000000..75dac32c --- /dev/null +++ b/.github/workflows/accuracy-tests.yml @@ -0,0 +1,48 @@ +name: Accuracy Tests + +on: + workflow_dispatch: + pull_request: + types: [labeled] + +jobs: + run-accuracy-tests: + name: Run Accuracy Tests + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests') + env: + MDB_OPEN_AI_API_KEY: ${{ secrets.ACCURACY_OPEN_AI_API_KEY }} + MDB_GEMINI_API_KEY: ${{ secrets.MDB_GEMINI_API_KEY }} + MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.MDB_AZURE_OPEN_AI_API_KEY }} + MDB_AZURE_OPEN_AI_API_URL: ${{ secrets.MDB_AZURE_OPEN_AI_API_URL }} + MDB_ACCURACY_MDB_URL: ${{ secrets.ACCURACY_MDB_CONNECTION_STRING }} + MDB_ACCURACY_MDB_DB: ${{ vars.ACCURACY_MDB_DB }} + MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }} + MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} + steps: + - uses: GitHubSecurityLab/actions-permissions/monitor@v1 + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version-file: package.json + cache: "npm" + - name: Install dependencies + run: npm ci + - name: Run accuracy tests + run: ./scripts/run-accuracy-tests.sh + - name: Upload accuracy test summary + if: always() + uses: actions/upload-artifact@v4 + with: + name: accuracy-test-summary + path: .accuracy/tests-summary.html + - name: Comment summary on PR + if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests' + uses: marocchino/sticky-pull-request-comment@v2 + with: + path: .accuracy/tests-summary.html diff --git a/.gitignore b/.gitignore index 4e3f7a54..49550e27 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ state.json tests/tmp coverage +# Generated assets by accuracy runs +.accuracy diff --git 
a/package-lock.json b/package-lock.json index 29132ba3..a3bf47c4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -30,7 +30,11 @@ "mongodb-mcp-server": "dist/index.js" }, "devDependencies": { + "@ai-sdk/anthropic": "^1.2.12", + "@ai-sdk/azure": "^1.3.23", + "@ai-sdk/openai": "^1.3.22", "@eslint/js": "^9.30.1", + "@himanshusinghs/google": "^1.2.11", "@jest/globals": "^30.0.4", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", @@ -38,6 +42,7 @@ "@types/node": "^24.0.12", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", + "ai": "^4.3.16", "eslint": "^9.30.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-jest": "^29.0.1", @@ -46,20 +51,153 @@ "jest": "^30.0.4", "jest-environment-node": "^30.0.4", "jest-extended": "^6.0.0", + "microdiff": "^1.5.0", "mongodb-runner": "^5.9.2", + "ollama-ai-provider": "^1.2.0", "openapi-types": "^12.1.3", "openapi-typescript": "^7.8.0", "prettier": "^3.6.2", + "simple-git": "^3.28.0", "ts-jest": "^29.4.0", "tsx": "^4.20.3", "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", + "uuid": "^11.1.0", "yaml": "^2.8.0" }, "engines": { "node": ">=20.10.0" } }, + "@himanshusinghs/ai-sdk-google": { + "extraneous": true + }, + "node_modules/@ai-sdk/anthropic": { + "version": "1.2.12", + "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-1.2.12.tgz", + "integrity": "sha512-YSzjlko7JvuiyQFmI9RN1tNZdEiZxc+6xld/0tq/VkJaHpEzGAb1yiNxxvmYVcjvfu/PcvCxAAYXmTYQQ63IHQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/azure": { + "version": "1.3.23", + "resolved": "https://registry.npmjs.org/@ai-sdk/azure/-/azure-1.3.23.tgz", + "integrity": "sha512-vpsaPtU24RBVk/IMM5UylR/N4RtAuL2NZLWc7LJ3tvMTHu6pI46a7w+1qIwR3F6yO9ehWR8qvfLaBefJNFxaVw==", + "dev": true, + "license": "Apache-2.0", + 
"dependencies": { + "@ai-sdk/openai": "1.3.22", + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/openai": { + "version": "1.3.22", + "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-1.3.22.tgz", + "integrity": "sha512-QwA+2EkG0QyjVR+7h6FE7iOu2ivNqAVMm9UJZkVxxTk5OIq5fFJDTEI/zICEMuHImTTXR2JjsL6EirJ28Jc4cw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/provider": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.3.tgz", + "integrity": "sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/provider-utils": { + "version": "2.2.8", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.8.tgz", + "integrity": "sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, + "node_modules/@ai-sdk/react": { + "version": "1.2.12", + "resolved": "https://registry.npmjs.org/@ai-sdk/react/-/react-1.2.12.tgz", + "integrity": "sha512-jK1IZZ22evPZoQW3vlkZ7wvjYGYF+tRBKXtrcolduIkQ/m/sOAVcVeVDUDvh1T91xCnWCdUGCPZg2avZ90mv3g==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider-utils": "2.2.8", + "@ai-sdk/ui-utils": "1.2.11", + "swr": "^2.2.5", + "throttleit": 
"2.1.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "react": "^18 || ^19 || ^19.0.0-rc", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, + "node_modules/@ai-sdk/ui-utils": { + "version": "1.2.11", + "resolved": "https://registry.npmjs.org/@ai-sdk/ui-utils/-/ui-utils-1.2.11.tgz", + "integrity": "sha512-3zcwCc8ezzFlwp3ZD15wAPjf2Au4s3vAbKsXQVyhxODHcmu0iyPO2Eua6D/vicq/AUm/BAo60r97O6HU+EI0+w==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8", + "zod-to-json-schema": "^3.24.1" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@ampproject/remapping": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz", @@ -2090,6 +2228,54 @@ "@hapi/hoek": "^11.0.2" } }, + "node_modules/@himanshusinghs/google": { + "version": "1.2.11", + "resolved": "https://registry.npmjs.org/@himanshusinghs/google/-/google-1.2.11.tgz", + "integrity": "sha512-SKTFxwN9PpUHVrppFod8sF1jqys5azzsgcBVrSbc7VaazmVEnBxHQlv5/yfeZFjD3ly5Mw+AJdFfC0bxwdWBNg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.2", + "@ai-sdk/provider-utils": "2.2.6" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@himanshusinghs/google/node_modules/@ai-sdk/provider": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.2.tgz", + "integrity": "sha512-ITdgNilJZwLKR7X5TnUr1BsQW6UTX5yFp0h66Nfx8XjBYkWD9W3yugr50GOz3CnE9m/U/Cd5OyEbTMI0rgi6ZQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@himanshusinghs/google/node_modules/@ai-sdk/provider-utils": { + "version": "2.2.6", + "resolved": 
"https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.6.tgz", + "integrity": "sha512-sUlZ7Gnq84DCGWMQRIK8XVbkzIBnvPR1diV4v6JwPgpn5armnLI/j+rqn62MpLrU5ZCQZlDKl/Lw6ed3ulYqaA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.2", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@humanfs/core": { "version": "0.19.1", "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz", @@ -2929,6 +3115,23 @@ "jsep": "^0.4.0||^1.0.0" } }, + "node_modules/@kwsites/file-exists": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@kwsites/file-exists/-/file-exists-1.1.1.tgz", + "integrity": "sha512-m9/5YGR18lIwxSFDwfE3oA7bWuq9kdau6ugN4H2rJeyhFQZcG9AgSHkQtSD15a8WvTgfz9aikZMrKPHvbpqFiw==", + "dev": true, + "license": "MIT", + "dependencies": { + "debug": "^4.1.1" + } + }, + "node_modules/@kwsites/promise-deferred": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@kwsites/promise-deferred/-/promise-deferred-1.1.1.tgz", + "integrity": "sha512-GaHYm+c0O9MjZRu0ongGBRbinu8gVAMd2UZjji6jVmqKtZluZnptXGWhz1E8j8D2HJ3f/yMxKAUC0b+57wncIw==", + "dev": true, + "license": "MIT" + }, "node_modules/@modelcontextprotocol/inspector": { "version": "0.16.0", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/inspector/-/inspector-0.16.0.tgz", @@ -5424,6 +5627,19 @@ "node": ">=18.0.0" } }, + "node_modules/@smithy/middleware-retry/node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "license": "MIT", + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@smithy/middleware-serde": { "version": "4.0.3", 
"resolved": "https://registry.npmjs.org/@smithy/middleware-serde/-/middleware-serde-4.0.3.tgz", @@ -5906,6 +6122,13 @@ "@babel/types": "^7.20.7" } }, + "node_modules/@types/diff-match-patch": { + "version": "1.0.36", + "resolved": "https://registry.npmjs.org/@types/diff-match-patch/-/diff-match-patch-1.0.36.tgz", + "integrity": "sha512-xFdR6tkm0MWvBfO8xXCSsinYxHcqkQUlcHeSpMC2ukzOb6lwQAfDmW+Qt0AvlGd8HpsS28qKsB+oPeJn9I39jg==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/estree": { "version": "1.0.7", "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.7.tgz", @@ -6660,6 +6883,33 @@ "node": ">= 14" } }, + "node_modules/ai": { + "version": "4.3.16", + "resolved": "https://registry.npmjs.org/ai/-/ai-4.3.16.tgz", + "integrity": "sha512-KUDwlThJ5tr2Vw0A1ZkbDKNME3wzWhuVfAOwIvFUzl1TPVDFAXDFTXio3p+jaKneB+dKNCvFFlolYmmgHttG1g==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8", + "@ai-sdk/react": "1.2.12", + "@ai-sdk/ui-utils": "1.2.11", + "@opentelemetry/api": "1.9.0", + "jsondiffpatch": "0.6.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "react": "^18 || ^19 || ^19.0.0-rc", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "react": { + "optional": true + } + } + }, "node_modules/ajv": { "version": "6.12.6", "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", @@ -8376,6 +8626,16 @@ "node": ">= 0.8" } }, + "node_modules/dequal": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", + "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/destroy": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz", @@ -8423,6 +8683,13 @@ "node": ">=0.3.1" } }, + "node_modules/diff-match-patch": { + "version": "1.0.5", + 
"resolved": "https://registry.npmjs.org/diff-match-patch/-/diff-match-patch-1.0.5.tgz", + "integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==", + "dev": true, + "license": "Apache-2.0" + }, "node_modules/diff-sequences": { "version": "29.6.3", "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-29.6.3.tgz", @@ -11803,6 +12070,13 @@ "foreach": "^2.0.4" } }, + "node_modules/json-schema": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz", + "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==", + "dev": true, + "license": "(AFL-2.1 OR BSD-3-Clause)" + }, "node_modules/json-schema-traverse": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", @@ -11830,6 +12104,37 @@ "node": ">=6" } }, + "node_modules/jsondiffpatch": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/jsondiffpatch/-/jsondiffpatch-0.6.0.tgz", + "integrity": "sha512-3QItJOXp2AP1uv7waBkao5nCvhEv+QmJAd38Ybq7wNI74Q+BBmnLn4EDKz6yI9xGAIQoUF87qHt+kc1IVxB4zQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/diff-match-patch": "^1.0.36", + "chalk": "^5.3.0", + "diff-match-patch": "^1.0.5" + }, + "bin": { + "jsondiffpatch": "bin/jsondiffpatch.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + } + }, + "node_modules/jsondiffpatch/node_modules/chalk": { + "version": "5.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.4.1.tgz", + "integrity": "sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^12.17.0 || ^14.13 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, "node_modules/jsonpath-plus": { "version": "10.3.0", "resolved": 
"https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz", @@ -12132,6 +12437,13 @@ "node": ">= 0.6" } }, + "node_modules/microdiff": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/microdiff/-/microdiff-1.5.0.tgz", + "integrity": "sha512-Drq+/THMvDdzRYrK0oxJmOKiC24ayUV8ahrt8l3oRK51PWt6gdtrIGrlIH3pT/lFh1z93FbAcidtsHcWbnRz8Q==", + "dev": true, + "license": "MIT" + }, "node_modules/micromatch": { "version": "4.0.8", "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", @@ -13011,6 +13323,29 @@ "node": "^10.13.0 || >=12.0.0" } }, + "node_modules/ollama-ai-provider": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/ollama-ai-provider/-/ollama-ai-provider-1.2.0.tgz", + "integrity": "sha512-jTNFruwe3O/ruJeppI/quoOUxG7NA6blG3ZyQj3lei4+NnJo7bi3eIRWqlVpRlu/mbzbFXeJSBuYQWF6pzGKww==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "^1.0.0", + "@ai-sdk/provider-utils": "^2.0.0", + "partial-json": "0.1.7" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, "node_modules/on-finished": { "version": "2.4.1", "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz", @@ -13388,6 +13723,13 @@ "node": ">= 0.8" } }, + "node_modules/partial-json": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/partial-json/-/partial-json-0.1.7.tgz", + "integrity": "sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==", + "dev": true, + "license": "MIT" + }, "node_modules/path-browserify": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-1.0.1.tgz", @@ -14402,6 +14744,13 @@ "loose-envify": "^1.1.0" } }, + "node_modules/secure-json-parse": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz", + "integrity": 
"sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==", + "dev": true, + "license": "BSD-3-Clause" + }, "node_modules/seek-bzip": { "version": "1.0.6", "resolved": "https://registry.npmjs.org/seek-bzip/-/seek-bzip-1.0.6.tgz", @@ -14830,6 +15179,22 @@ "simple-concat": "^1.0.0" } }, + "node_modules/simple-git": { + "version": "3.28.0", + "resolved": "https://registry.npmjs.org/simple-git/-/simple-git-3.28.0.tgz", + "integrity": "sha512-Rs/vQRwsn1ILH1oBUy8NucJlXmnnLeLCfcvbSehkPzbv3wwoFWIdtfd6Ndo6ZPhlPsCZ60CPI4rxurnwAa+a2w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@kwsites/file-exists": "^1.1.1", + "@kwsites/promise-deferred": "^1.1.1", + "debug": "^4.4.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/steveukx/git-js?sponsor=1" + } + }, "node_modules/simple-oauth2": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/simple-oauth2/-/simple-oauth2-5.1.0.tgz", @@ -15351,6 +15716,20 @@ "node": ">= 6" } }, + "node_modules/swr": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/swr/-/swr-2.3.3.tgz", + "integrity": "sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A==", + "dev": true, + "license": "MIT", + "dependencies": { + "dequal": "^2.0.3", + "use-sync-external-store": "^1.4.0" + }, + "peerDependencies": { + "react": "^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, "node_modules/synckit": { "version": "0.11.8", "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.8.tgz", @@ -15571,6 +15950,19 @@ "node": "*" } }, + "node_modules/throttleit": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/throttleit/-/throttleit-2.1.0.tgz", + "integrity": "sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, 
"node_modules/through": { "version": "2.3.8", "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", @@ -16163,16 +16555,17 @@ } }, "node_modules/uuid": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", - "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-11.1.0.tgz", + "integrity": "sha512-0/A9rDy9P7cJ+8w1c9WD9V//9Wj15Ce2MPz8Ri6032usz+NfePxx5AcN3bN+r6ZL6jEo066/yNYB3tn4pQEx+A==", + "dev": true, "funding": [ "https://github.com/sponsors/broofa", "https://github.com/sponsors/ctavan" ], "license": "MIT", "bin": { - "uuid": "dist/bin/uuid" + "uuid": "dist/esm/bin/uuid" } }, "node_modules/v8-compile-cache-lib": { diff --git a/package.json b/package.json index 53d6d2c6..53639aec 100644 --- a/package.json +++ b/package.json @@ -29,11 +29,17 @@ "check:types": "tsc --noEmit --project tsconfig.json", "reformat": "prettier --write .", "generate": "./scripts/generate.sh", - "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage" + "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --coverage --testPathIgnorePatterns=/tests/accuracy/", + "pre:test:accuracy": "npm run build:compile", + "test:accuracy": "sh ./scripts/run-accuracy-tests.sh" }, "license": "Apache-2.0", "devDependencies": { + "@ai-sdk/anthropic": "^1.2.12", + "@ai-sdk/azure": "^1.3.23", + "@ai-sdk/openai": "^1.3.22", "@eslint/js": "^9.30.1", + "@himanshusinghs/google": "^1.2.11", "@jest/globals": "^30.0.4", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", @@ -41,6 +47,7 @@ "@types/node": "^24.0.12", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", + "ai": "^4.3.16", "eslint": "^9.30.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-jest": "^29.0.1", @@ -49,14 +56,18 @@ "jest": "^30.0.4", "jest-environment-node": "^30.0.4", 
"jest-extended": "^6.0.0", + "microdiff": "^1.5.0", "mongodb-runner": "^5.9.2", + "ollama-ai-provider": "^1.2.0", "openapi-types": "^12.1.3", "openapi-typescript": "^7.8.0", "prettier": "^3.6.2", + "simple-git": "^3.28.0", "ts-jest": "^29.4.0", "tsx": "^4.20.3", "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", + "uuid": "^11.1.0", "yaml": "^2.8.0" }, "dependencies": { diff --git a/resources/test-summary-template.html b/resources/test-summary-template.html new file mode 100644 index 00000000..903457f8 --- /dev/null +++ b/resources/test-summary-template.html @@ -0,0 +1,407 @@ + + + + + + MongoDB MCP Server - Accuracy Test Summary + + + +
+

📊 MongoDB MCP Server - Accuracy Test Summary

+
+

📊 Current Run Information

+
+
+
Accuracy Run ID
+
{{accuracyRunId}}
+
+
+
Commit SHA
+
{{commitSHA}}
+
+
+
Run Created On
+
{{createdOn}}
+
+
+
Report Generated On
+
{{reportGeneratedOn}}
+
+
+
+ +
+

📈 Test Results Summary

+
+
+
Total Prompts Evaluated
+
{{totalTests}}
+
+
+
Models Tested
+
{{modelsCount}}
+
+
+
Evals with 0% Accuracy
+
{{testsWithZeroAccuracy}}
+
+
+
Average Accuracy
+
{{averageAccuracy}}
+
+
+
+ +
+

🔄 Baseline Comparison

+
+
+
Baseline Accuracy Run ID
+
{{baselineAccuracyRunId}}
+
+
+
Baseline Commit SHA
+
{{baselineCommitSHA}}
+
+
+
Baseline Run Created On
+
{{baselineCreatedOn}}
+
+
+
Evals Improved vs Baseline
+
{{evalsImproved}}
+
+
+
Evals Regressed vs Baseline
+
{{evalsRegressed}}
+
+
+
+ + + + + + + + + + + + + + + {{tableRows}} + +
Prompt | Model | Expected Tool Calls | LLM Tool Calls | Accuracy | Baseline Accuracy | LLM Response Time (ms) | Total Tokens Used
+
+ + + diff --git a/scripts/generate-test-summary.ts b/scripts/generate-test-summary.ts new file mode 100644 index 00000000..fba40610 --- /dev/null +++ b/scripts/generate-test-summary.ts @@ -0,0 +1,288 @@ +import { readFile, writeFile } from "fs/promises"; +import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js"; +import { HTML_TESTS_SUMMARY_FILE, HTML_TESTS_SUMMARY_TEMPLATE } from "../tests/accuracy/sdk/constants.js"; +import type { + AccuracySnapshotEntry, + ExpectedToolCall, + LLMToolCall, +} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; + +interface BaselineComparison { + baselineAccuracy?: number; + comparisonResult?: "improved" | "regressed" | "same"; +} + +interface SnapshotEntryWithBaseline extends AccuracySnapshotEntry { + baseline?: BaselineComparison; +} + +function populateTemplate(template: string, data: Record): string { + return template.replace(/\{\{(\w+)\}\}/g, (_, key: string) => data[key] ?? 
""); +} + +function formatAccuracy(accuracy: number): string { + return (accuracy * 100).toFixed(1) + "%"; +} + +function getAccuracyClass(accuracy: number): string { + if (accuracy === 1) return "accuracy-perfect"; + if (accuracy >= 0.75) return "accuracy-good"; + return "accuracy-poor"; +} + +function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[]): string { + return toolCalls + .map((call) => { + const params = JSON.stringify(call.parameters, null, 2); + return `${call.toolName}`; + }) + .join(", "); +} + +function formatTokenUsage(tokensUsage: { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; +}): string { + const total = tokensUsage.totalTokens || 0; + const prompt = tokensUsage.promptTokens || 0; + const completion = tokensUsage.completionTokens || 0; + + const tooltip = `Prompt: ${prompt}\nCompletion: ${completion}\nTotal: ${total}`; + return `${total}`; +} + +function formatMessages(messages: Array>): string { + return messages.map((msg) => JSON.stringify(msg, null, 2)).join("\n\n"); +} + +function formatBaselineAccuracy(snapshot: SnapshotEntryWithBaseline): string { + if (!snapshot.baseline || snapshot.baseline.baselineAccuracy === undefined) { + return 'N/A'; + } + + const baselineAccuracyText = formatAccuracy(snapshot.baseline.baselineAccuracy); + let comparisonClass = "accuracy-comparison"; + let comparisonIcon = ""; + + if (snapshot.baseline.comparisonResult) { + switch (snapshot.baseline.comparisonResult) { + case "improved": + comparisonClass += " accuracy-improved"; + comparisonIcon = " ↗"; + break; + case "regressed": + comparisonClass += " accuracy-regressed"; + comparisonIcon = " ā†˜"; + break; + case "same": + comparisonClass += " accuracy-same"; + comparisonIcon = " →"; + break; + } + } + + return `${baselineAccuracyText}${comparisonIcon}`; +} + +function compareSnapshotEntries( + currentSnapshotEntries: AccuracySnapshotEntry[], + baselineSnapshotEntries: AccuracySnapshotEntry[] +): 
SnapshotEntryWithBaseline[] { + const baselineMap = new Map(); + baselineSnapshotEntries.forEach((entry) => { + const key = `${entry.provider}|${entry.requestedModel}|${entry.prompt}`; + baselineMap.set(key, entry); + }); + + return currentSnapshotEntries.map((entry) => { + const key = `${entry.provider}|${entry.requestedModel}|${entry.prompt}`; + const baselineEntry = baselineMap.get(key); + + if (!baselineEntry) { + return entry; + } + + let comparisonResult: "improved" | "regressed" | "same"; + if (entry.toolCallingAccuracy > baselineEntry.toolCallingAccuracy) { + comparisonResult = "improved"; + } else if (entry.toolCallingAccuracy < baselineEntry.toolCallingAccuracy) { + comparisonResult = "regressed"; + } else { + comparisonResult = "same"; + } + + return { + ...entry, + baseline: { + baselineAccuracy: baselineEntry.toolCallingAccuracy, + comparisonResult, + }, + }; + }); +} + +async function generateHtmlReport( + snapshotEntries: SnapshotEntryWithBaseline[], + accuracyRunId: string, + baselineInfo?: { + commitSHA: string; + accuracyRunId: string; + createdOn: string; + } +): Promise { + const totalPrompts = snapshotEntries.length; + const modelsCount = new Set(snapshotEntries.map((s) => `${s.provider} ${s.requestedModel}`)).size; + const testsWithZeroAccuracy = snapshotEntries.filter((snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0); + + const totalAccuracy = snapshotEntries.reduce((sum, entry) => sum + entry.toolCallingAccuracy, 0); + const averageAccuracy = totalPrompts > 0 ? 
totalAccuracy / totalPrompts : 0; + + const evalsImproved = snapshotEntries.filter((s) => s.baseline?.comparisonResult === "improved").length; + const evalsRegressed = snapshotEntries.filter((s) => s.baseline?.comparisonResult === "regressed").length; + + const firstSnapshotEntry = snapshotEntries[0]; + const runStatus = firstSnapshotEntry?.accuracyRunStatus || "unknown"; + const commitSHA = firstSnapshotEntry?.commitSHA || "unknown"; + const createdOn = firstSnapshotEntry?.createdOn + ? new Date(firstSnapshotEntry.createdOn).toLocaleString() + : "unknown"; + const reportGeneratedOn = new Date().toLocaleString(); + + const tableRows = snapshotEntries + .map( + (snapshotEntry, index) => ` + + + ā–¶ + ${snapshotEntry.prompt} + + ${snapshotEntry.provider} - ${snapshotEntry.requestedModel} + ${formatToolCallsWithTooltip(snapshotEntry.expectedToolCalls)} + ${formatToolCallsWithTooltip(snapshotEntry.actualToolCalls)} + + + ${formatAccuracy(snapshotEntry.toolCallingAccuracy)} + + + ${formatBaselineAccuracy(snapshotEntry)} + ${snapshotEntry.llmResponseTime.toFixed(2)} + ${formatTokenUsage(snapshotEntry.tokensUsage || {})} + + + +
+
+

🤖 LLM Response

+
${snapshotEntry.text}
+
+
+

💬 Conversation Messages

+
${formatMessages(snapshotEntry.messages)}
+
+
+ + + ` + ) + .join(""); + + const template = await readFile(HTML_TESTS_SUMMARY_TEMPLATE, "utf8"); + return populateTemplate(template, { + accuracyRunId, + runStatus, + runStatusUpper: runStatus.toUpperCase(), + commitSHA, + reportGeneratedOn, + createdOn, + totalTests: String(totalPrompts), + modelsCount: String(modelsCount), + testsWithZeroAccuracy: String(testsWithZeroAccuracy.length), + averageAccuracy: formatAccuracy(averageAccuracy), + baselineCommitSHA: baselineInfo?.commitSHA || "N/A", + baselineAccuracyRunId: baselineInfo?.accuracyRunId || "N/A", + baselineCreatedOn: baselineInfo?.createdOn || "N/A", + evalsImproved: String(evalsImproved), + evalsRegressed: String(evalsRegressed), + tableRows, + }); +} + +async function generateTestSummary(): Promise { + try { + const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID; + const baselineCommitSHA = process.env.MDB_ACCURACY_BASELINE_COMMIT; + + if (!accuracyRunId) { + throw new Error("Cannot generate test summary, accuracy run id is unknown"); + } + console.log(`\nšŸ“Š Generating test summary for accuracy run: ${accuracyRunId}\n`); + + const storage = await getAccuracySnapshotStorage(); + const currentSnapshot = await storage.getSnapshotForAccuracyRun(accuracyRunId); + + if (currentSnapshot.length === 0) { + console.log("No snapshot entries found for the current run."); + await storage.close(); + return; + } + + let snapshotWithBaseline: SnapshotEntryWithBaseline[] = currentSnapshot; + let baselineInfo: { commitSHA: string; accuracyRunId: string; createdOn: string } | undefined; + + if (baselineCommitSHA) { + console.log(`šŸ” Fetching baseline snapshot entries for commit: ${baselineCommitSHA}`); + const baselineSnapshot = await storage.getLatestSnapshotForCommit(baselineCommitSHA); + + if (baselineSnapshot.length > 0) { + console.log(`āœ… Found ${baselineSnapshot.length} baseline snapshot entries.`); + snapshotWithBaseline = compareSnapshotEntries(currentSnapshot, baselineSnapshot); + + const 
firstBaselineSnapshot = baselineSnapshot[0]; + if (firstBaselineSnapshot) { + baselineInfo = { + commitSHA: firstBaselineSnapshot.commitSHA, + accuracyRunId: firstBaselineSnapshot.accuracyRunId, + createdOn: firstBaselineSnapshot.createdOn + ? new Date(firstBaselineSnapshot.createdOn).toLocaleString() + : "unknown", + }; + } + } else { + console.log(`āš ļø No baseline snapshots found for commit: ${baselineCommitSHA}`); + } + } + + const htmlReport = await generateHtmlReport(snapshotWithBaseline, accuracyRunId, baselineInfo); + await storage.close(); + + const reportPath = HTML_TESTS_SUMMARY_FILE; + await writeFile(reportPath, htmlReport, "utf8"); + + console.log(`āœ… HTML report generated: ${reportPath}`); + + const totalPrompts = snapshotWithBaseline.length; + const modelsCount = new Set(snapshotWithBaseline.map((s) => `${s.provider} ${s.requestedModel}`)).size; + const testsWithZeroAccuracy = snapshotWithBaseline.filter( + (snapshotEntry) => snapshotEntry.toolCallingAccuracy === 0 + ); + const evalsImproved = snapshotWithBaseline.filter((s) => s.baseline?.comparisonResult === "improved").length; + const evalsRegressed = snapshotWithBaseline.filter((s) => s.baseline?.comparisonResult === "regressed").length; + + console.log(`\nšŸ“ˆ Summary:`); + console.log(` Total prompts evaluated: ${totalPrompts}`); + console.log(` Models tested: ${modelsCount}`); + console.log(` Evals with 0% accuracy: ${testsWithZeroAccuracy.length}`); + + if (baselineCommitSHA) { + console.log(` Baseline commit: ${baselineCommitSHA}`); + console.log(` Evals improved vs baseline: ${evalsImproved}`); + console.log(` Evals regressed vs baseline: ${evalsRegressed}`); + } + } catch (error) { + console.error("Error generating test summary:", error); + process.exit(1); + } +} + +void generateTestSummary(); diff --git a/scripts/run-accuracy-tests.sh b/scripts/run-accuracy-tests.sh new file mode 100644 index 00000000..ae02dd06 --- /dev/null +++ b/scripts/run-accuracy-tests.sh @@ -0,0 +1,48 @@ 
+#!/bin/sh +# Variables necessary for the accuracy test runs +export MDB_ACCURACY_RUN_ID=$(npx uuid v4) + +# For providing access tokens for different LLM providers +# export MDB_OPEN_AI_API_KEY="" +# export MDB_GEMINI_API_KEY="" +# export MDB_AZURE_OPEN_AI_API_KEY="" +# export MDB_AZURE_OPEN_AI_API_URL="" + +# For providing a mongodb based storage to store accuracy snapshots +# export MDB_ACCURACY_MDB_URL="" +# export MDB_ACCURACY_MDB_DB="" +# export MDB_ACCURACY_MDB_COLLECTION="" + +# By default we run all the tests under tests/accuracy folder unless a path is +# specified in the command line. Such as: +# npm run test:accuracy -- tests/accuracy/some-test.test.ts +TEST_PATH_PATTERN="${1:-tests/accuracy}" +shift || true +node --experimental-vm-modules node_modules/jest/bin/jest.js --bail --testPathPatterns "$TEST_PATH_PATTERN" "$@" + +# Preserving the exit code from test run to correctly notify in the CI +# environments when the tests fail. +JEST_EXIT_CODE=$? + +# Each test run submits an accuracy snapshot entry with the accuracyRunStatus: +# "in-progress". When all the tests are done and jest exits with an exit code of +# 0, we can safely mark accuracy run as finished otherwise failed. + +# This "outside-the-tests-status-update" is arising out of the fact that each +# test suite stores their own accuracy run data in the storage and this setup +# might lead to data inconsistency when the tests fail. To overcome that each +# accuracy snapshot entry has a status which by default is "in-progress" and is +# updated when the tests either pass (all our accuracy tests are supposed to +# pass unless some errors occurs during the test runs), or fail. + +# This is necessary when comparing one accuracy run with another as we wouldn't +# want to compare against an incomplete run. 
+if [ $JEST_EXIT_CODE -eq 0 ]; then
+    MDB_ACCURACY_RUN_STATUS="done" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'done'"
+    npx tsx scripts/generate-test-summary.ts || echo "Warning: Failed to generate test summary HTML report"
+else
+    MDB_ACCURACY_RUN_STATUS="failed" npx tsx scripts/update-accuracy-run-status.ts || echo "Warning: Failed to update accuracy run status to 'failed'"
+fi
+
+
+exit $JEST_EXIT_CODE
\ No newline at end of file
diff --git a/scripts/update-accuracy-run-status.ts b/scripts/update-accuracy-run-status.ts
new file mode 100644
index 00000000..6d8e3895
--- /dev/null
+++ b/scripts/update-accuracy-run-status.ts
@@ -0,0 +1,28 @@
+import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js";
+import { AccuracyRunStatus } from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+// Marks every snapshot entry of one accuracy run with a final status.
+// The run id and the status are both read from the environment.
+const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID;
+const envAccuracyRunStatus = process.env.MDB_ACCURACY_RUN_STATUS;
+
+if (
+    !envAccuracyRunId ||
+    (envAccuracyRunStatus !== AccuracyRunStatus.Done && envAccuracyRunStatus !== AccuracyRunStatus.Failed)
+) {
+    // Explain the failure; a silent non-zero exit is hard to diagnose from CI logs.
+    console.error(
+        `MDB_ACCURACY_RUN_ID must be set and MDB_ACCURACY_RUN_STATUS must be "${AccuracyRunStatus.Done}" or "${AccuracyRunStatus.Failed}".`
+    );
+    process.exit(1);
+}
+
+console.time(`Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`);
+const storage = await getAccuracySnapshotStorage();
+try {
+    await storage.updateAccuracyRunStatus(envAccuracyRunId, envAccuracyRunStatus);
+} finally {
+    // Release the storage even when the update itself fails.
+    await storage.close();
+}
+console.timeEnd(`Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`);
diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts
new file mode 100644
index 00000000..30a5a0e3
--- /dev/null
+++ b/tests/accuracy/aggregate.test.ts
@@ -0,0 +1,18 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+describeAccuracyTests(getAvailableModels(), [
+    {
+        prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them",
+        expectedToolCalls: [
+            {
+                toolName: "aggregate",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    pipeline: [{ $group: { _id: "$release_year", count: { $sum: 1 } } }],
+                },
+            },
+        ],
+    },
+]);
diff --git a/tests/accuracy/collection-indexes.test.ts b/tests/accuracy/collection-indexes.test.ts
new file mode 100644
index 00000000..dab7d317
--- /dev/null
+++ b/tests/accuracy/collection-indexes.test.ts
@@ -0,0 +1,27 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "collection-indexes" call on mflix.movies for the given prompt. */
+function callsCollectionIndexes(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "collection-indexes",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                },
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"),
+    callsCollectionIndexes("List all the indexes in movies collection in mflix database"),
+    callsCollectionIndexes(
+        `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?`
+    ),
+]);
diff --git a/tests/accuracy/collection-schema.test.ts b/tests/accuracy/collection-schema.test.ts
new file mode 100644
index 00000000..f2f22a88
--- /dev/null
+++ b/tests/accuracy/collection-schema.test.ts
@@ -0,0 +1,24 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "collection-schema" call on db1.coll1 for the given prompt. */
+function callsCollectionSchema(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "collection-schema",
+                parameters: {
+                    database: "db1",
+                    collection: "coll1",
+                },
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"),
+    callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"),
+]);
diff --git a/tests/accuracy/collection-storage-size.test.ts b/tests/accuracy/collection-storage-size.test.ts
new file mode 100644
index 00000000..2bd2f021
--- /dev/null
+++ b/tests/accuracy/collection-storage-size.test.ts
@@ -0,0 +1,42 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+
+describeAccuracyTests(getAvailableModels(), [
+    {
+        prompt: "What is the size of 'mflix.movies' namespace",
+        expectedToolCalls: [
+            {
+                toolName: "collection-storage-size",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                },
+            },
+        ],
+    },
+    {
+        prompt: "How much size is each collection in comics database",
+        expectedToolCalls: [
+            {
+                toolName: "list-collections",
+                parameters: {
+                    database: "comics",
+                },
+            },
+            {
+                toolName: "collection-storage-size",
+                parameters: {
+                    database: "comics",
+                    collection: "books",
+                },
+            },
+            {
+                toolName: "collection-storage-size",
+                parameters: {
+                    database: "comics",
+                    collection: "characters",
+                },
+            },
+        ],
+    },
+]);
diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts
new file mode 100644
index 00000000..09db4678
--- /dev/null
+++ b/tests/accuracy/count.test.ts
@@ -0,0 +1,56 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "count" call carrying only the database and collection. */
+function callsCountToolWithEmptyQuery(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "count",
+                parameters: {
+                    database,
+                    collection,
+                },
+            },
+        ],
+    };
+}
+
+/** Expects a single "count" call carrying the database, the collection and the given query. */
+function callsCountToolWithQuery(
+    prompt: string,
+    database = "mflix",
+    collection = "movies",
+    query: Record<string, unknown> = {}
+): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "count",
+                parameters: {
+                    database,
+                    collection,
+                    query,
+                },
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."),
+    callsCountToolWithEmptyQuery(
+        "How many documents are there in 'characters' collection in 'comics' database?",
+        "comics",
+        "characters"
+    ),
+    callsCountToolWithQuery(
+        "Count all the documents in 'mflix.movies' namespace with runtime less than 100?",
+        "mflix",
+        "movies",
+        { runtime: { $lt: 100 } }
+    ),
+]);
diff --git a/tests/accuracy/create-collection.test.ts b/tests/accuracy/create-collection.test.ts
new file mode 100644
index 00000000..db7f888c
--- /dev/null
+++ b/tests/accuracy/create-collection.test.ts
@@ -0,0 +1,56 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+/** Expects a single "create-collection" call for the given database and collection. */
+function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "create-collection",
+                parameters: {
+                    database,
+                    collection,
+                },
+            },
+        ],
+    };
+}
+
+/** Expects the given tool calls, with injectConnectedAssumption enabled and no mocked tools. */
+function callsCreateCollectionWithListCollections(
+    prompt: string,
+    expectedToolCalls: ExpectedToolCall[]
+): AccuracyTestConfig {
+    return {
+        injectConnectedAssumption: true,
+        prompt: prompt,
+        mockedTools: {},
+        expectedToolCalls,
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"),
+    callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"),
+    callsCreateCollectionWithListCollections(
+        "If and only if, the namespace 'mflix.documentaries' does not exist, then create it",
+        [
+            {
+                toolName: "list-collections",
+                parameters: {
+                    database: "mflix",
+                },
+            },
+            {
+                toolName: "create-collection",
+                parameters: {
+                    database: "mflix",
+                    collection: "documentaries",
+                },
+            },
+        ]
+    ),
+]);
diff --git a/tests/accuracy/create-index.test.ts b/tests/accuracy/create-index.test.ts
new file mode 100644
index 00000000..6dae12e5
--- /dev/null
+++ b/tests/accuracy/create-index.test.ts
@@ -0,0 +1,32 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "create-index" call on mflix.movies with the given index keys. */
+function callsCreateIndex(prompt: string, indexKeys: Record<string, unknown>): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "create-index",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    keys: indexKeys,
+                },
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsCreateIndex(
+        "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }",
+        {
+            release_year: 1,
+        }
+    ),
+    callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", {
+        title: "text",
+    }),
+]);
diff --git a/tests/accuracy/db-stats.test.ts b/tests/accuracy/db-stats.test.ts
new file mode 100644
index 00000000..656eccc2
--- /dev/null
+++ b/tests/accuracy/db-stats.test.ts
@@ -0,0 +1,20 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "db-stats" call for the given database. */
+function callsDbStats(prompt: string, database = "mflix"): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "db-stats",
+                parameters: {
+                    database,
+                },
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [callsDbStats("What is the size occupied by database mflix?")]);
diff --git a/tests/accuracy/delete-many.test.ts b/tests/accuracy/delete-many.test.ts
new file mode 100644
index 00000000..c0dd4d51
--- /dev/null
+++ b/tests/accuracy/delete-many.test.ts
@@ -0,0 +1,42 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "delete-many" call on mflix.movies without any filter. */
+function callsDeleteManyWithEmptyFilters(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "delete-many",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                },
+            },
+        ],
+    };
+}
+
+/** Expects a single "delete-many" call on mflix.movies filtering on runtime below 100. */
+function callsDeleteManyWithFilters(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "delete-many",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    filter: { runtime: { $lt: 100 } },
+                },
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsDeleteManyWithEmptyFilters("Delete all the documents from 'mflix.movies' namespace"),
+    callsDeleteManyWithEmptyFilters("Purge the collection 'movies' in database 'mflix'"),
+    callsDeleteManyWithFilters("Remove all the documents from namespace 'mflix.movies' where runtime is less than 100"),
+]);
diff --git a/tests/accuracy/drop-collection.test.ts b/tests/accuracy/drop-collection.test.ts
new file mode 100644
index 00000000..98ba3348
--- /dev/null
+++ b/tests/accuracy/drop-collection.test.ts
@@ -0,0 +1,76 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+/** Expects a single "drop-collection" call on mflix.movies. */
+function onlyCallsDropCollection(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "drop-collection",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                },
+            },
+        ],
+    };
+}
+
+/** Pairs the prompt with an explicit list of expected tool calls. */
+function callsDropCollection(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls,
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    onlyCallsDropCollection("Remove mflix.movies namespace from my cluster."),
+    onlyCallsDropCollection("Drop movies collection from mflix database."),
+    callsDropCollection("Remove books collection from which ever database contains it.", [
+        {
+            toolName: "list-databases",
+            parameters: {},
+        },
+        {
+            toolName: "list-collections",
+            parameters: {
+                database: "admin",
+            },
+        },
+        {
+            toolName: "list-collections",
+            parameters: {
+                database: "comics",
+            },
+        },
+        {
+            toolName: "list-collections",
+            parameters: {
+                database: "config",
+            },
+        },
+        {
+            toolName: "list-collections",
+            parameters: {
+                database: "local",
+            },
+        },
+        {
+            toolName: "list-collections",
+            parameters: {
+                database: "mflix",
+            },
+        },
+        {
+            toolName: "drop-collection",
+            parameters: {
+                database: "comics",
+                collection: "books",
+            },
+        },
+    ]),
+]);
diff --git a/tests/accuracy/drop-database.test.ts b/tests/accuracy/drop-database.test.ts
new file mode 100644
index 00000000..53fc7fd5
--- /dev/null
+++ b/tests/accuracy/drop-database.test.ts
@@ -0,0 +1,44 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+/** Expects a single "drop-database" call on the mflix database. */
+function onlyCallsDropDatabase(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "drop-database",
+                parameters: {
+                    database: "mflix",
+                },
+            },
+        ],
+    };
+}
+
+/** Pairs the prompt with an explicit list of expected tool calls. */
+function callsDropDatabase(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls,
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    onlyCallsDropDatabase("Remove mflix database from my cluster."),
+    onlyCallsDropDatabase("Drop database named mflix."),
+    callsDropDatabase("If there is a mflix database in my cluster then drop it.", [
+        {
+            toolName: "list-databases",
+            parameters: {},
+        },
+        {
+            toolName: "drop-database",
+            parameters: {
+                database: "mflix",
+            },
+        },
+    ]),
+]);
diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts
new file mode 100644
index 00000000..4a539c48
--- /dev/null
+++ b/tests/accuracy/explain.test.ts
@@ -0,0 +1,66 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "explain" call on mflix.movies wrapping the given method description. */
+function callsExplain(prompt: string, method: Record<string, unknown>): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "explain",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    method: [method],
+                },
+            },
+        ],
+    };
+}
+
+// Shorthands for explaining a find, an aggregate and a count, all selecting release_year 2020.
+const callsExplainWithFind = (prompt: string) =>
+    callsExplain(prompt, {
+        name: "find",
+        arguments: {
+            filter: { release_year: 2020 },
+        },
+    });
+
+const callsExplainWithAggregate = (prompt: string) =>
+    callsExplain(prompt, {
+        name: "aggregate",
+        arguments: {
+            pipeline: [
+                {
+                    $match: { release_year: 2020 },
+                },
+            ],
+        },
+    });
+
+const callsExplainWithCount = (prompt: string) =>
+    callsExplain(prompt, {
+        name: "count",
+        arguments: {
+            query: { release_year: 2020 },
+        },
+    });
+
+/**
+ * None of these tests score a parameter match on any of the models, likely
+ * because we are using Zod.union, when we probably should've used
+ * Zod.discriminatedUnion
+ */
+describeAccuracyTests(getAvailableModels(), [
+    callsExplainWithFind(
+        `Will fetching documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`
+    ),
+    callsExplainWithAggregate(
+        `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`
+    ),
+    callsExplainWithCount(
+        `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`
+    ),
+]);
diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts
new file mode 100644
index 00000000..02c02cd1
--- /dev/null
+++ b/tests/accuracy/find.test.ts
@@ -0,0 +1,122 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "find" call carrying only the database and collection. */
+function callsFindNoFilter(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "find",
+                parameters: {
+                    database,
+                    collection,
+                },
+            },
+        ],
+    };
+}
+
+/** Expects a single "find" call on mflix.movies with the given filter. */
+function callsFindWithFilter(prompt: string, filter: Record<string, unknown>): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "find",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    filter: filter,
+                },
+            },
+        ],
+    };
+}
+
+/** Expects a single "find" call on mflix.movies with the given projection. */
+function callsFindWithProjection(prompt: string, projection: Record<string, unknown>): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "find",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    projection,
+                },
+            },
+        ],
+    };
+}
+
+/** Expects a single "find" call on mflix.movies with both the given filter and projection. */
+function callsFindWithProjectionAndFilters(
+    prompt: string,
+    filter: Record<string, unknown>,
+    projection: Record<string, unknown>
+): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "find",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    filter,
+                    projection,
+                },
+            },
+        ],
+    };
+}
+
+/** Expects a single "find" call on mflix.movies with the given filter, sort and limit. */
+function callsFindWithFilterSortAndLimit(
+    prompt: string,
+    filter: Record<string, unknown>,
+    sort: Record<string, unknown>,
+    limit: number
+): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "find",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    filter,
+                    sort,
+                    limit,
+                },
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsFindNoFilter("List all the movies in 'mflix.movies' namespace."),
+    callsFindNoFilter("List all the documents in 'comics.books' namespace.", "comics", "books"),
+    callsFindWithFilter("Find all the movies in 'mflix.movies' namespace with runtime less than 100.", {
+        runtime: { $lt: 100 },
+    }),
+    callsFindWithFilter("Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", {
+        director: "Christina Collins",
+    }),
+    callsFindWithProjection("Give me all the movie titles available in 'mflix.movies' namespace", { title: 1 }),
+    callsFindWithProjectionAndFilters(
+        "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'",
+        { title: "Certain Fish" },
+        { cast: 1 }
+    ),
+    callsFindWithFilterSortAndLimit(
+        "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime",
+        { genres: "Horror" },
+        { runtime: 1 },
+        2
+    ),
+]);
diff --git a/tests/accuracy/insert-many.test.ts b/tests/accuracy/insert-many.test.ts
new file mode 100644
index 00000000..4ce15bb8
--- /dev/null
+++ b/tests/accuracy/insert-many.test.ts
@@ -0,0 +1,61 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "insert-many" call on mflix.movies with three { id, name } documents. */
+function callsInsertMany(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "insert-many",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    documents: [
+                        {
+                            id: 1,
+                            name: "name1",
+                        },
+                        {
+                            id: 2,
+                            name: "name2",
+                        },
+                        {
+                            id: 3,
+                            name: "name3",
+                        },
+                    ],
+                },
+            },
+        ],
+    };
+}
+
+/** Expects a single "insert-many" call on mflix.movies with three empty documents. */
+function callsEmptyInsertMany(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "insert-many",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    documents: [{}, {}, {}],
+                },
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsInsertMany(
+        [
+            "In my namespace 'mflix.movies', insert 3 documents each with the following fields:",
+            "- id: an incremental number starting from 1",
+            "- name: a string of format 'name<id>'",
+        ].join("\n")
+    ),
+    callsEmptyInsertMany("Add three empty documents in collection 'movies' inside database 'mflix'"),
+]);
diff --git a/tests/accuracy/list-collections.test.ts b/tests/accuracy/list-collections.test.ts
new file mode 100644
index 00000000..78a14f34
--- /dev/null
+++ b/tests/accuracy/list-collections.test.ts
@@ -0,0 +1,58 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "list-collections" call on the mflix database. */
+function callsListCollections(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "list-collections",
+                parameters: { database: "mflix" },
+            },
+        ],
+    };
+}
+
+/** Expects "list-databases" followed by one "list-collections" call per listed database. */
+function callsListDatabasesAndListCollections(prompt: string): AccuracyTestConfig {
+    return {
+        injectConnectedAssumption: true,
+        prompt: prompt,
+        mockedTools: {},
+        expectedToolCalls: [
+            {
+                toolName: "list-databases",
+                parameters: {},
+            },
+            {
+                toolName: "list-collections",
+                parameters: { database: "admin" },
+            },
+            {
+                toolName: "list-collections",
+                parameters: { database: "comics" },
+            },
+            {
+                toolName: "list-collections",
+                parameters: { database: "config" },
+            },
+            {
+                toolName: "list-collections",
+                parameters: { database: "local" },
+            },
+            {
+                toolName: "list-collections",
+                parameters: { database: "mflix" },
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsListCollections("How many collections do I have in database mflix?"),
+    callsListCollections("List all the collections in my MongoDB database mflix."),
+    callsListCollections("Is there a shows collection in my MongoDB database mflix?"),
+    callsListDatabasesAndListCollections("List all the collections that I have in total on my cluster?"),
+]);
diff --git a/tests/accuracy/list-databases.test.ts b/tests/accuracy/list-databases.test.ts
new file mode 100644
index 00000000..97a8ce27
--- /dev/null
+++ b/tests/accuracy/list-databases.test.ts
@@ -0,0 +1,22 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "list-databases" call without parameters. */
+function callsListDatabases(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "list-databases",
+                parameters: {},
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsListDatabases("How many databases do I have?"),
+    callsListDatabases("List all the databases that I have in my clusters"),
+    callsListDatabases("Is there a mflix database in my cluster?"),
+]);
diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts
new file mode 100644
index 00000000..8b9d2193
--- /dev/null
+++ b/tests/accuracy/logs.test.ts
@@ -0,0 +1,28 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js";
+
+/** Pairs the prompt with exactly one expected tool call. */
+function callsLogsTool(prompt: string, toolCall: ExpectedToolCall): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [toolCall],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsLogsTool("Were there any startup warnings for my MongoDB server?", {
+        toolName: "mongodb-logs",
+        parameters: {
+            type: "startupWarnings",
+        },
+    }),
+    callsLogsTool("Retrieve first 10 logs for my MongoDB server?", {
+        toolName: "mongodb-logs",
+        parameters: {
+            type: "global",
+            limit: 10,
+        },
+    }),
+]);
diff --git a/tests/accuracy/rename-collection.test.ts b/tests/accuracy/rename-collection.test.ts
new file mode 100644
index 00000000..549a02b9
--- /dev/null
+++ b/tests/accuracy/rename-collection.test.ts
@@ -0,0 +1,45 @@
+import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
+import { getAvailableModels } from "./sdk/models.js";
+import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
+
+/** Expects a single "rename-collection" call renaming mflix.movies to new_movies. */
+function callsRenameCollection(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "rename-collection",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    newName: "new_movies",
+                },
+            },
+        ],
+    };
+}
+
+/** Same expectation as callsRenameCollection, with dropTarget set to true. */
+function callsRenameCollectionWithDropTarget(prompt: string): AccuracyTestConfig {
+    return {
+        prompt: prompt,
+        expectedToolCalls: [
+            {
+                toolName: "rename-collection",
+                parameters: {
+                    database: "mflix",
+                    collection: "movies",
+                    newName: "new_movies",
+                    dropTarget: true,
+                },
+            },
+        ],
+    };
+}
+
+describeAccuracyTests(getAvailableModels(), [
+    callsRenameCollection("Rename my 'mflix.movies' namespace to 'mflix.new_movies'"),
+    callsRenameCollectionWithDropTarget(
+        "Rename my 'mflix.movies' namespace to 'mflix.new_movies' while removing the old namespace."
+    ),
+]);
diff --git a/tests/accuracy/sdk/accuracy-scorer.ts b/tests/accuracy/sdk/accuracy-scorer.ts
new file mode 100644
index 00000000..2ae13e6c
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-scorer.ts
@@ -0,0 +1,119 @@
+import diff from "microdiff";
+import { ExpectedToolCall, LLMToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js";
+
+/**
+ * Tool calling accuracy is a single number calculated based on two dimensions.
+ * 1. Did LLM call the right tool?
+ * 2. Did LLM call the tool with correct and required parameters?
+ *
+ * The number can be one of:
+ * - 0: When LLM:
+ *    - did not call the right tool
+ *    - did not call the tool with correct parameters
+ * - 0.75: When LLM:
+ *    - called the right tool but hallucinated and called some extra tools as
+ *      well or called the same tool but with different parameters
+ *    - called the right tool but hallucinated and called it with some
+ *      non-required parameters
+ * - 1: When LLM:
+ *    - called exactly the tools that were expected
+ *    - called the expected tools exactly with the expected parameters
+ *
+ * To calculate this number we must have:
+ * 1. a list of expected tool calls with their expected parameters
+ * 2. a list of LLM tool calls with their parameters
+ *
+ * For each expected tool call we find the best matching LLM tool call. Best
+ * matching LLM tool call will have:
+ * 1. the same name as that of the expected tool call
+ * 2. highest parameter similarity score, with at-least 0.75 to ensure an actual
+ *    match. And in case of competing scores, we take the first one that appears
+ *    in the LLM tool calls.
+ *
+ * Using the above logic we establish pairs between expected and actual tool
+ * calls.
+ *
+ * 1. If we could not pair some LLM tool calls with expected tool calls that
+ *    means the LLM hallucinated over the extra tool calls. For that reason we
+ *    will cap the maximum achievable accuracy to 0.75.
+ *
+ * 2. If we could not pair some expected tool calls with LLM tool calls that
+ *    means the LLM did not call one of the expected tool required to solve the
+ *    problem. For that reason we will mark the accuracy as 0 and exit early.
+ *
+ * 3. Now for each of the established tool call pairs, we will determine how
+ *    correctly the parameters were called using the parameter similarity score.
+ *    The parameter similarity score follow the same accuracy number pattern
+ *    described above:
+ *    - 0 : for missing parameters, incorrect parameter values
+ *    - 0.75 : for additional parameters
+ *    - 1 : for a perfect match
+ *
+ * The final accuracy score is then calculated as the least of:
+ * - Maximum achievable accuracy from #1
+ * - The least of parameter similarity score from the established pairs in #3
+ *
+ * For examples: see the test cases in - tests/unit/accuracy-scorer.test.ts
+ */
+export function calculateToolCallingAccuracy(
+    expectedToolCalls: ExpectedToolCall[],
+    actualToolCalls: LLMToolCall[]
+): number {
+    if (expectedToolCalls.length === 0) {
+        return actualToolCalls.length === 0 ? 1 : 0.75;
+    }
+
+    // More LLM tool calls than expected ones cap the best achievable score.
+    const maxAccuracy = actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1;
+
+    const individualAccuracies: number[] = [];
+    // Indexes of LLM tool calls already paired, so one call cannot satisfy two expectations.
+    const checkedActualToolCallIndexes = new Set<number>();
+
+    for (const expectedCall of expectedToolCalls) {
+        // Unpaired calls to the same tool scoring at least 0.75, best score first, ties by order of appearance.
+        const candidates = actualToolCalls
+            .map((call, index) => ({ call, index }))
+            .filter(
+                ({ call, index }) => !checkedActualToolCallIndexes.has(index) && call.toolName === expectedCall.toolName
+            )
+            .map(({ call, index }) => ({
+                call,
+                index,
+                score: compareParams(expectedCall.parameters, call.parameters),
+            }))
+            .filter(({ score }) => score >= 0.75)
+            .sort((a, b) => b.score - a.score || a.index - b.index);
+
+        const bestMatch = candidates[0];
+        if (!bestMatch) {
+            // An expected tool call without a match forces the overall minimum to 0.
+            return 0;
+        }
+
+        checkedActualToolCallIndexes.add(bestMatch.index);
+        individualAccuracies.push(Math.min(bestMatch.score, maxAccuracy));
+    }
+
+    return Math.min(...individualAccuracies);
+}
+
+/**
+ * Scores how closely the parameters of an LLM tool call match the expected ones.
+ *
+ * @returns 1 when both objects are identical, 0.75 when the LLM only added
+ * parameters on top of the expected ones and 0 when an expected parameter is
+ * missing or carries a different value.
+ */
+function compareParams(expected: Record<string, unknown>, actual: Record<string, unknown>): number {
+    const differences = diff(expected, actual);
+
+    if (differences.length === 0) {
+        return 1;
+    }
+
+    // A non-empty list made only of "CREATE" entries cannot contain removals or changes.
+    const hasOnlyAdditions = differences.every((d) => d.type === "CREATE");
+
+    return hasOnlyAdditions ? 0.75 : 0;
+}
diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts
new file mode 100644
index 00000000..a919e8f0
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-snapshot-storage/disk-snapshot-storage.ts
@@ -0,0 +1,140 @@
+import fs from "fs/promises";
+import {
+    AccuracyRunStatus,
+    AccuracyRunStatuses,
+    AccuracySnapshotEntry,
+    AccuracySnapshotEntrySchema,
+    AccuracySnapshotStorage,
+} from "./snapshot-storage.js";
+import { GENERATED_ASSETS_DIR, LOCAL_SNAPSHOTS_FILE } from "../constants.js";
+
+/**
+ * Accuracy snapshot storage backed by the single JSON file LOCAL_SNAPSHOTS_FILE.
+ * Every operation reads and/or rewrites the whole file.
+ */
+export class DiskSnapshotStorage implements AccuracySnapshotStorage {
+    /** Stores the entry with the InProgress run status and the current timestamp. */
+    async createSnapshotEntry(
+        snapshotEntry: Pick<
+            AccuracySnapshotEntry,
+            | "accuracyRunId"
+            | "commitSHA"
+            | "provider"
+            | "requestedModel"
+            | "prompt"
+            | "toolCallingAccuracy"
+            | "expectedToolCalls"
+            | "actualToolCalls"
+            | "llmResponseTime"
+            | "tokensUsage"
+            | "respondingModel"
+            | "text"
+            | "messages"
+        >
+    ): Promise<void> {
+        const snapshotWithMeta: AccuracySnapshotEntry = {
+            ...snapshotEntry,
+            accuracyRunStatus: AccuracyRunStatus.InProgress,
+            createdOn: Date.now(),
+        };
+
+        await this.appendAccuracySnapshot(snapshotWithMeta);
+    }
+
+    /** Returns all entries of the run owning the newest Done entry for the commit, or [] when there is none. */
+    async getLatestSnapshotForCommit(commit: string): Promise<AccuracySnapshotEntry[]> {
+        const snapshot = await this.readSnapshot();
+        const entries = snapshot
+            .filter((entry) => {
+                return entry.commitSHA === commit && entry.accuracyRunStatus === AccuracyRunStatus.Done;
+            })
+            .sort((a, b) => b.createdOn - a.createdOn);
+        const latestRunId = entries[0]?.accuracyRunId;
+        return latestRunId ? snapshot.filter((entry) => entry.accuracyRunId === latestRunId) : [];
+    }
+
+    /** Returns every stored entry that belongs to the given accuracy run. */
+    async getSnapshotForAccuracyRun(accuracyRunId: string): Promise<AccuracySnapshotEntry[]> {
+        const snapshot = await this.readSnapshot();
+        return snapshot.filter((entry) => entry.accuracyRunId === accuracyRunId);
+    }
+
+    /** Rewrites the snapshot file with the given status set on every entry of the run. */
+    async updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses) {
+        const snapshot = await this.readSnapshot();
+        const updatedSnapshot = snapshot.map((entry) => {
+            if (entry.accuracyRunId === accuracyRunId) {
+                return {
+                    ...entry,
+                    accuracyRunStatus: status,
+                };
+            }
+
+            return entry;
+        });
+        await this.writeSnapshot(updatedSnapshot);
+    }
+
+    /** Nothing to release for the file based storage. */
+    close(): Promise<void> {
+        return Promise.resolve();
+    }
+
+    /**
+     * Prepends the entry to the snapshot file, retrying up to 5 times with a
+     * random 100-300ms pause when reading or writing the file fails.
+     *
+     * NOTE(review): the read-modify-write is not locked, so two concurrent
+     * writers can still overwrite each other's entry - confirm this is acceptable.
+     */
+    private async appendAccuracySnapshot(entry: AccuracySnapshotEntry): Promise<void> {
+        for (let attempt = 0; attempt < 5; attempt++) {
+            try {
+                const snapshot = await this.readSnapshot();
+                snapshot.unshift(entry);
+                await this.writeSnapshot(snapshot);
+                return;
+            } catch (e) {
+                if (attempt < 4) {
+                    await this.waitFor(100 + Math.random() * 200);
+                } else {
+                    throw e;
+                }
+            }
+        }
+    }
+
+    /**
+     * Writes to a temporary file first and renames it over the snapshot file.
+     * The temporary name carries the pid and a random suffix so that writers
+     * started within the same millisecond do not clobber each other's temporary file.
+     */
+    private async writeSnapshot(snapshot: AccuracySnapshotEntry[]): Promise<void> {
+        const tmp = `${LOCAL_SNAPSHOTS_FILE}~${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`;
+        await fs.writeFile(tmp, JSON.stringify(snapshot, null, 2));
+        await fs.rename(tmp, LOCAL_SNAPSHOTS_FILE);
+    }
+
+    /** Reads and validates the snapshot file; a missing file is treated as an empty snapshot. */
+    private async readSnapshot(): Promise<AccuracySnapshotEntry[]> {
+        try {
+            const raw = await fs.readFile(LOCAL_SNAPSHOTS_FILE, "utf8");
+            return AccuracySnapshotEntrySchema.array().parse(JSON.parse(raw));
+        } catch (e: unknown) {
+            if ((e as { code: string }).code === "ENOENT") {
+                return [];
+            }
+            throw e;
+        }
+    }
+
+    private waitFor(ms: number) {
+        return new Promise((resolve) => setTimeout(resolve, ms));
+    }
+
+    /** Makes sure GENERATED_ASSETS_DIR exists before handing out a storage instance. */
+    static async getStorage() {
+        await fs.mkdir(GENERATED_ASSETS_DIR, { recursive: true });
+        return new DiskSnapshotStorage();
+    }
+}
diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts
new file mode 100644
index 00000000..da67aa60
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.ts
@@ -0,0 +1,8 @@
+import { DiskSnapshotStorage } from "./disk-snapshot-storage.js";
+import { MongoDBSnapshotStorage } from "./mdb-snapshot-storage.js";
+import { AccuracySnapshotStorage } from "./snapshot-storage.js";
+
+/** Prefers the MongoDB backed storage and falls back to the disk storage when it is not configured. */
+export async function getAccuracySnapshotStorage(): Promise<AccuracySnapshotStorage> {
+    return MongoDBSnapshotStorage.getStorage() ?? (await DiskSnapshotStorage.getStorage());
+}
diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts
new file mode 100644
index 00000000..960daffc
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-snapshot-storage/mdb-snapshot-storage.ts
@@ -0,0 +1,110 @@
+import { Collection, MongoClient } from "mongodb";
+import {
+    AccuracyRunStatus,
+    AccuracyRunStatuses,
+    AccuracySnapshotEntry,
+    AccuracySnapshotEntrySchema,
+    AccuracySnapshotStorage,
+} from "./snapshot-storage.js";
+
+/**
+ * Accuracy snapshot storage backed by a MongoDB collection. Instances are only
+ * created through getStorage().
+ */
+export class MongoDBSnapshotStorage implements AccuracySnapshotStorage {
+    private readonly client: MongoClient;
+    private readonly snapshotCollection: Collection;
+    private constructor({
+        mongodbUrl,
+        database,
+        collection,
+    }: {
+        mongodbUrl: string;
+        database: string;
+        collection: string;
+    }) {
+        this.client = new MongoClient(mongodbUrl);
+        this.snapshotCollection = this.client.db(database).collection(collection);
+    }
+
+    /** Inserts the entry with the InProgress run status and the current timestamp. */
+    async createSnapshotEntry(
+        snapshotEntry: Pick<
+            AccuracySnapshotEntry,
+            | "accuracyRunId"
+            | "commitSHA"
+            | "provider"
+            | "requestedModel"
+            | "prompt"
+            | "toolCallingAccuracy"
+            | "expectedToolCalls"
+            | "actualToolCalls"
+            | "llmResponseTime"
+            | "tokensUsage"
+            | "respondingModel"
+            | "text"
+            | "messages"
+        >
+    ): Promise<void> {
+        const snapshotWithMeta: AccuracySnapshotEntry = {
+            ...snapshotEntry,
+            accuracyRunStatus: AccuracyRunStatus.InProgress,
+            createdOn: Date.now(),
+        };
+        await this.snapshotCollection.insertOne(snapshotWithMeta);
+    }
+
+    /** Returns all entries of the run owning the newest Done entry for the commit, or [] when there is none. */
+    async getLatestSnapshotForCommit(commit: string): Promise<AccuracySnapshotEntry[]> {
+        const latestRunId = await this.getLatestAccuracyRunForCommit(commit);
+        return latestRunId ? this.getSnapshotForAccuracyRun(latestRunId) : [];
+    }
+
+    /** Returns every stored entry of the given accuracy run, validated against the entry schema. */
+    async getSnapshotForAccuracyRun(accuracyRunId: string): Promise<AccuracySnapshotEntry[]> {
+        const snapshotEntries = await this.snapshotCollection.find({ accuracyRunId }).toArray();
+        return AccuracySnapshotEntrySchema.array().parse(snapshotEntries);
+    }
+
+    /** Finds the run id of the newest Done entry recorded for the commit, if any. */
+    private async getLatestAccuracyRunForCommit(commit: string): Promise<string | undefined> {
+        const document = await this.snapshotCollection.findOne(
+            { commitSHA: commit, accuracyRunStatus: AccuracyRunStatus.Done },
+            { sort: { createdOn: -1 }, projection: { accuracyRunId: 1 } }
+        );
+
+        return document?.accuracyRunId ? `${document?.accuracyRunId}` : undefined;
+    }
+
+    /** Sets the given status on every entry of the run. */
+    async updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses) {
+        await this.snapshotCollection.updateMany(
+            { accuracyRunId: accuracyRunId },
+            { $set: { accuracyRunStatus: status } }
+        );
+    }
+
+    /** Closes the underlying MongoDB client. */
+    async close(): Promise<void> {
+        await this.client.close();
+    }
+
+    /**
+     * Builds a storage from MDB_ACCURACY_MDB_URL, MDB_ACCURACY_MDB_DB and
+     * MDB_ACCURACY_MDB_COLLECTION, or returns null when any of them is unset or empty.
+     */
+    static getStorage(): MongoDBSnapshotStorage | null {
+        const mongodbUrl = process.env.MDB_ACCURACY_MDB_URL;
+        const database = process.env.MDB_ACCURACY_MDB_DB;
+        const collection = process.env.MDB_ACCURACY_MDB_COLLECTION;
+        if (!mongodbUrl || !database || !collection) {
+            return null;
+        }
+
+        return new MongoDBSnapshotStorage({
+            mongodbUrl,
+            database,
+            collection,
+        });
+    }
+}
diff --git a/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts
new file mode 100644
index 00000000..e0a6966d
--- /dev/null
+++ b/tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.ts
@@ -0,0 +1,127 @@
+import z from "zod";
+
+const LLMToolCallSchema = z.object({
+    toolCallId: z.string(),
+    toolName: z.string(),
+ parameters: z.record(z.string(), z.unknown()), +}); +export type LLMToolCall = z.infer; + +const ExpectedToolCallSchema = LLMToolCallSchema.omit({ toolCallId: true }); +export type ExpectedToolCall = z.infer; + +export const AccuracyRunStatus = { + Done: "done", + Failed: "failed", + InProgress: "in-progress", +} as const; + +export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus]; + +export const AccuracySnapshotEntrySchema = z.object({ + /** + * A unique id for each accuracy run. Should either be generated by the + * script triggering the accuracy run or provided via environment variables. + * */ + accuracyRunId: z.string(), + + /** + * Represents the status of accuracy run. Each test completion, during an + * accuracy run, is supposed to submit an accuracy snapshot entry with + * InProgress status which then later, after completion of accuracy run, is + * updated to either Done or Failed, depending on whether there were errors + * during the run or not. */ + accuracyRunStatus: z + .enum([AccuracyRunStatus.Done, AccuracyRunStatus.Failed, AccuracyRunStatus.InProgress]) + .default(AccuracyRunStatus.InProgress), + + /** Timestamp of when this snapshot entry was generated. */ + createdOn: z.number(), + + /** The commit SHA for which the accuracy run was triggered. */ + commitSHA: z.string(), + + /** The LLM provider providing the LLM APIs */ + provider: z.string(), + + /** The LLM which was requested to respond to our test prompts */ + requestedModel: z.string(), + + /** The actual prompt that was provided to LLM as test */ + prompt: z.string(), + + /** A number between 0 and 1, representing how accurately the expected tools + * were called by LLM when responding to the provided prompts. To know more + * about how this number is generated, check - toolCallingAccuracy.ts */ + toolCallingAccuracy: z.number(), + + /** + * A list of tools, along with their parameters, that are expected to be + * called by the LLM in test. 
*/ + expectedToolCalls: ExpectedToolCallSchema.array(), + + /** + * A list of tools, along with their parameters, that were actually called + * by the LLM in test. */ + actualToolCalls: LLMToolCallSchema.array(), + + /** + * The total time taken by LLM to respond to our prompt. */ + llmResponseTime: z.number(), + + /** + * Token usage data, returned as part of LLM prompt response. */ + tokensUsage: z + .object({ + promptTokens: z.number().optional(), + completionTokens: z.number().optional(), + totalTokens: z.number().optional(), + }) + .optional(), + + /** + * The ID of the model that actually responded to our prompt request. */ + respondingModel: z.string(), + + /** + * The final response text generated by the LLM, in response to our prompt + * request. */ + text: z.string(), + + /** + * A list of messages, exchanged between LLM and our testing agent, in + * response to our prompt request. This is particularly helpful for + * debugging. */ + messages: z.array(z.record(z.string(), z.unknown())), +}); + +export type AccuracySnapshotEntry = z.infer; + +export interface AccuracySnapshotStorage { + createSnapshotEntry( + snapshotEntry: Pick< + AccuracySnapshotEntry, + | "accuracyRunId" + | "commitSHA" + | "provider" + | "requestedModel" + | "prompt" + | "toolCallingAccuracy" + | "expectedToolCalls" + | "actualToolCalls" + | "llmResponseTime" + | "tokensUsage" + | "respondingModel" + | "text" + | "messages" + > + ): Promise; + + getLatestSnapshotForCommit(commit: string): Promise; + + getSnapshotForAccuracyRun(accuracyRunId: string): Promise; + + updateAccuracyRunStatus(accuracyRunId: string, status: AccuracyRunStatuses): Promise; + + close(): Promise; +} diff --git a/tests/accuracy/sdk/accuracy-testing-client.ts b/tests/accuracy/sdk/accuracy-testing-client.ts new file mode 100644 index 00000000..d2486942 --- /dev/null +++ b/tests/accuracy/sdk/accuracy-testing-client.ts @@ -0,0 +1,93 @@ +import { v4 as uuid } from "uuid"; +import { experimental_createMCPClient as 
createMCPClient, tool as createVercelTool } from "ai"; +import { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; +import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; + +import { MCP_SERVER_CLI_SCRIPT } from "./constants.js"; +import { LLMToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; + +type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; +export type MockedTools = Record; + +/** + * AccuracyTestingClient is a bridge between actual MCP client connected to our + * MCP server and our Tool calling agent. Its serves the following purposes: + * 1. Captures actual tools provided by our MCP server + * 2. Translates captured MCP tools to tool definitions that can be consumed by + * Tool Calling agent (Ref: `vercelTools`) + * 3. Allow dynamic mocking and resetting of mocks of individual tool calls. + * 4. Records and provides tool calls made by LLMs with their parameters. + */ +export class AccuracyTestingClient { + private mockedTools: MockedTools = {}; + private llmToolCalls: LLMToolCall[] = []; + + private constructor(private readonly vercelMCPClient: Awaited>) {} + + async close() { + await this.vercelMCPClient?.close(); + } + + async vercelTools() { + const vercelTools = (await this.vercelMCPClient?.tools()) ?? {}; + const rewrappedVercelTools: typeof vercelTools = {}; + for (const [toolName, tool] of Object.entries(vercelTools)) { + rewrappedVercelTools[toolName] = createVercelTool({ + ...tool, + execute: async (args, options) => { + this.llmToolCalls.push({ + toolCallId: uuid(), + toolName: toolName, + parameters: args as Record, + }); + try { + const toolResultGeneratorFn = this.mockedTools[toolName]; + if (toolResultGeneratorFn) { + return await toolResultGeneratorFn(args); + } + + return await tool.execute(args, options); + } catch (error) { + // There are cases when LLM calls the tools incorrectly + // and the schema definition check fails. 
In production, + // the tool calling agents are deployed with this fail + // safe to allow LLM to course correct themselves. That + // is exactly what we do here as well. + return { + isError: true, + content: error instanceof Error ? error.message : JSON.stringify(error), // a plain Error stringifies to "{}" + }; + } + }, + }); + } + + return rewrappedVercelTools; + } + + getLLMToolCalls() { + return this.llmToolCalls; + } + + mockTools(mockedTools: MockedTools) { + this.mockedTools = mockedTools; + } + + resetForTests() { + this.mockTools({}); + this.llmToolCalls = []; + } + + static async initializeClient(mdbConnectionString: string) { + const clientTransport = new StdioClientTransport({ + command: process.execPath, + args: [MCP_SERVER_CLI_SCRIPT, "--connectionString", mdbConnectionString], + }); + + const client = await createMCPClient({ + transport: clientTransport, + }); + + return new AccuracyTestingClient(client); + } +} diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts new file mode 100644 index 00000000..ee0b5f7f --- /dev/null +++ b/tests/accuracy/sdk/agent.ts @@ -0,0 +1,56 @@ +import { generateText, LanguageModelV1, experimental_createMCPClient } from "ai"; +import { Model } from "./models.js"; + +const systemPrompt = [ + 'The keywords "MUST", "MUST NOT", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119', + "You are an expert AI assistant with access to a set of tools for MongoDB database operations.", + "You MUST use the most relevant tool to answer the user's request", + "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", + "If a task requires multiple tool calls, you MUST call all the necessary tools in sequence, following the requirements mentioned above for each tool called.", + 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', +]; + +// These types are not exported by Vercel SDK so we derive them here 
to be +// re-used again. +export type VercelMCPClient = Awaited>; +export type VercelMCPClientTools = Awaited>; +export type VercelAgent = ReturnType; + +export interface VercelAgentPromptResult { + respondingModel: string; + tokensUsage?: { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; + }; + text: string; + messages: Record[]; +} + +// Generic interface for Agent, in case we need to switch to some other agent +// development SDK +export interface Agent { + prompt(prompt: string, model: Model, tools: Tools): Promise; +} + +export function getVercelToolCallingAgent( + requestedSystemPrompt?: string +): Agent, VercelMCPClientTools, VercelAgentPromptResult> { + return { + async prompt(prompt: string, model: Model, tools: VercelMCPClientTools) { + const result = await generateText({ + model: model.getModel(), + system: [...systemPrompt, requestedSystemPrompt].filter(Boolean).join("\n"), + prompt, + tools, + maxSteps: 100, + }); + return { + text: result.text, + messages: result.response.messages, + respondingModel: result.response.modelId, + tokensUsage: result.usage, + }; + }, + }; +} diff --git a/tests/accuracy/sdk/constants.ts b/tests/accuracy/sdk/constants.ts new file mode 100644 index 00000000..0598b1a7 --- /dev/null +++ b/tests/accuracy/sdk/constants.ts @@ -0,0 +1,22 @@ +import path from "path"; +import { fileURLToPath } from "url"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); // directory of this module, not the module file + +export const ROOT_DIR = path.join(__dirname, "..", "..", ".."); + +export const DIST_DIR = path.join(ROOT_DIR, "dist"); + +export const RESOURCES_DIR = path.join(ROOT_DIR, "resources"); + +export const MCP_SERVER_CLI_SCRIPT = path.join(DIST_DIR, "index.js"); + +export const TEST_DATA_DUMPS_DIR = path.join(__dirname, "..", "test-data-dumps"); + +export const GENERATED_ASSETS_DIR = path.join(ROOT_DIR, ".accuracy"); + +export const LOCAL_SNAPSHOTS_FILE = path.join(GENERATED_ASSETS_DIR, "snapshots.json"); + +export const HTML_TESTS_SUMMARY_FILE = 
path.join(GENERATED_ASSETS_DIR, "tests-summary.html"); + +export const HTML_TESTS_SUMMARY_TEMPLATE = path.join(RESOURCES_DIR, "test-summary-template.html"); diff --git a/tests/accuracy/sdk/describe-accuracy-tests.ts b/tests/accuracy/sdk/describe-accuracy-tests.ts new file mode 100644 index 00000000..2a358ce1 --- /dev/null +++ b/tests/accuracy/sdk/describe-accuracy-tests.ts @@ -0,0 +1,119 @@ +import { TestableModels } from "./models.js"; +import { calculateToolCallingAccuracy } from "./accuracy-scorer.js"; +import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; +import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; +import { AccuracyTestingClient, MockedTools } from "./accuracy-testing-client.js"; +import { getAccuracySnapshotStorage } from "./accuracy-snapshot-storage/get-snapshot-storage.js"; +import { AccuracySnapshotStorage, ExpectedToolCall } from "./accuracy-snapshot-storage/snapshot-storage.js"; +import { getCommitSHA } from "./git-info.js"; + +export interface AccuracyTestConfig { + /** The prompt to be provided to LLM for evaluation. */ + prompt: string; + + /** + * A list of tools and their parameters that we expect LLM to call based on + * how vague or detailed the prompt is. Ideally this should be a list of + * bare minimum and critical tool calls that are required to solve the + * problem mentioned in the prompt but because, for even a slightly vague + * prompt, LLM might decide to do additional confirmation by calling other + * tools, its fine to include those other tool calls as well to get a + * perfect 1 on the tool calling accuracy score. */ + expectedToolCalls: ExpectedToolCall[]; + + /** + * The additional system prompt to be appended to already injected system + * prompt. 
*/ + systemPrompt?: string; + + /** + * A small hint appended to the actual prompt in test, which is supposed to + * hint LLM to assume that the MCP server is already connected so that it + * does not call the connect tool. + * By default it is assumed to be true */ + injectConnectedAssumption?: boolean; + + /** + * A map of tool names to their mocked implementation. When the mocked + * implementations are available, the testing client will prefer those over + * actual MCP tool calls. */ + mockedTools?: MockedTools; +} + +export function describeAccuracyTests(models: TestableModels, accuracyTestConfigs: AccuracyTestConfig[]) { + if (!process.env.MDB_ACCURACY_RUN_ID) { + throw new Error("MDB_ACCURACY_RUN_ID env variable is required for accuracy test runs!"); + } + + if (!models.length) { + throw new Error("No models available to test. Ensure that the API keys are properly setup!"); + } + + const eachModel = describe.each(models); + + eachModel(`$displayName`, function (model) { + const accuracyRunId = `${process.env.MDB_ACCURACY_RUN_ID}`; + const mdbIntegration = setupMongoDBIntegrationTest(); + const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); + + let commitSHA: string; + let accuracySnapshotStorage: AccuracySnapshotStorage; + let testMCPClient: AccuracyTestingClient; + let agent: VercelAgent; + + beforeAll(async () => { + const retrievedCommitSHA = await getCommitSHA(); + if (!retrievedCommitSHA) { + throw new Error("Could not derive commitSHA, exiting accuracy tests!"); + } + commitSHA = retrievedCommitSHA; + + accuracySnapshotStorage = await getAccuracySnapshotStorage(); + testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); + agent = getVercelToolCallingAgent(); + }); + + beforeEach(async () => { + await cleanupTestDatabases(mdbIntegration); + await populateTestData(); + testMCPClient.resetForTests(); + }); + + afterAll(async () => { + await accuracySnapshotStorage?.close(); + await 
testMCPClient?.close(); + }); + + const eachTest = it.each(accuracyTestConfigs); + + eachTest("$prompt", async function (testConfig) { + testMCPClient.mockTools(testConfig.mockedTools ?? {}); + const toolsForModel = await testMCPClient.vercelTools(); + const promptForModel = + testConfig.injectConnectedAssumption === false + ? testConfig.prompt + : [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" "); + + const timeBeforePrompt = Date.now(); + const result = await agent.prompt(promptForModel, model, toolsForModel); + const timeAfterPrompt = Date.now(); + + const llmToolCalls = testMCPClient.getLLMToolCalls(); + const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, llmToolCalls); + + const responseTime = timeAfterPrompt - timeBeforePrompt; + await accuracySnapshotStorage.createSnapshotEntry({ + accuracyRunId, + commitSHA, + provider: model.provider, + requestedModel: model.modelName, + prompt: testConfig.prompt, + llmResponseTime: responseTime, + toolCallingAccuracy: toolCallingAccuracy, + actualToolCalls: llmToolCalls, + expectedToolCalls: testConfig.expectedToolCalls, + ...result, + }); + }); + }); +} diff --git a/tests/accuracy/sdk/git-info.ts b/tests/accuracy/sdk/git-info.ts new file mode 100644 index 00000000..a0918a6f --- /dev/null +++ b/tests/accuracy/sdk/git-info.ts @@ -0,0 +1,12 @@ +import { simpleGit } from "simple-git"; + +export async function getCommitSHA(): Promise { + const commitLogs = await simpleGit().log(); + const lastCommit = commitLogs.latest; + return lastCommit?.hash; +} + +export async function getMergeBase(targetBranch: string, workBranchOrCommit: string): Promise { + const result = await simpleGit().raw(["merge-base", targetBranch, workBranchOrCommit]); + return result.trim(); +} diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts new file mode 100644 index 00000000..9f47028f --- /dev/null +++ b/tests/accuracy/sdk/models.ts @@ -0,0 +1,97 @@ 
+import { LanguageModelV1 } from "ai"; +import { createGoogleGenerativeAI } from "@himanshusinghs/google"; +import { createAzure } from "@ai-sdk/azure"; +import { createOpenAI } from "@ai-sdk/openai"; +import { ollama } from "ollama-ai-provider"; + +export interface Model

{ + readonly modelName: string; + readonly provider: string; + readonly displayName: string; + isAvailable(): boolean; + getModel(): P; +} + +export class OpenAIModel implements Model { + readonly provider = "OpenAI"; + readonly displayName: string; + + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; + } + + isAvailable(): boolean { + return !!process.env.MDB_OPEN_AI_API_KEY; + } + + getModel() { + return createOpenAI({ + apiKey: process.env.MDB_OPEN_AI_API_KEY, + })(this.modelName); + } +} + +export class AzureOpenAIModel implements Model { + readonly provider = "Azure"; + readonly displayName: string; + + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; + } + + isAvailable(): boolean { + return !!process.env.MDB_AZURE_OPEN_AI_API_KEY && !!process.env.MDB_AZURE_OPEN_AI_API_URL; + } + + getModel() { + return createAzure({ + baseURL: process.env.MDB_AZURE_OPEN_AI_API_URL, + apiKey: process.env.MDB_AZURE_OPEN_AI_API_KEY, + apiVersion: "2024-12-01-preview", + })(this.modelName); + } +} + +export class GeminiModel implements Model { + readonly provider = "Google"; + readonly displayName: string; + + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; + } + + isAvailable(): boolean { + return !!process.env.MDB_GEMINI_API_KEY; + } + + getModel() { + return createGoogleGenerativeAI({ + apiKey: process.env.MDB_GEMINI_API_KEY, + })(this.modelName); + } +} + +export class OllamaModel implements Model { + readonly provider = "Ollama"; + readonly displayName: string; + + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; + } + + isAvailable(): boolean { + return true; + } + + getModel() { + return ollama(this.modelName); + } +} + +const ALL_TESTABLE_MODELS = [new AzureOpenAIModel("gpt-4o")]; + +export type TestableModels = ReturnType; + +export function getAvailableModels() { + return 
ALL_TESTABLE_MODELS.filter((model) => model.isAvailable()); +} diff --git a/tests/accuracy/test-data-dumps/comics.books.json b/tests/accuracy/test-data-dumps/comics.books.json new file mode 100644 index 00000000..f605f031 --- /dev/null +++ b/tests/accuracy/test-data-dumps/comics.books.json @@ -0,0 +1,417 @@ +[ + { + "_id": "fa53ead3-36f3-414c-9b3a-53aa9cf5038a", + "title": "Configurable dedicated project", + "publisher": "Dark Horse Comics", + "release_date": "2007-03-02T00:00:00", + "issues": 118, + "main_characters": ["Stephen Shaw"], + "genre": ["Sci-Fi"] + }, + { + "_id": "b2e993fb-2688-4ab0-9512-f8ada5faa948", + "title": "Focused intangible service-desk", + "publisher": "Image Comics", + "release_date": "1998-12-07T00:00:00", + "issues": 137, + "main_characters": ["Margaret Hogan"], + "genre": ["Adventure", "Horror"] + }, + { + "_id": "f674a05a-12c8-4344-875c-6cd1fcba8f9d", + "title": "Expanded secondary system engine", + "publisher": "DC Comics", + "release_date": "2012-12-01T00:00:00", + "issues": 227, + "main_characters": ["Joseph Cook", "Tammy Bishop"], + "genre": ["Superhero"] + }, + { + "_id": "bb72b493-2a61-41d7-9406-dfaf6e51a425", + "title": "Customizable zero-defect Graphic Interface", + "publisher": "DC Comics", + "release_date": "2011-02-24T00:00:00", + "issues": 270, + "main_characters": ["Sandra Moss"], + "genre": ["Fantasy"] + }, + { + "_id": "ea85131f-dfc8-4997-b3b0-996138185d73", + "title": "Reduced eco-centric help-desk", + "publisher": "Dark Horse Comics", + "release_date": "2021-03-12T00:00:00", + "issues": 202, + "main_characters": [ + "Margaret Hogan", + "Angelica Stein", + "Tammy Murphy", + "Larry Hensley" + ], + "genre": ["Adventure", "Horror"] + }, + { + "_id": "fdd56270-eb31-4456-8bf4-df81371eb290", + "title": "Triple-buffered dedicated help-desk", + "publisher": "Image Comics", + "release_date": "1964-09-20T00:00:00", + "issues": 36, + "main_characters": [ + "Richard Cooper", + "James Sanchez", + "Micheal Brown", + "Jeremy Rice" + ], 
+ "genre": ["Fantasy", "Action"] + }, + { + "_id": "6de66ba4-3975-4055-824c-cda5caf517d2", + "title": "Operative logistical secured line", + "publisher": "Marvel Comics", + "release_date": "2007-11-19T00:00:00", + "issues": 55, + "main_characters": ["Joseph Bowman", "Robert Logan", "Ashley Watkins"], + "genre": ["Sci-Fi", "Horror"] + }, + { + "_id": "e3cafdbf-e97a-47c9-a848-bdd82e12f8f7", + "title": "Multi-lateral multi-state framework", + "publisher": "IDW Publishing", + "release_date": "2011-09-14T00:00:00", + "issues": 250, + "main_characters": [ + "Ashley Watkins", + "Virginia Watts", + "Lindsay Anderson", + "Scott Garcia" + ], + "genre": ["Action", "Horror"] + }, + { + "_id": "547190cd-5c9e-44c5-b8f9-afeefd039001", + "title": "Re-engineered encompassing standardization", + "publisher": "Marvel Comics", + "release_date": "1987-04-16T00:00:00", + "issues": 235, + "main_characters": ["Julie Goodwin"], + "genre": ["Sci-Fi"] + }, + { + "_id": "ba3d82f7-8edc-408c-8212-c0d6634624ee", + "title": "Fully-configurable local success", + "publisher": "Dark Horse Comics", + "release_date": "1979-09-13T00:00:00", + "issues": 239, + "main_characters": ["Chad Pham", "Lindsay Anderson", "Carlos Burton"], + "genre": ["Adventure"] + }, + { + "_id": "a6bc8677-22ab-415a-bfe2-731a9f887cb9", + "title": "Realigned zero-defect capability", + "publisher": "Marvel Comics", + "release_date": "2023-10-01T00:00:00", + "issues": 163, + "main_characters": ["Kevin Humphrey", "Maria Wright", "Virginia Watts"], + "genre": ["Fantasy", "Action"] + }, + { + "_id": "fb986790-df22-4db4-8168-c76e9e9471f8", + "title": "Sharable bottom-line frame", + "publisher": "IDW Publishing", + "release_date": "2016-09-28T00:00:00", + "issues": 14, + "main_characters": ["Brian Vincent"], + "genre": ["Sci-Fi", "Fantasy"] + }, + { + "_id": "700aa115-dc5a-4be6-b275-bfb943c95ee0", + "title": "Centralized next generation middleware", + "publisher": "Image Comics", + "release_date": "1970-04-16T00:00:00", + "issues": 5, 
+ "main_characters": ["Joseph Cook"], + "genre": ["Fantasy"] + }, + { + "_id": "7959187e-9693-43a1-ae2d-c168431fceb2", + "title": "Re-engineered heuristic array", + "publisher": "IDW Publishing", + "release_date": "2019-02-15T00:00:00", + "issues": 121, + "main_characters": ["Angelica Stein", "Benjamin Morris", "Jeremy Rice"], + "genre": ["Fantasy", "Action"] + }, + { + "_id": "d6018445-5149-42e7-9d87-eb1b181ce20c", + "title": "Programmable transitional collaboration", + "publisher": "DC Comics", + "release_date": "1999-08-10T00:00:00", + "issues": 235, + "main_characters": [ + "Joseph Cook", + "Cynthia Brown", + "Carlos Burton", + "Micheal Brown" + ], + "genre": ["Adventure"] + }, + { + "_id": "055507ff-7a48-4df8-9ba9-7b6c10e11836", + "title": "Object-based dynamic knowledgebase", + "publisher": "Image Comics", + "release_date": "1993-02-24T00:00:00", + "issues": 189, + "main_characters": [ + "Cristian Oneal", + "Brian Vincent", + "Holly Green", + "James Sanchez" + ], + "genre": ["Sci-Fi", "Fantasy"] + }, + { + "_id": "1add2da3-68e6-48a3-9703-b593c9e0bf2e", + "title": "Enhanced asynchronous matrices", + "publisher": "DC Comics", + "release_date": "2001-03-01T00:00:00", + "issues": 176, + "main_characters": ["Justin Martinez", "Tammy Murphy"], + "genre": ["Action", "Fantasy"] + }, + { + "_id": "c0fe2869-eb7d-4f09-a773-028387a54969", + "title": "Synergized maximized artificial intelligence", + "publisher": "DC Comics", + "release_date": "1976-09-05T00:00:00", + "issues": 68, + "main_characters": ["Christopher Elliott", "Maria Wright"], + "genre": ["Superhero", "Adventure"] + }, + { + "_id": "c2fafbf6-5f71-4f31-9775-803e8c77e467", + "title": "Switchable bottom-line complexity", + "publisher": "Marvel Comics", + "release_date": "2012-08-12T00:00:00", + "issues": 156, + "main_characters": [ + "Lindsay Anderson", + "Virginia Watts", + "Robert Logan", + "Margaret Hogan" + ], + "genre": ["Adventure"] + }, + { + "_id": "f72be3a7-d4be-40a1-ad66-370b44759047", + "title": 
"Triple-buffered impactful customer loyalty", + "publisher": "Marvel Comics", + "release_date": "1976-09-18T00:00:00", + "issues": 275, + "main_characters": ["Sandra Moss", "Charles Blair", "Justin Martinez"], + "genre": ["Fantasy", "Action"] + }, + { + "_id": "da5be16e-13e8-42d5-8954-bd89919395af", + "title": "Programmable 24/7 website", + "publisher": "DC Comics", + "release_date": "2023-11-06T00:00:00", + "issues": 278, + "main_characters": [ + "Luis Callahan", + "Carlos Burton", + "Cristian Oneal", + "Michelle Valdez" + ], + "genre": ["Horror", "Fantasy"] + }, + { + "_id": "92afc1e6-f703-4aa7-9866-3b62f2784fec", + "title": "Advanced incremental framework", + "publisher": "Image Comics", + "release_date": "2008-07-21T00:00:00", + "issues": 109, + "main_characters": ["Holly Green", "Diana Mata", "Julie Goodwin"], + "genre": ["Horror", "Sci-Fi"] + }, + { + "_id": "fec61fdd-bddb-431a-b14a-d81601a47cf8", + "title": "Front-line coherent system engine", + "publisher": "DC Comics", + "release_date": "2012-04-27T00:00:00", + "issues": 297, + "main_characters": ["Joshua Hicks"], + "genre": ["Action", "Horror"] + }, + { + "_id": "9d37d0d7-1adc-4f54-8790-30f13472520c", + "title": "Progressive systematic superstructure", + "publisher": "Image Comics", + "release_date": "1996-02-20T00:00:00", + "issues": 295, + "main_characters": ["Margaret Hogan", "Christopher Elliott", "Joseph Cook"], + "genre": ["Fantasy", "Adventure"] + }, + { + "_id": "338a83ad-06fc-42e1-a605-60a192ce5643", + "title": "Implemented national help-desk", + "publisher": "DC Comics", + "release_date": "2015-05-11T00:00:00", + "issues": 257, + "main_characters": [ + "Lindsay Anderson", + "James Sanchez", + "Julie Goodwin", + "Charles Blair" + ], + "genre": ["Action"] + }, + { + "_id": "5b07c17b-4df9-4b72-9c3e-b51d93def1fb", + "title": "Down-sized impactful workforce", + "publisher": "IDW Publishing", + "release_date": "2024-06-19T00:00:00", + "issues": 259, + "main_characters": ["Debbie Green"], + "genre": 
["Sci-Fi", "Superhero"] + }, + { + "_id": "625b11a5-bb45-4837-9cd6-50bfe2e3396c", + "title": "Re-engineered leadingedge structure", + "publisher": "DC Comics", + "release_date": "2011-04-14T00:00:00", + "issues": 282, + "main_characters": [ + "Larry Hensley", + "Joseph Cook", + "Brian Vincent", + "Sandra Moss" + ], + "genre": ["Adventure"] + }, + { + "_id": "71b845f3-4416-430a-81eb-8c208f824365", + "title": "Cloned 3rdgeneration contingency", + "publisher": "Dark Horse Comics", + "release_date": "2002-07-11T00:00:00", + "issues": 238, + "main_characters": [ + "Larry Hensley", + "Margaret Hogan", + "Holly Green", + "Joseph Bowman" + ], + "genre": ["Superhero", "Fantasy"] + }, + { + "_id": "14dbf3a6-d258-4c96-8883-336b60bc2112", + "title": "Secured zero tolerance monitoring", + "publisher": "DC Comics", + "release_date": "1969-11-30T00:00:00", + "issues": 104, + "main_characters": ["Micheal Brown"], + "genre": ["Horror", "Superhero"] + }, + { + "_id": "091e16d8-d50c-4e7d-9b3a-545cf2596738", + "title": "Automated bifurcated access", + "publisher": "Image Comics", + "release_date": "1990-01-24T00:00:00", + "issues": 74, + "main_characters": ["Robert Logan"], + "genre": ["Sci-Fi"] + }, + { + "_id": "c47ec96a-4d6e-43ea-9bb5-00e4c8058b53", + "title": "Universal high-level pricing structure", + "publisher": "DC Comics", + "release_date": "1971-04-21T00:00:00", + "issues": 135, + "main_characters": ["Jeremy Rice", "Elizabeth Robinson", "James Sanchez"], + "genre": ["Action", "Sci-Fi"] + }, + { + "_id": "d446a8ca-5d01-4be9-a061-027ef1f7bfc6", + "title": "Reduced optimizing strategy", + "publisher": "Dark Horse Comics", + "release_date": "1984-06-24T00:00:00", + "issues": 111, + "main_characters": ["Joshua Hicks", "Jeremy Rice", "Micheal Brown"], + "genre": ["Fantasy", "Superhero"] + }, + { + "_id": "09c734ff-2bf0-4cb6-bd42-4232209c00c9", + "title": "Virtual non-volatile groupware", + "publisher": "DC Comics", + "release_date": "2013-05-22T00:00:00", + "issues": 13, + 
"main_characters": ["Luis Callahan", "Tammy Bishop", "Cynthia Brown"], + "genre": ["Action"] + }, + { + "_id": "691034fa-ad52-413e-96a2-a9a319fffe7b", + "title": "Horizontal disintermediate extranet", + "publisher": "DC Comics", + "release_date": "2021-12-03T00:00:00", + "issues": 129, + "main_characters": ["Margaret Hogan"], + "genre": ["Action"] + }, + { + "_id": "07942b5a-f7c4-4fc1-bdeb-7eb46b0d57f8", + "title": "Cross-platform discrete framework", + "publisher": "Dark Horse Comics", + "release_date": "2001-08-02T00:00:00", + "issues": 38, + "main_characters": ["James Sanchez", "Larry Hensley"], + "genre": ["Superhero"] + }, + { + "_id": "05d637ed-3942-4276-a885-7b3363dd48e2", + "title": "Cross-platform regional info-mediaries", + "publisher": "Image Comics", + "release_date": "2005-03-30T00:00:00", + "issues": 150, + "main_characters": ["Carlos Burton"], + "genre": ["Superhero", "Fantasy"] + }, + { + "_id": "88904f06-50a6-44f1-bccc-f379a9788611", + "title": "Mandatory 6thgeneration secured line", + "publisher": "Image Comics", + "release_date": "2021-06-27T00:00:00", + "issues": 262, + "main_characters": ["Luis Callahan"], + "genre": ["Sci-Fi", "Superhero"] + }, + { + "_id": "fc961fd6-2ec6-43e5-beae-7f58a6c25d9c", + "title": "Exclusive interactive concept", + "publisher": "IDW Publishing", + "release_date": "1969-06-03T00:00:00", + "issues": 264, + "main_characters": ["Scott Garcia", "Joseph Bowman"], + "genre": ["Fantasy", "Superhero"] + }, + { + "_id": "481a3ea6-9629-4fe6-8a5a-eba846f0e62c", + "title": "Focused intermediate methodology", + "publisher": "DC Comics", + "release_date": "2004-03-19T00:00:00", + "issues": 210, + "main_characters": [ + "Justin Martinez", + "Julie Goodwin", + "Benjamin Morris", + "Virginia Watts" + ], + "genre": ["Adventure", "Action"] + }, + { + "_id": "6bab6bcd-2f6b-4dfb-a030-d63b32fc6250", + "title": "Right-sized contextually-based toolset", + "publisher": "IDW Publishing", + "release_date": "2007-12-27T00:00:00", + "issues": 
117, + "main_characters": ["Debbie Green", "Christopher Elliott", "Joshua Hicks"], + "genre": ["Sci-Fi", "Action"] + } +] diff --git a/tests/accuracy/test-data-dumps/comics.characters.json b/tests/accuracy/test-data-dumps/comics.characters.json new file mode 100644 index 00000000..4a255f48 --- /dev/null +++ b/tests/accuracy/test-data-dumps/comics.characters.json @@ -0,0 +1,402 @@ +[ + { + "_id": "d7047787-abea-40fa-b78e-939925fd3589", + "name": "Elizabeth Robinson", + "alias": "ashley62", + "powers": ["Shapeshifting", "Telepathy", "Flight"], + "first_appearance": "1961-06-23T00:00:00", + "affiliations": ["Fantastic Four", "X-Men"], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "06ac8173-51a6-404c-8f9a-628de889b1de", + "name": "Joshua Wang", + "alias": "paulasmith", + "powers": ["Telekinesis"], + "first_appearance": "1987-04-16T00:00:00", + "affiliations": ["Fantastic Four", "Justice League"], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "252c203a-0271-4ee7-a3d9-34c9f922b959", + "name": "Stephen Shaw", + "alias": "adamskenneth", + "powers": ["Super Speed", "Flight"], + "first_appearance": "2004-07-26T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": true + }, + { + "_id": "bf5b7d04-fe71-4969-84a3-0eb9ed5d2197", + "name": "Joseph Bowman", + "alias": "amysalazar", + "powers": ["Time Manipulation"], + "first_appearance": "1961-07-03T00:00:00", + "affiliations": ["Teen Titans", "Avengers"], + "origin": "Atlantis", + "is_villain": true + }, + { + "_id": "c6271161-bd78-4338-b6ca-88d91f7b853e", + "name": "Debbie Green", + "alias": "steventodd", + "powers": ["Energy Blasts", "Regeneration"], + "first_appearance": "2021-12-05T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "60223f4c-5908-4f82-a2a3-a5dad1771f7f", + "name": "Christopher Elliott", + "alias": "barajasmitchell", + "powers": ["Flight", "Invisibility", "Telekinesis"], + "first_appearance": "1947-03-23T00:00:00", + 
"affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "f66a8f7a-9ca3-431a-9ece-aba96be18220", + "name": "Tammy Murphy", + "alias": "jessicagill", + "powers": ["Super Strength", "Telekinesis"], + "first_appearance": "2000-07-06T00:00:00", + "affiliations": [], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "817c0b11-3eac-4a3a-b55f-203126db060f", + "name": "Scott Garcia", + "alias": "whitechristie", + "powers": ["Telepathy", "Energy Blasts"], + "first_appearance": "2000-11-22T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "1ee6789f-d774-43b8-87e2-9f6dbac6230a", + "name": "Julie Goodwin", + "alias": "robertsmith", + "powers": ["Telepathy", "Super Speed"], + "first_appearance": "1953-08-09T00:00:00", + "affiliations": ["Teen Titans"], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "3ab9b55d-94ab-449e-bda9-63b2c633494a", + "name": "Joshua Hicks", + "alias": "cynthia32", + "powers": ["Super Strength", "Invisibility", "Telekinesis"], + "first_appearance": "1967-07-17T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "51adf385-1f8e-4290-bcc6-ce2808dc461e", + "name": "Justin Martinez", + "alias": "janicebrown", + "powers": ["Super Speed", "Super Strength"], + "first_appearance": "1973-09-19T00:00:00", + "affiliations": ["Avengers"], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "3a3d934e-f5bb-4238-b8a5-74669a937a14", + "name": "Holly Green", + "alias": "ystanley", + "powers": ["Shapeshifting", "Energy Blasts"], + "first_appearance": "2013-08-05T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": true + }, + { + "_id": "f044b9fb-82c6-48b3-b8b2-806b0be66466", + "name": "Margaret Hogan", + "alias": "wendyconway", + "powers": ["Super Speed", "Telepathy"], + "first_appearance": "1944-08-13T00:00:00", + "affiliations": ["Justice League", "X-Men"], + "origin": "Earth", + "is_villain": false + }, + { 
+ "_id": "fd50880a-9d0e-43e1-8b20-2830eba8c7dc", + "name": "Ashley Watkins", + "alias": "cjohnson", + "powers": ["Shapeshifting"], + "first_appearance": "1940-09-13T00:00:00", + "affiliations": ["Fantastic Four", "Guardians of the Galaxy"], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "68036d6b-1780-4352-98ea-2c68cb5c7bff", + "name": "Tammy Bishop", + "alias": "geoffreyryan", + "powers": ["Regeneration"], + "first_appearance": "1984-11-04T00:00:00", + "affiliations": ["Fantastic Four", "X-Men"], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "dbfa84f2-e598-4e67-99a9-5e8c34e5606f", + "name": "Michelle Valdez", + "alias": "manuelcobb", + "powers": ["Regeneration", "Energy Blasts"], + "first_appearance": "2014-08-04T00:00:00", + "affiliations": ["Teen Titans"], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "ae85885c-13d0-4ae2-b82c-fa53859665d7", + "name": "Joseph Cook", + "alias": "scott40", + "powers": ["Telepathy", "Telekinesis"], + "first_appearance": "1976-04-01T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "0738b98f-4699-4609-9156-fb6a1085a503", + "name": "Jeremy Rice", + "alias": "james82", + "powers": ["Invisibility"], + "first_appearance": "1977-09-22T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "a072c5df-cc65-4044-ba24-fcc8eaa71b4a", + "name": "Chad Pham", + "alias": "smithjennifer", + "powers": ["Telepathy"], + "first_appearance": "2001-05-26T00:00:00", + "affiliations": ["Teen Titans"], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "d545ec48-680c-4493-8650-d759bedabb7e", + "name": "Diana Mata", + "alias": "zwilliamson", + "powers": ["Super Speed", "Energy Blasts", "Invisibility"], + "first_appearance": "2010-11-21T00:00:00", + "affiliations": [], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "e6bfb576-d65c-40f8-a547-90719578e03c", + "name": "Maria Wright", + "alias": "yraymond", + "powers": 
["Flight", "Telepathy"], + "first_appearance": "1971-04-15T00:00:00", + "affiliations": ["Avengers", "Teen Titans"], + "origin": "Asgard", + "is_villain": true + }, + { + "_id": "a2e7b056-0c79-4a2e-83ff-1774b6e186ea", + "name": "Carlos Burton", + "alias": "rperkins", + "powers": ["Super Speed", "Time Manipulation", "Telekinesis"], + "first_appearance": "1970-01-20T00:00:00", + "affiliations": ["Teen Titans"], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "ec7f8d60-3fef-4329-a7d2-6d89805d758c", + "name": "Lindsay Anderson", + "alias": "amycox", + "powers": ["Super Strength", "Telekinesis"], + "first_appearance": "1976-04-30T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "cdc66356-a438-4989-b4d1-315609ec6d91", + "name": "Larry Hensley", + "alias": "ylester", + "powers": ["Super Strength", "Invisibility", "Shapeshifting"], + "first_appearance": "2019-01-21T00:00:00", + "affiliations": ["Guardians of the Galaxy", "Avengers"], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "0952b684-f887-446f-afcb-71d2ace3fd32", + "name": "Sandra Moss", + "alias": "alexandra81", + "powers": ["Telekinesis", "Super Speed"], + "first_appearance": "1989-07-28T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "9a63c787-3b44-46c2-b927-ffdde6ee10bc", + "name": "Cynthia Brown", + "alias": "freed", + "powers": ["Super Strength", "Energy Blasts"], + "first_appearance": "2015-06-19T00:00:00", + "affiliations": ["Fantastic Four"], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "2b058c3e-e795-4ecd-b5d7-dba6f1a831f6", + "name": "Brian Vincent", + "alias": "ghowell", + "powers": ["Invisibility", "Flight", "Super Speed"], + "first_appearance": "2012-05-12T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "7a1e38ae-0bc6-41dd-ad61-e7542e6e9d4f", + "name": "Kevin Humphrey", + "alias": "mary44", + "powers": ["Super Strength", 
"Super Speed", "Telepathy"], + "first_appearance": "1993-05-10T00:00:00", + "affiliations": ["Justice League", "Teen Titans"], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "c147036a-ab66-4023-a950-1fb81acf7dca", + "name": "Luis Callahan", + "alias": "ashleyreeves", + "powers": ["Telekinesis"], + "first_appearance": "1943-11-02T00:00:00", + "affiliations": ["X-Men"], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "c42cec2b-156d-481e-993b-aa93637ae76e", + "name": "Micheal Brown", + "alias": "lisa85", + "powers": ["Telepathy", "Flight", "Time Manipulation"], + "first_appearance": "1983-11-04T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "5bd85192-926b-42f3-bc18-afd40a53753e", + "name": "James Sanchez", + "alias": "mary95", + "powers": ["Energy Blasts", "Telekinesis"], + "first_appearance": "1999-05-20T00:00:00", + "affiliations": ["Justice League"], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "4b41e8f8-2cea-4d50-b7b0-ec59fca45367", + "name": "Richard Cooper", + "alias": "james85", + "powers": ["Telekinesis", "Energy Blasts", "Super Speed"], + "first_appearance": "2021-11-27T00:00:00", + "affiliations": ["Justice League", "Fantastic Four"], + "origin": "Mars", + "is_villain": true + }, + { + "_id": "8fd8c7b5-fabd-4021-9aeb-114e64ad06e0", + "name": "Charles Blair", + "alias": "barbara60", + "powers": ["Super Strength"], + "first_appearance": "2012-05-03T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "830eaa54-4397-4344-8964-2abdd7e2d86d", + "name": "Virginia Watts", + "alias": "klane", + "powers": ["Telekinesis"], + "first_appearance": "2016-04-27T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "495f64a9-123e-46d4-9ddb-21692353a849", + "name": "Robert Logan", + "alias": "griffinsean", + "powers": ["Telepathy"], + "first_appearance": "2003-07-16T00:00:00", + "affiliations": [], 
+ "origin": "Krypton", + "is_villain": false + }, + { + "_id": "e3a96aac-bd9f-49f0-a9ea-efa7d6baf3e9", + "name": "Cheyenne Powell", + "alias": "laurenolsen", + "powers": ["Time Manipulation", "Energy Blasts"], + "first_appearance": "1964-02-05T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "2688321c-f5b0-43c8-b95c-060e748ba73b", + "name": "Benjamin Morris", + "alias": "sierra18", + "powers": ["Telekinesis", "Regeneration", "Shapeshifting"], + "first_appearance": "1964-09-27T00:00:00", + "affiliations": ["X-Men", "Avengers"], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "98c4ca66-c7a7-44ad-ad16-5395905a011e", + "name": "Cristian Oneal", + "alias": "harrellamy", + "powers": ["Super Speed"], + "first_appearance": "1965-01-29T00:00:00", + "affiliations": [], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "e2999d26-1a93-4355-b04f-44f27a3c7f36", + "name": "Jessica Vargas", + "alias": "chadherrera", + "powers": ["Energy Blasts", "Super Strength", "Telekinesis"], + "first_appearance": "1974-03-29T00:00:00", + "affiliations": ["X-Men", "Teen Titans"], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "f3fa712d-2124-433a-b405-c02757fa1503", + "name": "Angelica Stein", + "alias": "reedjason", + "powers": ["Invisibility"], + "first_appearance": "1981-01-02T00:00:00", + "affiliations": ["Avengers"], + "origin": "Earth", + "is_villain": true + } +] diff --git a/tests/accuracy/test-data-dumps/mflix.movies.json b/tests/accuracy/test-data-dumps/mflix.movies.json new file mode 100644 index 00000000..3c492185 --- /dev/null +++ b/tests/accuracy/test-data-dumps/mflix.movies.json @@ -0,0 +1,496 @@ +[ + { + "_id": "bf96c9f7-17be-467c-9f5e-3f19dc2e9ed4", + "title": "Human sell", + "release_year": 1993, + "genres": ["Sci-Fi"], + "director": "Christina Collins", + "cast": ["Jeremy Marks", "Matthew Moore", "Erica Miller", "Beth Morales"], + "runtime": 139, + "rating": 9.3 + }, + { + "_id": 
"ab338dcb-c541-4d39-ba3d-58e4ebcac16c", + "title": "Trial we much", + "release_year": 2020, + "genres": ["Horror", "Comedy"], + "director": "Steven Miles", + "cast": [ + "Patrick Huynh", + "Darrell Thompson", + "Lindsay Thompson", + "Brandi Cooper" + ], + "runtime": 149, + "rating": 5.0 + }, + { + "_id": "2bd3ed9f-cbeb-4c44-bec7-01d51c3dd7db", + "title": "Someone", + "release_year": 1996, + "genres": ["Action", "Horror"], + "director": "Steven Miles", + "cast": [ + "Carrie Cummings", + "Patricia Rice", + "Suzanne Collins", + "April Murray", + "Kimberly Shaw" + ], + "runtime": 153, + "rating": 2.6 + }, + { + "_id": "fb35d6f3-bda5-450f-8873-56e035e76c42", + "title": "Without our", + "release_year": 2012, + "genres": ["Comedy"], + "director": "Christina Collins", + "cast": [ + "Rodney Gray", + "Mr. Joseph Allen", + "Heather Robles", + "Eric Edwards", + "James Wilson" + ], + "runtime": 143, + "rating": 9.1 + }, + { + "_id": "4b0d5f7a-c551-4995-aece-a5a585d238a7", + "title": "Cost anything", + "release_year": 2002, + "genres": ["Romance", "Action"], + "director": "Bryan Andrews", + "cast": ["Gregory Mullins", "Jillian Arroyo", "Angela Reed"], + "runtime": 112, + "rating": 3.8 + }, + { + "_id": "797e4ee5-eff4-45f4-a0d7-40f62f7bd138", + "title": "Hold green energy their", + "release_year": 1989, + "genres": ["Horror"], + "director": "Christina Collins", + "cast": [ + "Eduardo Carey", + "Jodi Miller", + "Ronald Johnson", + "Lindsay Hernandez" + ], + "runtime": 126, + "rating": 7.4 + }, + { + "_id": "1b81c45b-1d09-47dc-871f-ace109107446", + "title": "Choose ability start", + "release_year": 1990, + "genres": ["Drama", "Comedy"], + "director": "Bryan Andrews", + "cast": [ + "Tyler Daniels", + "Gregory Harris", + "Whitney Swanson", + "Pamela Ramirez" + ], + "runtime": 141, + "rating": 5.6 + }, + { + "_id": "400a08be-f07b-416a-8cdc-46c9886b812b", + "title": "Cover perhaps", + "release_year": 2022, + "genres": ["Drama"], + "director": "Daniel Wallace", + "cast": ["Victoria 
Price", "Holly Ross", "Michele Jones"], + "runtime": 173, + "rating": 4.3 + }, + { + "_id": "4d4b5420-83e1-4ecd-9c86-238394a1fd0f", + "title": "Policy particularly", + "release_year": 2003, + "genres": ["Comedy"], + "director": "Brittany Parker", + "cast": ["Emily Haynes", "Crystal Johnson", "Ernest Jones"], + "runtime": 154, + "rating": 6.6 + }, + { + "_id": "9a489559-ab9d-4dbb-b3e7-d65895b27704", + "title": "Store care", + "release_year": 2017, + "genres": ["Romance", "Sci-Fi"], + "director": "Sara Stewart", + "cast": [ + "Katherine Matthews", + "Stacey Wolf", + "Laurie Blackwell", + "Luis Ortiz", + "Christopher Vasquez" + ], + "runtime": 168, + "rating": 7.7 + }, + { + "_id": "99e75e60-6466-4314-92c3-00c433a06600", + "title": "Section close bad", + "release_year": 2024, + "genres": ["Drama", "Comedy"], + "director": "Bryan Andrews", + "cast": [ + "Heather Marshall", + "Alexander Austin", + "Stephanie Villarreal MD", + "Ryan Marquez" + ], + "runtime": 180, + "rating": 7.7 + }, + { + "_id": "726d0c12-4bab-4684-b8e4-5ba795c88273", + "title": "Become stand", + "release_year": 2001, + "genres": ["Sci-Fi", "Thriller"], + "director": "Brian Martinez", + "cast": ["Robert Ross", "Kimberly Williamson", "Pam Wyatt"], + "runtime": 162, + "rating": 1.5 + }, + { + "_id": "aad23b4b-ddb9-48bd-9b48-b63da1874bb0", + "title": "I case", + "release_year": 2012, + "genres": ["Drama", "Comedy"], + "director": "Brittany Parker", + "cast": [ + "Justin Davis", + "Karen Doyle", + "Daniel Jackson", + "Courtney Mcdonald" + ], + "runtime": 122, + "rating": 3.1 + }, + { + "_id": "0d1ce099-18f1-4608-9c5b-5eb8b5870760", + "title": "No organization style", + "release_year": 2013, + "genres": ["Comedy"], + "director": "Christina Collins", + "cast": ["Benjamin Whitney", "Joseph Bush", "Barbara Griffin"], + "runtime": 167, + "rating": 9.6 + }, + { + "_id": "15855c7b-ece2-4238-b995-57f6207509ea", + "title": "Computer garden", + "release_year": 2012, + "genres": ["Horror"], + "director": "Steven 
Miles", + "cast": ["Darlene Lee", "Tina Wang", "Nathan Mayo"], + "runtime": 146, + "rating": 6.5 + }, + { + "_id": "e8a6ff98-1e7e-4481-a467-39ebbfc79f67", + "title": "Trip information feel", + "release_year": 2008, + "genres": ["Action", "Thriller"], + "director": "Brittany Parker", + "cast": ["Kelly Walsh", "Michael Rocha"], + "runtime": 148, + "rating": 9.8 + }, + { + "_id": "ef95e7a5-7f73-462e-bd03-c924a8876a7b", + "title": "It project low part", + "release_year": 1992, + "genres": ["Horror"], + "director": "Christina Collins", + "cast": [ + "Sheena Murphy", + "Amanda Miller", + "Erica Curtis", + "Roger Jones", + "Andrew Simpson" + ], + "runtime": 161, + "rating": 2.4 + }, + { + "_id": "efd2f4f4-1004-4b4e-8bc9-390466a6f77a", + "title": "Near attorney discuss", + "release_year": 1983, + "genres": ["Comedy"], + "director": "Christina Collins", + "cast": [ + "Chase Myers", + "Benjamin Kelly", + "Thomas Summers MD", + "Jessica Woods" + ], + "runtime": 174, + "rating": 9.5 + }, + { + "_id": "07f2cb6e-819e-4ff4-b3ba-134d3d9af549", + "title": "Whether know", + "release_year": 2009, + "genres": ["Comedy", "Thriller"], + "director": "Bryan Andrews", + "cast": ["Amy Reed", "William Williams", "Steven Lawrence"], + "runtime": 134, + "rating": 9.6 + }, + { + "_id": "ab5948c9-088b-42d6-89d9-42c4603c8b19", + "title": "Against place", + "release_year": 2017, + "genres": ["Drama", "Romance"], + "director": "Daniel Wallace", + "cast": [ + "Brittany Thompson", + "Clinton Bishop", + "Terri Meyer", + "Stacey Phillips", + "Alexander Hunt" + ], + "runtime": 152, + "rating": 5.0 + }, + { + "_id": "ef7f63fa-b25f-4aea-98e2-d7bdecc26ef5", + "title": "Return yard", + "release_year": 1994, + "genres": ["Horror"], + "director": "Christina Collins", + "cast": ["Mason Lara", "Taylor Salinas", "Tim Foster", "Erin Sharp"], + "runtime": 99, + "rating": 8.8 + }, + { + "_id": "b532e3c8-6292-4f9d-879f-1f070b1a6992", + "title": "Certain fish", + "release_year": 2009, + "genres": ["Romance"], + 
"director": "Steven Miles", + "cast": [ + "Jonathan King", + "Caitlyn Costa DDS", + "Steve Davis", + "Perry Anderson" + ], + "runtime": 130, + "rating": 8.6 + }, + { + "_id": "c95e74b0-e47e-4d10-b847-8caa20b94b32", + "title": "Agreement like program", + "release_year": 2004, + "genres": ["Sci-Fi"], + "director": "Daniel Jackson", + "cast": [ + "Ashley Green", + "Rebecca Osborne", + "Robert Williams", + "Breanna Dunn", + "Philip Vargas" + ], + "runtime": 110, + "rating": 8.1 + }, + { + "_id": "791688be-4358-45ab-956e-71fe3fd35d19", + "title": "Floor seven then", + "release_year": 2009, + "genres": ["Horror"], + "director": "Daniel Wallace", + "cast": ["Dustin Wright", "Crystal Young"], + "runtime": 143, + "rating": 4.8 + }, + { + "_id": "488fd79d-dde6-4462-9b90-339d1f3d7474", + "title": "Like rather paper", + "release_year": 2006, + "genres": ["Drama"], + "director": "Spencer Gillespie", + "cast": ["Sean Moyer", "James Edwards", "Tara Lee", "Robert Scott"], + "runtime": 175, + "rating": 9.1 + }, + { + "_id": "3da68e4d-ef14-4fab-9243-19075262e5ca", + "title": "Argue hospital", + "release_year": 1994, + "genres": ["Romance", "Sci-Fi"], + "director": "Amanda Young", + "cast": [ + "Carolyn Williams", + "Jasmin Sampson", + "Phillip Levy", + "Brenda Clark", + "Lauren Perry" + ], + "runtime": 149, + "rating": 9.5 + }, + { + "_id": "f5206a16-4dca-4c1e-b3aa-0d09f2082601", + "title": "Become after card", + "release_year": 1986, + "genres": ["Sci-Fi", "Horror"], + "director": "Brian Martinez", + "cast": ["Rhonda Ochoa", "Charlene Castillo"], + "runtime": 100, + "rating": 8.5 + }, + { + "_id": "fbf30e42-ae6d-4775-bb3e-c5c127ddea06", + "title": "Born authority attention", + "release_year": 1994, + "genres": ["Romance"], + "director": "Brian Martinez", + "cast": ["Matthew Thomas", "Carly Perkins"], + "runtime": 131, + "rating": 4.9 + }, + { + "_id": "4b85a220-8a09-46a7-bea3-a2dad8130311", + "title": "Local seven media", + "release_year": 1998, + "genres": ["Sci-Fi", "Drama"], + 
"director": "Amanda Young", + "cast": ["Jessica Perez", "Larry Atkinson"], + "runtime": 95, + "rating": 2.0 + }, + { + "_id": "498597d2-3254-46ef-a800-f322a86fbd55", + "title": "Keep employee", + "release_year": 1981, + "genres": ["Horror"], + "director": "Christina Collins", + "cast": ["Alexis Carlson", "Andrew Stewart"], + "runtime": 161, + "rating": 6.0 + }, + { + "_id": "788d9343-6908-4762-88ee-b04aba1e58b5", + "title": "American question generation", + "release_year": 1986, + "genres": ["Romance"], + "director": "Daniel Jackson", + "cast": ["Troy Carter", "Peter Hernandez", "Christine Brown"], + "runtime": 176, + "rating": 8.0 + }, + { + "_id": "74bcf255-df91-40c0-85c0-d7b85ff84f9a", + "title": "Maintain out", + "release_year": 2000, + "genres": ["Sci-Fi", "Action"], + "director": "Brian Martinez", + "cast": ["Nancy Evans", "Michael Gill", "Justin Carroll"], + "runtime": 179, + "rating": 10.0 + }, + { + "_id": "61ddf1d4-17b7-4c63-9bf4-5315e740dc7f", + "title": "Ten box study", + "release_year": 2011, + "genres": ["Horror", "Romance"], + "director": "Steven Miles", + "cast": [ + "Mark Hicks", + "Michelle Dean", + "John Buchanan", + "Veronica Johnson" + ], + "runtime": 147, + "rating": 2.5 + }, + { + "_id": "ab7d8067-f0ff-4955-bc0c-baca4e56e9a4", + "title": "Production operation", + "release_year": 2014, + "genres": ["Horror", "Romance"], + "director": "Sara Stewart", + "cast": ["Ashley Mata", "Mark Kelly", "John West", "Harold Day"], + "runtime": 125, + "rating": 4.1 + }, + { + "_id": "ccd27288-a496-447d-b01c-1f0b42edcc92", + "title": "What language", + "release_year": 2004, + "genres": ["Sci-Fi"], + "director": "Sara Stewart", + "cast": [ + "Scott Mckenzie", + "Jason Lee", + "Nathan Gardner", + "Jamie Greene", + "Angela Garner" + ], + "runtime": 177, + "rating": 3.7 + }, + { + "_id": "b32dd176-938b-4ded-823a-311423fdc2ea", + "title": "Up usually central", + "release_year": 2011, + "genres": ["Sci-Fi", "Comedy"], + "director": "Daniel Jackson", + "cast": 
["Jennifer Carlson", "Jonathan Stewart DDS", "Amy Lester"], + "runtime": 159, + "rating": 5.6 + }, + { + "_id": "4aa5f384-3a05-49ff-aa9d-a0e4256c422f", + "title": "For boy only", + "release_year": 1987, + "genres": ["Thriller", "Action"], + "director": "Sara Stewart", + "cast": ["Gene Smith", "Robert Osborne Jr.", "Laura Fox", "Alexis Lowe"], + "runtime": 95, + "rating": 3.6 + }, + { + "_id": "1c858ca4-d6e9-435c-8e25-d8b05a4e825c", + "title": "Site win including your", + "release_year": 2008, + "genres": ["Sci-Fi"], + "director": "Spencer Gillespie", + "cast": [ + "John Williams", + "Jason Huang", + "Karen Klein", + "Gary Tran", + "Jessica Murphy" + ], + "runtime": 178, + "rating": 6.2 + }, + { + "_id": "bc5e5766-e998-4ec2-a40c-62ce5d39b972", + "title": "Sell huge hair", + "release_year": 1997, + "genres": ["Thriller", "Action"], + "director": "Bryan Andrews", + "cast": ["Thomas Johnson", "Ryan Morrow"], + "runtime": 157, + "rating": 4.4 + }, + { + "_id": "090215c8-29e8-4d38-ae9b-ceb78408b982", + "title": "Guy rest", + "release_year": 1997, + "genres": ["Sci-Fi", "Horror"], + "director": "Steven Miles", + "cast": ["Michael Fox", "Tyler Acosta", "Tracy Adams"], + "runtime": 122, + "rating": 7.8 + } +] diff --git a/tests/accuracy/test-data-dumps/mflix.shows.json b/tests/accuracy/test-data-dumps/mflix.shows.json new file mode 100644 index 00000000..2edc7fa7 --- /dev/null +++ b/tests/accuracy/test-data-dumps/mflix.shows.json @@ -0,0 +1,572 @@ +[ + { + "_id": "b586e37c-6b32-417d-a53c-2a4c1121b11b", + "title": "Object-based analyzing architecture", + "seasons": 8, + "episodes": 62, + "platform": "Amazon Prime", + "genres": ["Comedy"], + "cast": [ + "Roger Gomez", + "Sandra Williams", + "Matthew Rodriguez", + "Scott Brown", + "Kristie Horn", + "Nicole Avila" + ], + "start_year": 2014, + "end_year": null + }, + { + "_id": "c28471ea-336f-4060-9b18-0bbff3de6622", + "title": "Customer-focused encompassing architecture", + "seasons": 4, + "episodes": 108, + "platform": "Hulu", 
+ "genres": ["Thriller"], + "cast": ["Joseph Holmes", "Patrick Smith", "Charles Delacruz"], + "start_year": 2001, + "end_year": null + }, + { + "_id": "93f0969b-2377-4531-9c4e-45d2593015cd", + "title": "User-centric background approach", + "seasons": 6, + "episodes": 49, + "platform": "HBO", + "genres": ["Comedy", "Documentary"], + "cast": [ + "Jason Castillo", + "Jessica Burke", + "Philip Lewis", + "Philip Goodman", + "Corey Lee" + ], + "start_year": 2016, + "end_year": 2018 + }, + { + "_id": "a0b76db0-99a1-49fe-a5ea-fe802a66bde9", + "title": "Networked directional budgetary management", + "seasons": 5, + "episodes": 23, + "platform": "Amazon Prime", + "genres": ["Comedy", "Thriller"], + "cast": ["Mark Allen", "Anthony Snyder", "Kimberly Jones"], + "start_year": 2002, + "end_year": null + }, + { + "_id": "fbdef9b9-1ad4-4a6b-a39a-2e0b90423cb5", + "title": "Enterprise-wide dynamic intranet", + "seasons": 1, + "episodes": 12, + "platform": "Amazon Prime", + "genres": ["Crime", "Documentary"], + "cast": ["Matthew Green", "Kelly Wright", "Tonya Sullivan", "Daniel Brown"], + "start_year": 2009, + "end_year": 2020 + }, + { + "_id": "db54ab5c-bf6b-48ea-8272-1b1a4a76b848", + "title": "Exclusive real-time access", + "seasons": 10, + "episodes": 76, + "platform": "Amazon Prime", + "genres": ["Drama"], + "cast": ["Stacey Shaw", "Zachary Steele", "Laurie Martinez"], + "start_year": 2011, + "end_year": 2020 + }, + { + "_id": "53869b62-c8c7-48b3-86c9-17c935b43ff6", + "title": "Persevering leadingedge application", + "seasons": 5, + "episodes": 73, + "platform": "HBO", + "genres": ["Thriller"], + "cast": ["Diane Boyd", "Anna Rubio", "Cheryl Fisher", "Tyler Villa"], + "start_year": 2008, + "end_year": 2020 + }, + { + "_id": "3be07c4d-5275-4181-b2f6-5b1a1e46aa7b", + "title": "Multi-lateral analyzing model", + "seasons": 2, + "episodes": 114, + "platform": "Amazon Prime", + "genres": ["Fantasy"], + "cast": [ + "Kathleen Marshall", + "Kimberly Quinn", + "Steven Parker", + "Adrienne 
Green", + "Justin Hughes", + "Jean Smith" + ], + "start_year": 2017, + "end_year": 2023 + }, + { + "_id": "50cb455b-5ec0-4e68-8601-43e58defb762", + "title": "User-centric tangible monitoring", + "seasons": 3, + "episodes": 55, + "platform": "Disney+", + "genres": ["Drama"], + "cast": [ + "Barbara Clark", + "Carolyn Scott", + "Timothy Reed", + "Cory Burton", + "Jacob Hill" + ], + "start_year": 2006, + "end_year": 2012 + }, + { + "_id": "bab2dba4-88bd-4b24-afce-8781eb280d53", + "title": "Persevering background monitoring", + "seasons": 4, + "episodes": 61, + "platform": "Amazon Prime", + "genres": ["Comedy", "Fantasy"], + "cast": ["Adam Lin", "Evan Smith", "Christine Howard", "Ruben Hopkins"], + "start_year": 2006, + "end_year": 2023 + }, + { + "_id": "518f2ad9-bb65-4228-8d4c-7a62b9f88599", + "title": "Cross-group intangible architecture", + "seasons": 1, + "episodes": 90, + "platform": "HBO", + "genres": ["Comedy"], + "cast": [ + "Eric Ryan", + "Ashley Ball", + "Douglas Barton", + "Brian Whitehead", + "Michael Greer" + ], + "start_year": 2018, + "end_year": null + }, + { + "_id": "d5f9304d-567d-4335-b43c-ec4034d7009f", + "title": "Programmable bottom-line monitoring", + "seasons": 10, + "episodes": 69, + "platform": "Hulu", + "genres": ["Documentary", "Fantasy"], + "cast": [ + "Mrs. 
Olivia Booth", + "William Murphy", + "Patricia Payne", + "Lisa Estes", + "Jason Martin", + "Jeff Greene" + ], + "start_year": 2011, + "end_year": 2024 + }, + { + "_id": "27718a30-6e42-47ad-8adf-1533b9b8a419", + "title": "Multi-lateral multi-tasking contingency", + "seasons": 3, + "episodes": 89, + "platform": "Disney+", + "genres": ["Crime"], + "cast": ["Elizabeth Lambert", "Corey Hughes", "Melissa Stephens"], + "start_year": 2006, + "end_year": null + }, + { + "_id": "defc7620-3b4e-46ff-a949-bec1af753812", + "title": "Focused zero administration migration", + "seasons": 9, + "episodes": 73, + "platform": "Disney+", + "genres": ["Documentary", "Drama"], + "cast": ["Shane Richardson", "Lisa Cooper", "Samantha Perkins"], + "start_year": 2008, + "end_year": null + }, + { + "_id": "9d6781fb-d095-4a00-932d-3f1fac1b0049", + "title": "Horizontal methodical encoding", + "seasons": 8, + "episodes": 40, + "platform": "Netflix", + "genres": ["Crime"], + "cast": ["Patricia Barrett", "Scott Gonzalez", "Michaela Johnson"], + "start_year": 2006, + "end_year": null + }, + { + "_id": "ac19b1b1-2bf9-4093-83fa-60411aa3f80f", + "title": "Enterprise-wide analyzing product", + "seasons": 8, + "episodes": 61, + "platform": "Hulu", + "genres": ["Drama"], + "cast": ["Christie Waters", "Casey Allen", "Nicole Frank"], + "start_year": 2001, + "end_year": 2005 + }, + { + "_id": "2dfd2240-dc9f-439f-9e06-b1ec8de397bf", + "title": "Compatible well-modulated extranet", + "seasons": 10, + "episodes": 89, + "platform": "Hulu", + "genres": ["Drama"], + "cast": [ + "Pedro Butler", + "Christian Hall", + "Dawn Gregory", + "Shannon Russell", + "Omar Mullins", + "Ian Ramos" + ], + "start_year": 2012, + "end_year": 2013 + }, + { + "_id": "94db1534-7163-430e-83e3-6a75bc6aec0f", + "title": "User-centric tangible infrastructure", + "seasons": 5, + "episodes": 11, + "platform": "Hulu", + "genres": ["Drama"], + "cast": [ + "Deborah Garcia", + "Michelle Barajas", + "Melissa Reynolds", + "Douglas Wilson" + ], + 
"start_year": 2001, + "end_year": null + }, + { + "_id": "65b2213f-a606-42d8-b845-0199ba2e9b82", + "title": "Inverse optimal circuit", + "seasons": 1, + "episodes": 29, + "platform": "Amazon Prime", + "genres": ["Fantasy", "Documentary"], + "cast": [ + "Grace Rodriguez", + "Alison Greene", + "Michael Allen", + "Steven Hayden" + ], + "start_year": 2013, + "end_year": null + }, + { + "_id": "5a8a2745-e57c-4086-aa09-84131f40149f", + "title": "Public-key discrete alliance", + "seasons": 9, + "episodes": 111, + "platform": "Disney+", + "genres": ["Documentary"], + "cast": [ + "Emily Irwin", + "Olivia Gibson", + "Jean Hernandez", + "Michael Cummings" + ], + "start_year": 2013, + "end_year": 2022 + }, + { + "_id": "51326558-2080-4615-a583-b4f2fbd15600", + "title": "Managed zero administration groupware", + "seasons": 8, + "episodes": 108, + "platform": "Hulu", + "genres": ["Drama", "Crime"], + "cast": [ + "Karen Phillips", + "Kelly Marsh", + "Daniel Hamilton", + "Abigail Smith" + ], + "start_year": 2018, + "end_year": 2019 + }, + { + "_id": "87a2cd5f-75ee-4650-b2a4-a56384c97137", + "title": "Reverse-engineered static initiative", + "seasons": 6, + "episodes": 66, + "platform": "Amazon Prime", + "genres": ["Crime", "Documentary"], + "cast": [ + "Bradley Chavez", + "Catherine Horn", + "Joseph Bryant", + "Tara Rodriguez" + ], + "start_year": 2003, + "end_year": 2006 + }, + { + "_id": "0f647458-d09f-4be8-b1dc-49be1ba1e104", + "title": "Fundamental tangible matrices", + "seasons": 9, + "episodes": 22, + "platform": "Hulu", + "genres": ["Drama"], + "cast": ["Eric Lee", "Patrick Estrada", "Kelsey Brown", "Jeffrey Lewis"], + "start_year": 2001, + "end_year": null + }, + { + "_id": "53d34237-0e86-4a5e-922b-0589c2e65458", + "title": "Self-enabling homogeneous infrastructure", + "seasons": 5, + "episodes": 35, + "platform": "Hulu", + "genres": ["Crime"], + "cast": [ + "Chad Torres", + "Mark Williams", + "Terry Mcguire", + "Kathleen Cantu", + "Harold Knapp" + ], + "start_year": 2006, 
+ "end_year": null + }, + { + "_id": "71cc1515-ba84-4df6-92db-55af3cfa91f0", + "title": "Horizontal web-enabled application", + "seasons": 2, + "episodes": 94, + "platform": "Netflix", + "genres": ["Thriller", "Fantasy"], + "cast": [ + "Catherine Davila", + "Jessica James", + "Cory Miller", + "Alexis Sanchez", + "Andrew Miller" + ], + "start_year": 2002, + "end_year": 2017 + }, + { + "_id": "200556f7-10c6-4414-83f7-24ef74bff12a", + "title": "User-friendly bi-directional data-warehouse", + "seasons": 2, + "episodes": 87, + "platform": "Hulu", + "genres": ["Drama", "Fantasy"], + "cast": [ + "Tiffany Brown", + "Christina Morales", + "Samuel Blake", + "Stephanie Johnson", + "Wesley Deleon" + ], + "start_year": 2020, + "end_year": null + }, + { + "_id": "613832c9-5307-4c80-9dde-3eab4e5aa770", + "title": "Pre-emptive leadingedge capacity", + "seasons": 5, + "episodes": 56, + "platform": "Netflix", + "genres": ["Comedy"], + "cast": ["James Durham", "Jessica Myers", "Rachel King"], + "start_year": 2005, + "end_year": null + }, + { + "_id": "f9cb1076-3eaf-41d2-84df-057d27c1a544", + "title": "Fundamental intangible contingency", + "seasons": 4, + "episodes": 99, + "platform": "Disney+", + "genres": ["Crime", "Fantasy"], + "cast": [ + "Robert Foster", + "Jill Barton", + "Kimberly Simmons", + "Tracey Gomez" + ], + "start_year": 2017, + "end_year": 2020 + }, + { + "_id": "f96b112f-943e-43cd-90f0-56725cfa7e59", + "title": "Diverse asymmetric forecast", + "seasons": 9, + "episodes": 24, + "platform": "Amazon Prime", + "genres": ["Drama", "Crime"], + "cast": [ + "Carl Johnson", + "Douglas Beck", + "Kevin Guerra", + "Taylor Wilson", + "Eric Jarvis", + "Sarah Charles MD" + ], + "start_year": 2007, + "end_year": null + }, + { + "_id": "78eb682f-a03d-4cbf-bbfc-0e899e5f50d0", + "title": "Profit-focused solution-oriented Graphical User Interface", + "seasons": 10, + "episodes": 117, + "platform": "HBO", + "genres": ["Crime", "Fantasy"], + "cast": ["Carol Miller", "Jennifer Bass", 
"Melanie Leblanc"], + "start_year": 2002, + "end_year": null + }, + { + "_id": "ebb6d3c9-3c98-4799-94bc-aadd0bf2974c", + "title": "Reduced leadingedge system engine", + "seasons": 1, + "episodes": 58, + "platform": "Hulu", + "genres": ["Crime", "Drama"], + "cast": [ + "James Warren", + "Kelly Carter", + "Sarah Jones", + "Aaron Castaneda", + "Katherine Manning" + ], + "start_year": 2011, + "end_year": null + }, + { + "_id": "4ffd32a7-0bf4-4c95-a7c8-19002c2eb83c", + "title": "Switchable 24/7 website", + "seasons": 6, + "episodes": 71, + "platform": "Netflix", + "genres": ["Documentary"], + "cast": [ + "Sarah Brown", + "Patrick Beck", + "Angela Herrera MD", + "Steven Mcconnell" + ], + "start_year": 2018, + "end_year": null + }, + { + "_id": "37267325-4337-4912-992f-a162f9014569", + "title": "Synergized asymmetric adapter", + "seasons": 4, + "episodes": 16, + "platform": "Hulu", + "genres": ["Fantasy"], + "cast": ["Gabrielle Meyer", "Madison Matthews", "Taylor Martinez"], + "start_year": 2010, + "end_year": null + }, + { + "_id": "ea2abd77-c7da-443e-89fd-6f410f5d697e", + "title": "Extended contextually-based customer loyalty", + "seasons": 1, + "episodes": 79, + "platform": "Hulu", + "genres": ["Fantasy"], + "cast": ["Michael Lewis", "Cassandra Hicks", "Sydney Garcia"], + "start_year": 2015, + "end_year": 2023 + }, + { + "_id": "b568dd56-c083-4431-a740-4f4b5f4e1b21", + "title": "Versatile grid-enabled application", + "seasons": 7, + "episodes": 82, + "platform": "Hulu", + "genres": ["Crime", "Fantasy"], + "cast": ["Keith Brown", "Annette Johnson", "Joseph Carroll", "Derek Lewis"], + "start_year": 2006, + "end_year": 2008 + }, + { + "_id": "b6f2e1c3-6915-4e02-b1c2-44b5bec8fd68", + "title": "Operative optimizing encryption", + "seasons": 2, + "episodes": 52, + "platform": "Amazon Prime", + "genres": ["Fantasy", "Drama"], + "cast": [ + "Garrett Mcgrath", + "Craig Jackson", + "Michael Sullivan", + "Andrew Boyer" + ], + "start_year": 2011, + "end_year": null + }, + { + 
"_id": "51c225d5-aa67-4b14-aca5-33757cef6bf4", + "title": "Business-focused 24/7 collaboration", + "seasons": 1, + "episodes": 113, + "platform": "Netflix", + "genres": ["Thriller", "Comedy"], + "cast": ["Matthew Hill", "Andrew White", "Grant Young", "John Mathews"], + "start_year": 2015, + "end_year": 2020 + }, + { + "_id": "7465e69f-341e-4234-8ffb-400622442a40", + "title": "Organized bi-directional application", + "seasons": 3, + "episodes": 40, + "platform": "Netflix", + "genres": ["Comedy"], + "cast": [ + "Matthew Gordon", + "Mark Allen", + "Amanda Webb", + "Jeffrey Horton", + "Sheila Lewis", + "Marcus Gilbert" + ], + "start_year": 2011, + "end_year": null + }, + { + "_id": "90570eac-f923-4c30-a5b0-661b28a8e4a5", + "title": "Configurable bottom-line success", + "seasons": 10, + "episodes": 106, + "platform": "HBO", + "genres": ["Fantasy", "Drama"], + "cast": [ + "Elizabeth Taylor", + "Melissa Mullins", + "Alan Nguyen", + "Carolyn Kidd", + "Michael Pope" + ], + "start_year": 2015, + "end_year": null + }, + { + "_id": "06d70791-5487-4dab-8b84-a91b3376e396", + "title": "Organic dedicated analyzer", + "seasons": 3, + "episodes": 88, + "platform": "HBO", + "genres": ["Thriller", "Drama"], + "cast": ["Amy Aguilar", "James Williams", "Kevin Kirby"], + "start_year": 2010, + "end_year": 2025 + } +] diff --git a/tests/accuracy/update-many.test.ts b/tests/accuracy/update-many.test.ts new file mode 100644 index 00000000..86f96705 --- /dev/null +++ b/tests/accuracy/update-many.test.ts @@ -0,0 +1,54 @@ +import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js"; +import { getAvailableModels } from "./sdk/models.js"; +import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js"; + +function callsUpdateManyWithEmptyFilters(prompt: string): AccuracyTestConfig { + return { + prompt: prompt, + expectedToolCalls: [ + { + toolName: "update-many", + parameters: { + database: "mflix", + collection: "movies", + update: { + $set: { + new_field: 1, + }, + }, + }, 
+ }, + ], + }; +} + +function callsUpdateManyWithFilters(prompt: string, filter: Record): AccuracyTestConfig { + return { + prompt: prompt, + expectedToolCalls: [ + { + toolName: "update-many", + parameters: { + database: "mflix", + collection: "movies", + filter, + update: { + $set: { + new_field: 1, + }, + }, + }, + }, + ], + }; +} + +describeAccuracyTests(getAvailableModels(), [ + callsUpdateManyWithEmptyFilters( + "Update all the documents in 'mflix.movies' namespace with a new field 'new_field' set to 1" + ), + callsUpdateManyWithFilters( + "Update all the documents in 'mflix.movies' namespace, where runtime is less than 100, with a new field 'new_field' set to 1", + { runtime: { $lt: 100 } } + ), +]); diff --git a/tests/integration/tools/mongodb/mongodbHelpers.ts b/tests/integration/tools/mongodb/mongodbHelpers.ts index 935b27db..8df9b059 100644 --- a/tests/integration/tools/mongodb/mongodbHelpers.ts +++ b/tests/integration/tools/mongodb/mongodbHelpers.ts @@ -2,12 +2,37 @@ import { MongoCluster } from "mongodb-runner"; import path from "path"; import { fileURLToPath } from "url"; import fs from "fs/promises"; -import { MongoClient, ObjectId } from "mongodb"; +import { Document, MongoClient, ObjectId } from "mongodb"; import { getResponseContent, IntegrationTest, setupIntegrationTest, defaultTestConfig } from "../../helpers.js"; import { UserConfig } from "../../../../src/config.js"; const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const testDataDumpPath = path.join(__dirname, "..", "..", "..", "accuracy", "test-data-dumps"); + +const testDataPaths = [ + { + db: "comics", + collection: "books", + path: path.join(testDataDumpPath, "comics.books.json"), + }, + { + db: "comics", + collection: "characters", + path: path.join(testDataDumpPath, "comics.characters.json"), + }, + { + db: "mflix", + collection: "movies", + path: path.join(testDataDumpPath, "mflix.movies.json"), + }, + { + db: "mflix", + collection: "shows", + path: 
path.join(testDataDumpPath, "mflix.shows.json"), + }, +]; + interface MongoDBIntegrationTest { mongoClient: () => MongoClient; connectionString: () => string; @@ -169,3 +194,41 @@ export function validateAutoConnectBehavior( }); }); } + +export function prepareTestData(integration: MongoDBIntegrationTest) { + const NON_TEST_DBS = ["admin", "config", "local"]; + const testData: { + db: string; + collection: string; + data: Document[]; + }[] = []; + + beforeAll(async () => { + for (const { db, collection, path } of testDataPaths) { + testData.push({ + db, + collection, + data: JSON.parse(await fs.readFile(path, "utf8")) as Document[], + }); + } + }); + + return { + async populateTestData(this: void) { + const client = integration.mongoClient(); + for (const { db, collection, data } of testData) { + await client.db(db).collection(collection).insertMany(data); + } + }, + async cleanupTestDatabases(this: void, integration: MongoDBIntegrationTest) { + const client = integration.mongoClient(); + const admin = client.db().admin(); + const databases = await admin.listDatabases(); + await Promise.all( + databases.databases + .filter(({ name }) => !NON_TEST_DBS.includes(name)) + .map(({ name }) => client.db(name).dropDatabase()) + ); + }, + }; +} diff --git a/tests/unit/accuracy-scorer.test.ts b/tests/unit/accuracy-scorer.test.ts new file mode 100644 index 00000000..60a389d7 --- /dev/null +++ b/tests/unit/accuracy-scorer.test.ts @@ -0,0 +1,199 @@ +import { calculateToolCallingAccuracy } from "../accuracy/sdk/accuracy-scorer.js"; +import { ExpectedToolCall, LLMToolCall } from "../accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js"; + +describe("calculateToolCallingAccuracy", () => { + describe("edge cases", () => { + it("should return 1 when both expected and actual are empty", () => { + const result = calculateToolCallingAccuracy([], []); + expect(result).toBe(1); + }); + + it("should return 0.75 when expected is empty but actual has tool calls", () => { + const 
actualToolCalls: LLMToolCall[] = [{ toolCallId: "1", toolName: "find", parameters: { db: "test" } }]; + const result = calculateToolCallingAccuracy([], actualToolCalls); + expect(result).toBe(0.75); + }); + + it("should return 0 when expected has tool calls but actual is empty", () => { + const expectedToolCalls: ExpectedToolCall[] = [{ toolName: "find", parameters: { db: "test" } }]; + const result = calculateToolCallingAccuracy(expectedToolCalls, []); + expect(result).toBe(0); + }); + }); + + describe("perfect matches", () => { + it("should return 1 for exact match with nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { db: "test", collection: "users", filter: { age: { $gte: 18 }, status: "active" } }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { age: { $gte: 18 }, status: "active" } }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + + it("should return 1 for exact match with multiple diverse tool calls", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + { toolName: "count", parameters: { db: "test", collection: "products" } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + { + toolCallId: "2", + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + { toolCallId: "3", toolName: "count", parameters: { db: "test", collection: "products" } }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + 
expect(result).toBe(1); + }); + }); + + describe("additional parameters", () => { + it("should return 0.75 when tool call has additional nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + db: "test", + collection: "users", + filter: { status: "active", age: { $gte: 18 } }, + limit: 10, + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + }); + + describe("missing or incorrect parameters", () => { + it("should return 0 when tool call has missing nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active", age: { $gte: 18 } } }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + + it("should return 0 when aggregate tool call has incorrect pipeline", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $lt: 50 } } }] }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + }); + + describe("additional tool calls", () => { + it("should cap accuracy at 0.75 when LLM calls extra tools", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { 
status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + { toolCallId: "2", toolName: "count", parameters: { db: "test", collection: "orders" } }, + { + toolCallId: "3", + toolName: "aggregate", + parameters: { + db: "test", + collection: "products", + pipeline: [{ $group: { _id: "$category", total: { $sum: 1 } } }], + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + + it("should cap accuracy at 0.75 when LLM calls same tool multiple times with variations", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + { + toolCallId: "2", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active", age: { $gte: 18 } } }, + }, + { toolCallId: "3", toolName: "find", parameters: { db: "test", collection: "users", limit: 10 } }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + }); + + describe("missing tool calls", () => { + it("should return 0 if any expected tool call was not called", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + // Missing the aggregate tool call + ]; + const result = 
calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); // One expected tool call was not called + }); + }); +});